Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
d3c9cb84
Commit
d3c9cb84
authored
Mar 25, 2024
by
赵小蒙
Browse files
分段部分log限定在debug模式下才能输出
parent
8c089976
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
15 additions
and
12 deletions
+15
-12
magic_pdf/para/para_split.py
magic_pdf/para/para_split.py
+14
-11
magic_pdf/pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+1
-1
No files found.
magic_pdf/para/para_split.py
View file @
d3c9cb84
...
@@ -501,7 +501,7 @@ def find_consecutive_true_regions(input_array):
...
@@ -501,7 +501,7 @@ def find_consecutive_true_regions(input_array):
return
regions
return
regions
def
__connect_middle_align_text
(
page_paras
,
new_layout_bbox
,
page_num
,
lang
):
def
__connect_middle_align_text
(
page_paras
,
new_layout_bbox
,
page_num
,
lang
,
debug_mode
):
"""
"""
找出来中间对齐的连续单行文本,如果连续行高度相同,那么合并为一个段落。
找出来中间对齐的连续单行文本,如果连续行高度相同,那么合并为一个段落。
一个line居中的条件是:
一个line居中的条件是:
...
@@ -527,8 +527,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
...
@@ -527,8 +527,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
first_line_text
=
''
.
join
([
__get_span_text
(
span
)
for
span
in
layout_para
[
start
][
0
][
'spans'
]])
first_line_text
=
''
.
join
([
__get_span_text
(
span
)
for
span
in
layout_para
[
start
][
0
][
'spans'
]])
if
"Table"
in
first_line_text
or
"Figure"
in
first_line_text
:
if
"Table"
in
first_line_text
or
"Figure"
in
first_line_text
:
pass
pass
if
debug_mode
:
logger
.
info
(
line_hi
.
std
())
logger
.
info
(
line_hi
.
std
())
if
line_hi
.
std
()
<
2
:
if
line_hi
.
std
()
<
2
:
"""行高度相同,那么判断是否居中"""
"""行高度相同,那么判断是否居中"""
...
@@ -540,7 +540,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
...
@@ -540,7 +540,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
and
not
all
([
x1
==
layout_box
[
2
]
for
x1
in
all_right_x1
]):
and
not
all
([
x1
==
layout_box
[
2
]
for
x1
in
all_right_x1
]):
merge_para
=
[
l
[
0
]
for
l
in
layout_para
[
start
:
end
+
1
]]
merge_para
=
[
l
[
0
]
for
l
in
layout_para
[
start
:
end
+
1
]]
para_text
=
''
.
join
([
__get_span_text
(
span
)
for
line
in
merge_para
for
span
in
line
[
'spans'
]])
para_text
=
''
.
join
([
__get_span_text
(
span
)
for
line
in
merge_para
for
span
in
line
[
'spans'
]])
logger
.
info
(
para_text
)
if
debug_mode
:
logger
.
info
(
para_text
)
layout_para
[
start
:
end
+
1
]
=
[
merge_para
]
layout_para
[
start
:
end
+
1
]
=
[
merge_para
]
index_offset
-=
end
-
start
index_offset
-=
end
-
start
...
@@ -576,7 +577,7 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
...
@@ -576,7 +577,7 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
return
connected_layout_paras
,
page_list_info
return
connected_layout_paras
,
page_list_info
def
para_split
(
pdf_info_dict
,
lang
=
"en"
):
def
para_split
(
pdf_info_dict
,
debug_mode
,
lang
=
"en"
):
"""
"""
根据line和layout情况进行分段
根据line和layout情况进行分段
"""
"""
...
@@ -601,13 +602,15 @@ def para_split(pdf_info_dict, lang="en"):
...
@@ -601,13 +602,15 @@ def para_split(pdf_info_dict, lang="en"):
pre_page_layout_bbox
=
new_layout_of_pages
[
page_num
-
1
]
pre_page_layout_bbox
=
new_layout_of_pages
[
page_num
-
1
]
next_page_layout_bbox
=
new_layout_of_pages
[
page_num
]
next_page_layout_bbox
=
new_layout_of_pages
[
page_num
]
is_conn
=
__connect_para_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
page_num
,
lang
)
is_conn
=
__connect_para_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
page_num
,
lang
)
if
is_conn
:
if
debug_mode
:
logger
.
info
(
f
"连接了第
{
page_num
-
1
}
页和第
{
page_num
}
页的段落"
)
if
is_conn
:
logger
.
info
(
f
"连接了第
{
page_num
-
1
}
页和第
{
page_num
}
页的段落"
)
is_list_conn
=
__connect_list_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
all_page_list_info
[
page_num
-
1
],
all_page_list_info
[
page_num
],
page_num
,
lang
)
is_list_conn
=
__connect_list_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
all_page_list_info
[
page_num
-
1
],
all_page_list_info
[
page_num
],
page_num
,
lang
)
if
is_list_conn
:
if
debug_mode
:
logger
.
info
(
f
"连接了第
{
page_num
-
1
}
页和第
{
page_num
}
页的列表段落"
)
if
is_list_conn
:
logger
.
info
(
f
"连接了第
{
page_num
-
1
}
页和第
{
page_num
}
页的列表段落"
)
"""接下来可能会漏掉一些特别的一些可以合并的内容,对他们进行段落连接
"""接下来可能会漏掉一些特别的一些可以合并的内容,对他们进行段落连接
1. 正文中有时出现一个行顶格,接下来几行缩进的情况。
1. 正文中有时出现一个行顶格,接下来几行缩进的情况。
...
@@ -616,5 +619,5 @@ def para_split(pdf_info_dict, lang="en"):
...
@@ -616,5 +619,5 @@ def para_split(pdf_info_dict, lang="en"):
for
page_num
,
page
in
enumerate
(
pdf_info_dict
.
values
()):
for
page_num
,
page
in
enumerate
(
pdf_info_dict
.
values
()):
page_paras
=
page
[
'para_blocks'
]
page_paras
=
page
[
'para_blocks'
]
new_layout_bbox
=
new_layout_of_pages
[
page_num
]
new_layout_bbox
=
new_layout_of_pages
[
page_num
]
__connect_middle_align_text
(
page_paras
,
new_layout_bbox
,
page_num
,
lang
)
__connect_middle_align_text
(
page_paras
,
new_layout_bbox
,
page_num
,
lang
,
debug_mode
=
debug_mode
)
__merge_signle_list_text
(
page_paras
,
new_layout_bbox
,
page_num
,
lang
)
__merge_signle_list_text
(
page_paras
,
new_layout_bbox
,
page_num
,
lang
)
magic_pdf/pdf_parse_by_ocr.py
View file @
d3c9cb84
...
@@ -269,7 +269,7 @@ def parse_pdf_by_ocr(
...
@@ -269,7 +269,7 @@ def parse_pdf_by_ocr(
pdf_info_dict
[
f
"page_
{
page_id
}
"
]
=
page_info
pdf_info_dict
[
f
"page_
{
page_id
}
"
]
=
page_info
"""分段"""
"""分段"""
para_split
(
pdf_info_dict
)
para_split
(
pdf_info_dict
,
debug_mode
=
debug_mode
)
'''在测试时,保存调试信息'''
'''在测试时,保存调试信息'''
if
debug_mode
:
if
debug_mode
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment