Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
e4904cd6
Unverified
Commit
e4904cd6
authored
Oct 21, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Oct 21, 2024
Browse files
Merge pull request #765 from myhloli/add-list-group
refactor(para): improve paragraph splitting algorithm
parents
fe21eebd
8cc76c49
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
40 additions
and
34 deletions
+40
-34
magic_pdf/dict2md/ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+23
-26
magic_pdf/para/para_split_v3.py
magic_pdf/para/para_split_v3.py
+15
-2
magic_pdf/pipe/AbsPipe.py
magic_pdf/pipe/AbsPipe.py
+2
-6
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
e4904cd6
...
@@ -36,9 +36,9 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
...
@@ -36,9 +36,9 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
paras_of_layout
,
'mm'
,
img_buket_path
)
paras_of_layout
,
'mm'
,
img_buket_path
)
markdown_with_para_and_pagination
.
append
({
markdown_with_para_and_pagination
.
append
({
'page_no'
:
'page_no'
:
page_no
,
page_no
,
'md_content'
:
'md_content'
:
'
\n\n
'
.
join
(
page_markdown
)
'
\n\n
'
.
join
(
page_markdown
)
})
})
page_no
+=
1
page_no
+=
1
return
markdown_with_para_and_pagination
return
markdown_with_para_and_pagination
...
@@ -47,19 +47,17 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
...
@@ -47,19 +47,17 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
def
ocr_mk_markdown_with_para_core_v2
(
paras_of_layout
,
def
ocr_mk_markdown_with_para_core_v2
(
paras_of_layout
,
mode
,
mode
,
img_buket_path
=
''
,
img_buket_path
=
''
,
parse_type
=
"auto"
,
lang
=
None
):
):
page_markdown
=
[]
page_markdown
=
[]
for
para_block
in
paras_of_layout
:
for
para_block
in
paras_of_layout
:
para_text
=
''
para_text
=
''
para_type
=
para_block
[
'type'
]
para_type
=
para_block
[
'type'
]
if
para_type
in
[
BlockType
.
Text
,
BlockType
.
List
,
BlockType
.
Index
]:
if
para_type
in
[
BlockType
.
Text
,
BlockType
.
List
,
BlockType
.
Index
]:
para_text
=
merge_para_with_text
(
para_block
,
parse_type
=
parse_type
,
lang
=
lang
)
para_text
=
merge_para_with_text
(
para_block
)
elif
para_type
==
BlockType
.
Title
:
elif
para_type
==
BlockType
.
Title
:
para_text
=
f
'#
{
merge_para_with_text
(
para_block
,
parse_type
=
parse_type
,
lang
=
lang
)
}
'
para_text
=
f
'#
{
merge_para_with_text
(
para_block
)
}
'
elif
para_type
==
BlockType
.
InterlineEquation
:
elif
para_type
==
BlockType
.
InterlineEquation
:
para_text
=
merge_para_with_text
(
para_block
,
parse_type
=
parse_type
,
lang
=
lang
)
para_text
=
merge_para_with_text
(
para_block
)
elif
para_type
==
BlockType
.
Image
:
elif
para_type
==
BlockType
.
Image
:
if
mode
==
'nlp'
:
if
mode
==
'nlp'
:
continue
continue
...
@@ -72,17 +70,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
...
@@ -72,17 +70,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
para_text
+=
f
"
\n

}
)
\n
"
para_text
+=
f
"
\n

}
)
\n
"
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼image_caption
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼image_caption
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
para_text
+=
merge_para_with_text
(
block
,
parse_type
=
parse_type
,
lang
=
lang
)
para_text
+=
merge_para_with_text
(
block
)
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼image_caption
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼image_caption
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
para_text
+=
merge_para_with_text
(
block
,
parse_type
=
parse_type
,
lang
=
lang
)
para_text
+=
merge_para_with_text
(
block
)
elif
para_type
==
BlockType
.
Table
:
elif
para_type
==
BlockType
.
Table
:
if
mode
==
'nlp'
:
if
mode
==
'nlp'
:
continue
continue
elif
mode
==
'mm'
:
elif
mode
==
'mm'
:
for
block
in
para_block
[
'blocks'
]:
# 1st.拼table_caption
for
block
in
para_block
[
'blocks'
]:
# 1st.拼table_caption
if
block
[
'type'
]
==
BlockType
.
TableCaption
:
if
block
[
'type'
]
==
BlockType
.
TableCaption
:
para_text
+=
merge_para_with_text
(
block
,
parse_type
=
parse_type
,
lang
=
lang
)
para_text
+=
merge_para_with_text
(
block
)
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼table_body
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼table_body
if
block
[
'type'
]
==
BlockType
.
TableBody
:
if
block
[
'type'
]
==
BlockType
.
TableBody
:
for
line
in
block
[
'lines'
]:
for
line
in
block
[
'lines'
]:
...
@@ -97,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
...
@@ -97,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
para_text
+=
f
"
\n

}
)
\n
"
para_text
+=
f
"
\n

}
)
\n
"
for
block
in
para_block
[
'blocks'
]:
# 3rd.拼table_footnote
for
block
in
para_block
[
'blocks'
]:
# 3rd.拼table_footnote
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
para_text
+=
merge_para_with_text
(
block
,
parse_type
=
parse_type
,
lang
=
lang
)
para_text
+=
merge_para_with_text
(
block
)
if
para_text
.
strip
()
==
''
:
if
para_text
.
strip
()
==
''
:
continue
continue
...
@@ -120,7 +118,7 @@ def detect_language(text):
...
@@ -120,7 +118,7 @@ def detect_language(text):
return
'empty'
return
'empty'
def
merge_para_with_text
(
para_block
,
parse_type
=
"auto"
,
lang
=
None
):
def
merge_para_with_text
(
para_block
):
para_text
=
''
para_text
=
''
for
i
,
line
in
enumerate
(
para_block
[
'lines'
]):
for
i
,
line
in
enumerate
(
para_block
[
'lines'
]):
...
@@ -161,24 +159,24 @@ def merge_para_with_text(para_block, parse_type="auto", lang=None):
...
@@ -161,24 +159,24 @@ def merge_para_with_text(para_block, parse_type="auto", lang=None):
return
para_text
return
para_text
def
para_to_standard_format_v2
(
para_block
,
img_buket_path
,
page_idx
,
parse_type
=
"auto"
,
lang
=
None
,
drop_reason
=
None
):
def
para_to_standard_format_v2
(
para_block
,
img_buket_path
,
page_idx
,
drop_reason
=
None
):
para_type
=
para_block
[
'type'
]
para_type
=
para_block
[
'type'
]
para_content
=
{}
para_content
=
{}
if
para_type
==
BlockType
.
Text
:
if
para_type
==
BlockType
.
Text
:
para_content
=
{
para_content
=
{
'type'
:
'text'
,
'type'
:
'text'
,
'text'
:
merge_para_with_text
(
para_block
,
parse_type
=
parse_type
,
lang
=
lang
),
'text'
:
merge_para_with_text
(
para_block
),
}
}
elif
para_type
==
BlockType
.
Title
:
elif
para_type
==
BlockType
.
Title
:
para_content
=
{
para_content
=
{
'type'
:
'text'
,
'type'
:
'text'
,
'text'
:
merge_para_with_text
(
para_block
,
parse_type
=
parse_type
,
lang
=
lang
),
'text'
:
merge_para_with_text
(
para_block
),
'text_level'
:
1
,
'text_level'
:
1
,
}
}
elif
para_type
==
BlockType
.
InterlineEquation
:
elif
para_type
==
BlockType
.
InterlineEquation
:
para_content
=
{
para_content
=
{
'type'
:
'equation'
,
'type'
:
'equation'
,
'text'
:
merge_para_with_text
(
para_block
,
parse_type
=
parse_type
,
lang
=
lang
),
'text'
:
merge_para_with_text
(
para_block
),
'text_format'
:
'latex'
,
'text_format'
:
'latex'
,
}
}
elif
para_type
==
BlockType
.
Image
:
elif
para_type
==
BlockType
.
Image
:
...
@@ -189,9 +187,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type=
...
@@ -189,9 +187,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type=
img_buket_path
,
img_buket_path
,
block
[
'lines'
][
0
][
'spans'
][
0
][
'image_path'
])
block
[
'lines'
][
0
][
'spans'
][
0
][
'image_path'
])
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
para_content
[
'img_caption'
]
=
merge_para_with_text
(
block
,
parse_type
=
parse_type
,
lang
=
lang
)
para_content
[
'img_caption'
]
=
merge_para_with_text
(
block
)
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
para_content
[
'img_footnote'
]
=
merge_para_with_text
(
block
,
parse_type
=
parse_type
,
lang
=
lang
)
para_content
[
'img_footnote'
]
=
merge_para_with_text
(
block
)
elif
para_type
==
BlockType
.
Table
:
elif
para_type
==
BlockType
.
Table
:
para_content
=
{
'type'
:
'table'
}
para_content
=
{
'type'
:
'table'
}
for
block
in
para_block
[
'blocks'
]:
for
block
in
para_block
[
'blocks'
]:
...
@@ -202,9 +200,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type=
...
@@ -202,9 +200,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type=
para_content
[
'table_body'
]
=
f
"
\n\n
{
block
[
'lines'
][
0
][
'spans'
][
0
][
'html'
]
}
\n\n
"
para_content
[
'table_body'
]
=
f
"
\n\n
{
block
[
'lines'
][
0
][
'spans'
][
0
][
'html'
]
}
\n\n
"
para_content
[
'img_path'
]
=
join_path
(
img_buket_path
,
block
[
"lines"
][
0
][
"spans"
][
0
][
'image_path'
])
para_content
[
'img_path'
]
=
join_path
(
img_buket_path
,
block
[
"lines"
][
0
][
"spans"
][
0
][
'image_path'
])
if
block
[
'type'
]
==
BlockType
.
TableCaption
:
if
block
[
'type'
]
==
BlockType
.
TableCaption
:
para_content
[
'table_caption'
]
=
merge_para_with_text
(
block
,
parse_type
=
parse_type
,
lang
=
lang
)
para_content
[
'table_caption'
]
=
merge_para_with_text
(
block
)
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
para_content
[
'table_footnote'
]
=
merge_para_with_text
(
block
,
parse_type
=
parse_type
,
lang
=
lang
)
para_content
[
'table_footnote'
]
=
merge_para_with_text
(
block
)
para_content
[
'page_idx'
]
=
page_idx
para_content
[
'page_idx'
]
=
page_idx
...
@@ -218,8 +216,7 @@ def union_make(pdf_info_dict: list,
...
@@ -218,8 +216,7 @@ def union_make(pdf_info_dict: list,
make_mode
:
str
,
make_mode
:
str
,
drop_mode
:
str
,
drop_mode
:
str
,
img_buket_path
:
str
=
''
,
img_buket_path
:
str
=
''
,
parse_type
:
str
=
"auto"
,
):
lang
=
None
):
output_content
=
[]
output_content
=
[]
for
page_info
in
pdf_info_dict
:
for
page_info
in
pdf_info_dict
:
drop_reason_flag
=
False
drop_reason_flag
=
False
...
@@ -246,20 +243,20 @@ def union_make(pdf_info_dict: list,
...
@@ -246,20 +243,20 @@ def union_make(pdf_info_dict: list,
continue
continue
if
make_mode
==
MakeMode
.
MM_MD
:
if
make_mode
==
MakeMode
.
MM_MD
:
page_markdown
=
ocr_mk_markdown_with_para_core_v2
(
page_markdown
=
ocr_mk_markdown_with_para_core_v2
(
paras_of_layout
,
'mm'
,
img_buket_path
,
parse_type
=
parse_type
,
lang
=
lang
)
paras_of_layout
,
'mm'
,
img_buket_path
)
output_content
.
extend
(
page_markdown
)
output_content
.
extend
(
page_markdown
)
elif
make_mode
==
MakeMode
.
NLP_MD
:
elif
make_mode
==
MakeMode
.
NLP_MD
:
page_markdown
=
ocr_mk_markdown_with_para_core_v2
(
page_markdown
=
ocr_mk_markdown_with_para_core_v2
(
paras_of_layout
,
'nlp'
,
parse_type
=
parse_type
,
lang
=
lang
)
paras_of_layout
,
'nlp'
)
output_content
.
extend
(
page_markdown
)
output_content
.
extend
(
page_markdown
)
elif
make_mode
==
MakeMode
.
STANDARD_FORMAT
:
elif
make_mode
==
MakeMode
.
STANDARD_FORMAT
:
for
para_block
in
paras_of_layout
:
for
para_block
in
paras_of_layout
:
if
drop_reason_flag
:
if
drop_reason_flag
:
para_content
=
para_to_standard_format_v2
(
para_content
=
para_to_standard_format_v2
(
para_block
,
img_buket_path
,
page_idx
,
parse_type
=
parse_type
,
lang
=
lang
,
drop_reason
=
drop_reason
)
para_block
,
img_buket_path
,
page_idx
)
else
:
else
:
para_content
=
para_to_standard_format_v2
(
para_content
=
para_to_standard_format_v2
(
para_block
,
img_buket_path
,
page_idx
,
parse_type
=
parse_type
,
lang
=
lang
)
para_block
,
img_buket_path
,
page_idx
)
output_content
.
append
(
para_content
)
output_content
.
append
(
para_content
)
if
make_mode
in
[
MakeMode
.
MM_MD
,
MakeMode
.
NLP_MD
]:
if
make_mode
in
[
MakeMode
.
MM_MD
,
MakeMode
.
NLP_MD
]:
return
'
\n\n
'
.
join
(
output_content
)
return
'
\n\n
'
.
join
(
output_content
)
...
...
magic_pdf/para/para_split_v3.py
View file @
e4904cd6
...
@@ -59,7 +59,7 @@ def __is_list_or_index_block(block):
...
@@ -59,7 +59,7 @@ def __is_list_or_index_block(block):
# index block 是一种特殊的list block
# index block 是一种特殊的list block
# 一个block如果是index block 应该同时满足以下特征
# 一个block如果是index block 应该同时满足以下特征
# 1.block内有多个line 2.block 内有多个line两侧均顶格写 3.line的开头或者结尾均为数字
# 1.block内有多个line 2.block 内有多个line两侧均顶格写 3.line的开头或者结尾均为数字
if
len
(
block
[
'lines'
])
>=
3
:
if
len
(
block
[
'lines'
])
>=
2
:
first_line
=
block
[
'lines'
][
0
]
first_line
=
block
[
'lines'
][
0
]
line_height
=
first_line
[
'bbox'
][
3
]
-
first_line
[
'bbox'
][
1
]
line_height
=
first_line
[
'bbox'
][
3
]
-
first_line
[
'bbox'
][
1
]
block_weight
=
block
[
'bbox_fs'
][
2
]
-
block
[
'bbox_fs'
][
0
]
block_weight
=
block
[
'bbox_fs'
][
2
]
-
block
[
'bbox_fs'
][
0
]
...
@@ -227,6 +227,15 @@ def __merge_2_list_blocks(block1, block2):
...
@@ -227,6 +227,15 @@ def __merge_2_list_blocks(block1, block2):
return
block1
,
block2
return
block1
,
block2
def
__is_list_group
(
text_blocks_group
):
# list group的特征是一个group内的所有block都满足以下条件
# 1.每个block都不超过3行 2. 每个block 的左边界都比较接近(逻辑简单点先不加这个规则)
for
block
in
text_blocks_group
:
if
len
(
block
[
'lines'
])
>
3
:
return
False
return
True
def
__para_merge_page
(
blocks
):
def
__para_merge_page
(
blocks
):
page_text_blocks_groups
=
__process_blocks
(
blocks
)
page_text_blocks_groups
=
__process_blocks
(
blocks
)
for
text_blocks_group
in
page_text_blocks_groups
:
for
text_blocks_group
in
page_text_blocks_groups
:
...
@@ -239,6 +248,10 @@ def __para_merge_page(blocks):
...
@@ -239,6 +248,10 @@ def __para_merge_page(blocks):
# logger.info(f"{block['type']}:{block}")
# logger.info(f"{block['type']}:{block}")
if
len
(
text_blocks_group
)
>
1
:
if
len
(
text_blocks_group
)
>
1
:
# 在合并前判断这个group 是否是一个 list group
is_list_group
=
__is_list_group
(
text_blocks_group
)
# 倒序遍历
# 倒序遍历
for
i
in
range
(
len
(
text_blocks_group
)
-
1
,
-
1
,
-
1
):
for
i
in
range
(
len
(
text_blocks_group
)
-
1
,
-
1
,
-
1
):
current_block
=
text_blocks_group
[
i
]
current_block
=
text_blocks_group
[
i
]
...
@@ -247,7 +260,7 @@ def __para_merge_page(blocks):
...
@@ -247,7 +260,7 @@ def __para_merge_page(blocks):
if
i
-
1
>=
0
:
if
i
-
1
>=
0
:
prev_block
=
text_blocks_group
[
i
-
1
]
prev_block
=
text_blocks_group
[
i
-
1
]
if
current_block
[
'type'
]
==
'text'
and
prev_block
[
'type'
]
==
'text'
:
if
current_block
[
'type'
]
==
'text'
and
prev_block
[
'type'
]
==
'text'
and
not
is_list_group
:
__merge_2_text_blocks
(
current_block
,
prev_block
)
__merge_2_text_blocks
(
current_block
,
prev_block
)
elif
(
elif
(
(
current_block
[
'type'
]
==
BlockType
.
List
and
prev_block
[
'type'
]
==
BlockType
.
List
)
or
(
current_block
[
'type'
]
==
BlockType
.
List
and
prev_block
[
'type'
]
==
BlockType
.
List
)
or
...
...
magic_pdf/pipe/AbsPipe.py
View file @
e4904cd6
...
@@ -95,9 +95,7 @@ class AbsPipe(ABC):
...
@@ -95,9 +95,7 @@ class AbsPipe(ABC):
"""
"""
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
compressed_pdf_mid_data
)
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
compressed_pdf_mid_data
)
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
parse_type
=
pdf_mid_data
[
"_parse_type"
]
content_list
=
union_make
(
pdf_info_list
,
MakeMode
.
STANDARD_FORMAT
,
drop_mode
,
img_buket_path
)
lang
=
pdf_mid_data
.
get
(
"_lang"
,
None
)
content_list
=
union_make
(
pdf_info_list
,
MakeMode
.
STANDARD_FORMAT
,
drop_mode
,
img_buket_path
,
parse_type
,
lang
)
return
content_list
return
content_list
@
staticmethod
@
staticmethod
...
@@ -107,9 +105,7 @@ class AbsPipe(ABC):
...
@@ -107,9 +105,7 @@ class AbsPipe(ABC):
"""
"""
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
compressed_pdf_mid_data
)
pdf_mid_data
=
JsonCompressor
.
decompress_json
(
compressed_pdf_mid_data
)
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
pdf_info_list
=
pdf_mid_data
[
"pdf_info"
]
parse_type
=
pdf_mid_data
[
"_parse_type"
]
md_content
=
union_make
(
pdf_info_list
,
md_make_mode
,
drop_mode
,
img_buket_path
)
lang
=
pdf_mid_data
.
get
(
"_lang"
,
None
)
md_content
=
union_make
(
pdf_info_list
,
md_make_mode
,
drop_mode
,
img_buket_path
,
parse_type
,
lang
)
return
md_content
return
md_content
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment