Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
702b6ac9
Unverified
Commit
702b6ac9
authored
Oct 15, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Oct 15, 2024
Browse files
Merge pull request #740 from myhloli/para-split-v3
feat(list&index block): detect and merge list and index blocks
parents
c479245e
1f1dd353
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
197 additions
and
20 deletions
+197
-20
magic_pdf/dict2md/ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+19
-14
magic_pdf/libs/draw_bbox.py
magic_pdf/libs/draw_bbox.py
+13
-0
magic_pdf/libs/ocr_content_type.py
magic_pdf/libs/ocr_content_type.py
+2
-0
magic_pdf/model/pdf_extract_kit.py
magic_pdf/model/pdf_extract_kit.py
+1
-1
magic_pdf/para/para_split_v3.py
magic_pdf/para/para_split_v3.py
+160
-2
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+1
-1
magic_pdf/pre_proc/ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+1
-2
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
702b6ac9
...
@@ -8,6 +8,7 @@ from magic_pdf.libs.language import detect_lang
...
@@ -8,6 +8,7 @@ from magic_pdf.libs.language import detect_lang
from
magic_pdf.libs.MakeContentConfig
import
DropMode
,
MakeMode
from
magic_pdf.libs.MakeContentConfig
import
DropMode
,
MakeMode
from
magic_pdf.libs.markdown_utils
import
ocr_escape_special_markdown_char
from
magic_pdf.libs.markdown_utils
import
ocr_escape_special_markdown_char
from
magic_pdf.libs.ocr_content_type
import
BlockType
,
ContentType
from
magic_pdf.libs.ocr_content_type
import
BlockType
,
ContentType
from
magic_pdf.para.para_split_v3
import
ListLineTag
def
__is_hyphen_at_line_end
(
line
):
def
__is_hyphen_at_line_end
(
line
):
...
@@ -124,7 +125,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
...
@@ -124,7 +125,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
for
para_block
in
paras_of_layout
:
for
para_block
in
paras_of_layout
:
para_text
=
''
para_text
=
''
para_type
=
para_block
[
'type'
]
para_type
=
para_block
[
'type'
]
if
para_type
==
BlockType
.
Text
:
if
para_type
in
[
BlockType
.
Text
,
BlockType
.
List
,
BlockType
.
Index
]
:
para_text
=
merge_para_with_text
(
para_block
,
parse_type
=
parse_type
,
lang
=
lang
)
para_text
=
merge_para_with_text
(
para_block
,
parse_type
=
parse_type
,
lang
=
lang
)
elif
para_type
==
BlockType
.
Title
:
elif
para_type
==
BlockType
.
Title
:
para_text
=
f
'#
{
merge_para_with_text
(
para_block
,
parse_type
=
parse_type
,
lang
=
lang
)
}
'
para_text
=
f
'#
{
merge_para_with_text
(
para_block
,
parse_type
=
parse_type
,
lang
=
lang
)
}
'
...
@@ -177,22 +178,26 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
...
@@ -177,22 +178,26 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
return
page_markdown
return
page_markdown
def
merge_para_with_text
(
para_block
,
parse_type
=
"auto"
,
lang
=
None
):
def
detect_language
(
text
):
en_pattern
=
r
'[a-zA-Z]+'
def
detect_language
(
text
):
en_matches
=
re
.
findall
(
en_pattern
,
text
)
en_pattern
=
r
'[a-zA-Z]+'
en_length
=
sum
(
len
(
match
)
for
match
in
en_matches
)
en_matches
=
re
.
findall
(
en_pattern
,
text
)
if
len
(
text
)
>
0
:
en_length
=
sum
(
len
(
match
)
for
match
in
en_matches
)
if
en_length
/
len
(
text
)
>=
0.5
:
if
len
(
text
)
>
0
:
return
'en'
if
en_length
/
len
(
text
)
>=
0.5
:
return
'en'
else
:
return
'unknown'
else
:
else
:
return
'empty'
return
'unknown'
else
:
return
'empty'
def
merge_para_with_text
(
para_block
,
parse_type
=
"auto"
,
lang
=
None
):
para_text
=
''
para_text
=
''
for
line
in
para_block
[
'lines'
]:
for
i
,
line
in
enumerate
(
para_block
[
'lines'
]):
if
i
>=
1
and
line
.
get
(
ListLineTag
.
IS_LIST_START_LINE
,
False
):
para_text
+=
'
\n
'
line_text
=
''
line_text
=
''
line_lang
=
''
line_lang
=
''
for
span
in
line
[
'spans'
]:
for
span
in
line
[
'spans'
]:
...
...
magic_pdf/libs/draw_bbox.py
View file @
702b6ac9
...
@@ -75,6 +75,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
...
@@ -75,6 +75,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
titles_list
=
[]
titles_list
=
[]
texts_list
=
[]
texts_list
=
[]
interequations_list
=
[]
interequations_list
=
[]
lists_list
=
[]
indexs_list
=
[]
for
page
in
pdf_info
:
for
page
in
pdf_info
:
page_dropped_list
=
[]
page_dropped_list
=
[]
...
@@ -83,6 +85,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
...
@@ -83,6 +85,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
titles
=
[]
titles
=
[]
texts
=
[]
texts
=
[]
interequations
=
[]
interequations
=
[]
lists
=
[]
indexs
=
[]
for
dropped_bbox
in
page
[
'discarded_blocks'
]:
for
dropped_bbox
in
page
[
'discarded_blocks'
]:
page_dropped_list
.
append
(
dropped_bbox
[
'bbox'
])
page_dropped_list
.
append
(
dropped_bbox
[
'bbox'
])
...
@@ -115,6 +119,11 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
...
@@ -115,6 +119,11 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
texts
.
append
(
bbox
)
texts
.
append
(
bbox
)
elif
block
[
'type'
]
==
BlockType
.
InterlineEquation
:
elif
block
[
'type'
]
==
BlockType
.
InterlineEquation
:
interequations
.
append
(
bbox
)
interequations
.
append
(
bbox
)
elif
block
[
'type'
]
==
BlockType
.
List
:
lists
.
append
(
bbox
)
elif
block
[
'type'
]
==
BlockType
.
Index
:
indexs
.
append
(
bbox
)
tables_list
.
append
(
tables
)
tables_list
.
append
(
tables
)
tables_body_list
.
append
(
tables_body
)
tables_body_list
.
append
(
tables_body
)
tables_caption_list
.
append
(
tables_caption
)
tables_caption_list
.
append
(
tables_caption
)
...
@@ -126,6 +135,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
...
@@ -126,6 +135,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
titles_list
.
append
(
titles
)
titles_list
.
append
(
titles
)
texts_list
.
append
(
texts
)
texts_list
.
append
(
texts
)
interequations_list
.
append
(
interequations
)
interequations_list
.
append
(
interequations
)
lists_list
.
append
(
lists
)
indexs_list
.
append
(
indexs
)
layout_bbox_list
=
[]
layout_bbox_list
=
[]
...
@@ -160,6 +171,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
...
@@ -160,6 +171,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
draw_bbox_without_number
(
i
,
texts_list
,
page
,
[
153
,
0
,
76
],
True
)
draw_bbox_without_number
(
i
,
texts_list
,
page
,
[
153
,
0
,
76
],
True
)
draw_bbox_without_number
(
i
,
interequations_list
,
page
,
[
0
,
255
,
0
],
draw_bbox_without_number
(
i
,
interequations_list
,
page
,
[
0
,
255
,
0
],
True
)
True
)
draw_bbox_without_number
(
i
,
lists_list
,
page
,
[
40
,
169
,
92
],
True
)
draw_bbox_without_number
(
i
,
indexs_list
,
page
,
[
40
,
169
,
92
],
True
)
draw_bbox_with_number
(
i
,
layout_bbox_list
,
page
,
[
255
,
0
,
0
],
False
,
draw_bbox
=
False
)
draw_bbox_with_number
(
i
,
layout_bbox_list
,
page
,
[
255
,
0
,
0
],
False
,
draw_bbox
=
False
)
...
...
magic_pdf/libs/ocr_content_type.py
View file @
702b6ac9
...
@@ -20,6 +20,8 @@ class BlockType:
...
@@ -20,6 +20,8 @@ class BlockType:
InterlineEquation
=
'interline_equation'
InterlineEquation
=
'interline_equation'
Footnote
=
'footnote'
Footnote
=
'footnote'
Discarded
=
'discarded'
Discarded
=
'discarded'
List
=
'list'
Index
=
'index'
class
CategoryId
:
class
CategoryId
:
...
...
magic_pdf/model/pdf_extract_kit.py
View file @
702b6ac9
...
@@ -340,7 +340,7 @@ class CustomPEKModel:
...
@@ -340,7 +340,7 @@ class CustomPEKModel:
if
torch
.
cuda
.
is_available
():
if
torch
.
cuda
.
is_available
():
properties
=
torch
.
cuda
.
get_device_properties
(
self
.
device
)
properties
=
torch
.
cuda
.
get_device_properties
(
self
.
device
)
total_memory
=
properties
.
total_memory
/
(
1024
**
3
)
# 将字节转换为 GB
total_memory
=
properties
.
total_memory
/
(
1024
**
3
)
# 将字节转换为 GB
if
total_memory
<=
8
:
if
total_memory
<=
10
:
gc_start
=
time
.
time
()
gc_start
=
time
.
time
()
clean_memory
()
clean_memory
()
gc_time
=
round
(
time
.
time
()
-
gc_start
,
2
)
gc_time
=
round
(
time
.
time
()
-
gc_start
,
2
)
...
...
magic_pdf/para/para_split_v3.py
View file @
702b6ac9
import
copy
import
copy
from
loguru
import
logger
from
magic_pdf.libs.Constants
import
LINES_DELETED
,
CROSS_PAGE
from
magic_pdf.libs.Constants
import
LINES_DELETED
,
CROSS_PAGE
from
magic_pdf.libs.ocr_content_type
import
BlockType
,
ContentType
LINE_STOP_FLAG
=
(
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
')'
,
')'
,
'"'
,
'”'
,
':'
,
':'
,
';'
,
';'
)
LINE_STOP_FLAG
=
(
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
')'
,
')'
,
'"'
,
'”'
,
':'
,
':'
,
';'
,
';'
)
LIST_END_FLAG
=
(
'.'
,
'。'
,
';'
,
';'
)
class
ListLineTag
:
IS_LIST_START_LINE
=
"is_list_start_line"
IS_LIST_END_LINE
=
"is_list_end_line"
def
__process_blocks
(
blocks
):
def
__process_blocks
(
blocks
):
...
@@ -38,7 +47,127 @@ def __process_blocks(blocks):
...
@@ -38,7 +47,127 @@ def __process_blocks(blocks):
return
result
return
result
def
__merge_2_blocks
(
block1
,
block2
):
def
__is_list_block
(
block
):
# 一个block如果是list block 应该同时满足以下特征
# 1.block内有多个line 2.block 内有多个line左侧顶格写 3.block内有多个line 右侧不顶格(狗牙状)
# 1.block内有多个line 2.block 内有多个line左侧顶格写 3.多个line以endflag结尾
# 1.block内有多个line 2.block 内有多个line左侧顶格写 3.block内有多个line 左侧不顶格
if
len
(
block
[
'lines'
])
>=
3
:
first_line
=
block
[
'lines'
][
0
]
line_height
=
first_line
[
'bbox'
][
3
]
-
first_line
[
'bbox'
][
1
]
block_weight
=
block
[
'bbox_fs'
][
2
]
-
block
[
'bbox_fs'
][
0
]
left_close_num
=
0
left_not_close_num
=
0
right_not_close_num
=
0
lines_text_list
=
[]
for
line
in
block
[
'lines'
]:
line_text
=
""
for
span
in
line
[
'spans'
]:
span_type
=
span
[
'type'
]
if
span_type
==
ContentType
.
Text
:
line_text
+=
span
[
'content'
].
strip
()
lines_text_list
.
append
(
line_text
)
# 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
if
abs
(
block
[
'bbox_fs'
][
0
]
-
line
[
'bbox'
][
0
])
<
line_height
/
2
:
left_close_num
+=
1
elif
line
[
'bbox'
][
0
]
-
block
[
'bbox_fs'
][
0
]
>
line_height
:
# logger.info(f"{line_text}, {block['bbox_fs']}, {line['bbox']}")
left_not_close_num
+=
1
# 计算右侧是否不顶格,拍脑袋用0.3block宽度做阈值
closed_area
=
0.3
*
block_weight
# closed_area = 5 * line_height
if
block
[
'bbox_fs'
][
2
]
-
line
[
'bbox'
][
2
]
>
closed_area
:
right_not_close_num
+=
1
# 判断lines_text_list中的元素是否有超过80%都以LIST_END_FLAG结尾
line_end_flag
=
False
if
len
(
lines_text_list
)
>
0
:
num_end_count
=
0
for
line_text
in
lines_text_list
:
if
len
(
line_text
)
>
0
:
if
line_text
[
-
1
]
in
LIST_END_FLAG
:
num_end_count
+=
1
if
num_end_count
/
len
(
lines_text_list
)
>=
0.8
:
line_end_flag
=
True
if
left_close_num
>=
2
and
(
right_not_close_num
>=
2
or
line_end_flag
or
left_not_close_num
>=
2
):
for
line
in
block
[
'lines'
]:
if
abs
(
block
[
'bbox_fs'
][
0
]
-
line
[
'bbox'
][
0
])
<
line_height
/
2
:
line
[
ListLineTag
.
IS_LIST_START_LINE
]
=
True
if
abs
(
block
[
'bbox_fs'
][
2
]
-
line
[
'bbox'
][
2
])
>
line_height
:
line
[
ListLineTag
.
IS_LIST_END_LINE
]
=
True
return
True
else
:
return
False
else
:
return
False
def
__is_index_block
(
block
):
# 一个block如果是index block 应该同时满足以下特征
# 1.block内有多个line 2.block 内有多个line两侧均顶格写 3.line的开头或者结尾均为数字
if
len
(
block
[
'lines'
])
>=
3
:
first_line
=
block
[
'lines'
][
0
]
line_height
=
first_line
[
'bbox'
][
3
]
-
first_line
[
'bbox'
][
1
]
left_close_num
=
0
right_close_num
=
0
lines_text_list
=
[]
for
line
in
block
[
'lines'
]:
# 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
if
abs
(
block
[
'bbox_fs'
][
0
]
-
line
[
'bbox'
][
0
])
<
line_height
/
2
:
left_close_num
+=
1
# 计算右侧是否不顶格
if
abs
(
block
[
'bbox_fs'
][
2
]
-
line
[
'bbox'
][
2
])
<
line_height
/
2
:
right_close_num
+=
1
line_text
=
""
for
span
in
line
[
'spans'
]:
span_type
=
span
[
'type'
]
if
span_type
==
ContentType
.
Text
:
line_text
+=
span
[
'content'
].
strip
()
lines_text_list
.
append
(
line_text
)
# 判断lines_text_list中的元素是否有超过80%都以数字开头或都以数字结尾
line_num_flag
=
False
if
len
(
lines_text_list
)
>
0
:
num_start_count
=
0
num_end_count
=
0
for
line_text
in
lines_text_list
:
if
len
(
line_text
)
>
0
:
if
line_text
[
0
].
isdigit
():
num_start_count
+=
1
if
line_text
[
-
1
].
isdigit
():
num_end_count
+=
1
if
num_start_count
/
len
(
lines_text_list
)
>=
0.8
or
num_end_count
/
len
(
lines_text_list
)
>=
0.8
:
line_num_flag
=
True
if
left_close_num
>=
2
and
right_close_num
>=
2
and
line_num_flag
:
for
line
in
block
[
'lines'
]:
line
[
ListLineTag
.
IS_LIST_START_LINE
]
=
True
return
True
else
:
return
False
else
:
return
False
def
__merge_2_text_blocks
(
block1
,
block2
):
if
len
(
block1
[
'lines'
])
>
0
:
if
len
(
block1
[
'lines'
])
>
0
:
first_line
=
block1
[
'lines'
][
0
]
first_line
=
block1
[
'lines'
][
0
]
line_height
=
first_line
[
'bbox'
][
3
]
-
first_line
[
'bbox'
][
1
]
line_height
=
first_line
[
'bbox'
][
3
]
-
first_line
[
'bbox'
][
1
]
...
@@ -59,17 +188,46 @@ def __merge_2_blocks(block1, block2):
...
@@ -59,17 +188,46 @@ def __merge_2_blocks(block1, block2):
return
block1
,
block2
return
block1
,
block2
def
__merge_2_list_blocks
(
block1
,
block2
):
if
block1
[
'page_num'
]
!=
block2
[
'page_num'
]:
for
line
in
block1
[
'lines'
]:
for
span
in
line
[
'spans'
]:
span
[
CROSS_PAGE
]
=
True
block2
[
'lines'
].
extend
(
block1
[
'lines'
])
block1
[
'lines'
]
=
[]
block1
[
LINES_DELETED
]
=
True
return
block1
,
block2
def
__para_merge_page
(
blocks
):
def
__para_merge_page
(
blocks
):
page_text_blocks_groups
=
__process_blocks
(
blocks
)
page_text_blocks_groups
=
__process_blocks
(
blocks
)
for
text_blocks_group
in
page_text_blocks_groups
:
for
text_blocks_group
in
page_text_blocks_groups
:
if
len
(
text_blocks_group
)
>
0
:
# 需要先在合并前对所有block判断是否为list block
for
block
in
text_blocks_group
:
if
__is_list_block
(
block
):
block
[
'type'
]
=
BlockType
.
List
elif
__is_index_block
(
block
):
block
[
'type'
]
=
BlockType
.
Index
if
len
(
text_blocks_group
)
>
1
:
if
len
(
text_blocks_group
)
>
1
:
# 倒序遍历
# 倒序遍历
for
i
in
range
(
len
(
text_blocks_group
)
-
1
,
-
1
,
-
1
):
for
i
in
range
(
len
(
text_blocks_group
)
-
1
,
-
1
,
-
1
):
current_block
=
text_blocks_group
[
i
]
current_block
=
text_blocks_group
[
i
]
# 检查是否有前一个块
# 检查是否有前一个块
if
i
-
1
>=
0
:
if
i
-
1
>=
0
:
prev_block
=
text_blocks_group
[
i
-
1
]
prev_block
=
text_blocks_group
[
i
-
1
]
__merge_2_blocks
(
current_block
,
prev_block
)
if
current_block
[
'type'
]
==
'text'
and
prev_block
[
'type'
]
==
'text'
:
__merge_2_text_blocks
(
current_block
,
prev_block
)
if
current_block
[
'type'
]
==
BlockType
.
List
and
prev_block
[
'type'
]
==
BlockType
.
List
:
__merge_2_list_blocks
(
current_block
,
prev_block
)
if
current_block
[
'type'
]
==
BlockType
.
Index
and
prev_block
[
'type'
]
==
BlockType
.
Index
:
__merge_2_list_blocks
(
current_block
,
prev_block
)
else
:
else
:
continue
continue
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
702b6ac9
...
@@ -108,7 +108,7 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b
...
@@ -108,7 +108,7 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b
all_bboxes
=
remove_overlaps_min_blocks
(
all_bboxes
)
all_bboxes
=
remove_overlaps_min_blocks
(
all_bboxes
)
all_discarded_blocks
=
remove_overlaps_min_blocks
(
all_discarded_blocks
)
all_discarded_blocks
=
remove_overlaps_min_blocks
(
all_discarded_blocks
)
'''将剩余的bbox做分离处理,防止后面分layout时出错'''
'''将剩余的bbox做分离处理,防止后面分layout时出错'''
#
all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
all_bboxes
,
drop_reasons
=
remove_overlap_between_bbox_for_block
(
all_bboxes
)
return
all_bboxes
,
all_discarded_blocks
return
all_bboxes
,
all_discarded_blocks
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
702b6ac9
...
@@ -49,8 +49,7 @@ def merge_spans_to_line(spans):
...
@@ -49,8 +49,7 @@ def merge_spans_to_line(spans):
continue
continue
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
if
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
if
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
current_line
[
-
1
][
'bbox'
],
0.6
):
current_line
[
-
1
][
'bbox'
]):
current_line
.
append
(
span
)
current_line
.
append
(
span
)
else
:
else
:
# 否则,开始新行
# 否则,开始新行
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment