Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
a817075b
Commit
a817075b
authored
May 06, 2024
by
赵小蒙
Browse files
update discarded block and spans build logic
parent
d4f96a05
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
27 additions
and
10 deletions
+27
-10
magic_pdf/libs/ocr_content_type.py
magic_pdf/libs/ocr_content_type.py
+1
-0
magic_pdf/pdf_parse_union_core.py
magic_pdf/pdf_parse_union_core.py
+11
-6
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+4
-1
magic_pdf/pre_proc/ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+11
-3
No files found.
magic_pdf/libs/ocr_content_type.py
View file @
a817075b
...
@@ -17,4 +17,5 @@ class BlockType:
...
@@ -17,4 +17,5 @@ class BlockType:
Title
=
"title"
Title
=
"title"
InterlineEquation
=
"interline_equation"
InterlineEquation
=
"interline_equation"
Footnote
=
"footnote"
Footnote
=
"footnote"
Discarded
=
"discarded"
magic_pdf/pdf_parse_union_core.py
View file @
a817075b
...
@@ -17,7 +17,8 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
...
@@ -17,7 +17,8 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from
magic_pdf.pre_proc.equations_replace
import
remove_chars_in_text_blocks
,
replace_equations_in_textblock
,
\
from
magic_pdf.pre_proc.equations_replace
import
remove_chars_in_text_blocks
,
replace_equations_in_textblock
,
\
combine_chars_to_pymudict
combine_chars_to_pymudict
from
magic_pdf.pre_proc.ocr_detect_all_bboxes
import
ocr_prepare_bboxes_for_layout_split
from
magic_pdf.pre_proc.ocr_detect_all_bboxes
import
ocr_prepare_bboxes_for_layout_split
from
magic_pdf.pre_proc.ocr_dict_merge
import
sort_blocks_by_layout
,
fill_spans_in_blocks
,
fix_block_spans
from
magic_pdf.pre_proc.ocr_dict_merge
import
sort_blocks_by_layout
,
fill_spans_in_blocks
,
fix_block_spans
,
\
fix_discarded_block
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_overlaps_min_spans
,
get_qa_need_list_v2
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_overlaps_min_spans
,
get_qa_need_list_v2
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
...
@@ -122,15 +123,19 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
...
@@ -122,15 +123,19 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
spans
=
ocr_cut_image_and_table
(
spans
,
pdf_docs
[
page_id
],
page_id
,
pdf_bytes_md5
,
imageWriter
)
spans
=
ocr_cut_image_and_table
(
spans
,
pdf_docs
[
page_id
],
page_id
,
pdf_bytes_md5
,
imageWriter
)
'''将所有区块的bbox整理到一起'''
'''将所有区块的bbox整理到一起'''
all_bboxes
=
ocr_prepare_bboxes_for_layout_split
(
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equations
,
page_w
,
page_h
)
interline_equations
,
page_w
,
page_h
)
'''先处理不需要排版的discarded_blocks'''
discarded_block_with_spans
,
spans
=
fill_spans_in_blocks
(
all_discarded_blocks
,
spans
,
0.4
)
fix_discarded_blocks
=
fix_discarded_block
(
discarded_block_with_spans
)
'''如果当前页面没有bbox则跳过'''
'''如果当前页面没有bbox则跳过'''
if
len
(
all_bboxes
)
==
0
:
if
len
(
all_bboxes
)
==
0
:
logger
.
warning
(
f
"skip this page, not found bbox, page_id:
{
page_id
}
"
)
logger
.
warning
(
f
"skip this page, not found
useful
bbox, page_id:
{
page_id
}
"
)
return
ocr_construct_page_component_v2
([],
[],
page_id
,
page_w
,
page_h
,
[],
return
ocr_construct_page_component_v2
([],
[],
page_id
,
page_w
,
page_h
,
[],
[],
[],
interline_equations
,
discarded_blocks
,
[],
[],
interline_equations
,
fix_
discarded_blocks
,
need_drop
,
drop_reason
)
need_drop
,
drop_reason
)
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
...
@@ -171,7 +176,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
...
@@ -171,7 +176,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
'''将span填入排好序的blocks中'''
'''将span填入排好序的blocks中'''
block_with_spans
=
fill_spans_in_blocks
(
sorted_blocks
,
spans
)
block_with_spans
,
spans
=
fill_spans_in_blocks
(
sorted_blocks
,
spans
,
0.6
)
'''对block进行fix操作'''
'''对block进行fix操作'''
fix_blocks
=
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
)
fix_blocks
=
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
)
...
@@ -181,7 +186,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
...
@@ -181,7 +186,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
'''构造pdf_info_dict'''
'''构造pdf_info_dict'''
page_info
=
ocr_construct_page_component_v2
(
fix_blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
page_info
=
ocr_construct_page_component_v2
(
fix_blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
discarded_blocks
,
images
,
tables
,
interline_equations
,
fix_
discarded_blocks
,
need_drop
,
drop_reason
)
need_drop
,
drop_reason
)
return
page_info
return
page_info
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
a817075b
...
@@ -7,6 +7,7 @@ from magic_pdf.libs.ocr_content_type import BlockType
...
@@ -7,6 +7,7 @@ from magic_pdf.libs.ocr_content_type import BlockType
def
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
def
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equation_blocks
,
page_w
,
page_h
):
title_blocks
,
interline_equation_blocks
,
page_w
,
page_h
):
all_bboxes
=
[]
all_bboxes
=
[]
all_discarded_blocks
=
[]
for
image
in
img_blocks
:
for
image
in
img_blocks
:
x0
,
y0
,
x1
,
y1
=
image
[
'bbox'
]
x0
,
y0
,
x1
,
y1
=
image
[
'bbox'
]
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Image
,
None
,
None
,
None
,
None
])
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Image
,
None
,
None
,
None
,
None
])
...
@@ -38,10 +39,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
...
@@ -38,10 +39,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
for
discarded
in
discarded_blocks
:
for
discarded
in
discarded_blocks
:
x0
,
y0
,
x1
,
y1
=
discarded
[
'bbox'
]
x0
,
y0
,
x1
,
y1
=
discarded
[
'bbox'
]
all_discarded_blocks
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Discarded
,
None
,
None
,
None
,
None
])
# 将footnote加入到all_bboxes中,用来计算layout
if
(
x1
-
x0
)
>
(
page_w
/
3
)
and
(
y1
-
y0
)
>
10
and
y0
>
(
page_h
/
2
):
if
(
x1
-
x0
)
>
(
page_w
/
3
)
and
(
y1
-
y0
)
>
10
and
y0
>
(
page_h
/
2
):
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Footnote
,
None
,
None
,
None
,
None
])
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Footnote
,
None
,
None
,
None
,
None
])
return
all_bboxes
return
all_bboxes
,
all_discarded_blocks
def
fix_text_overlap_title_blocks
(
all_bboxes
):
def
fix_text_overlap_title_blocks
(
all_bboxes
):
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
a817075b
...
@@ -141,7 +141,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
...
@@ -141,7 +141,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
return
sort_blocks
return
sort_blocks
def
fill_spans_in_blocks
(
blocks
,
spans
):
def
fill_spans_in_blocks
(
blocks
,
spans
,
radio
):
'''
'''
将allspans中的span按位置关系,放入blocks中
将allspans中的span按位置关系,放入blocks中
'''
'''
...
@@ -156,7 +156,7 @@ def fill_spans_in_blocks(blocks, spans):
...
@@ -156,7 +156,7 @@ def fill_spans_in_blocks(blocks, spans):
block_spans
=
[]
block_spans
=
[]
for
span
in
spans
:
for
span
in
spans
:
span_bbox
=
span
[
'bbox'
]
span_bbox
=
span
[
'bbox'
]
if
calculate_overlap_area_in_bbox1_area_ratio
(
span_bbox
,
block_bbox
)
>
0.6
:
if
calculate_overlap_area_in_bbox1_area_ratio
(
span_bbox
,
block_bbox
)
>
radio
:
block_spans
.
append
(
span
)
block_spans
.
append
(
span
)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
...
@@ -178,7 +178,7 @@ def fill_spans_in_blocks(blocks, spans):
...
@@ -178,7 +178,7 @@ def fill_spans_in_blocks(blocks, spans):
for
span
in
block_spans
:
for
span
in
block_spans
:
spans
.
remove
(
span
)
spans
.
remove
(
span
)
return
block_with_spans
return
block_with_spans
,
spans
def
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
):
def
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
):
...
@@ -204,6 +204,14 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
...
@@ -204,6 +204,14 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
return
fix_blocks
return
fix_blocks
def
fix_discarded_block
(
discarded_block_with_spans
):
fix_discarded_blocks
=
[]
for
block
in
discarded_block_with_spans
:
block
=
fix_text_block
(
block
)
fix_discarded_blocks
.
append
(
block
)
return
fix_discarded_blocks
def
merge_spans_to_block
(
spans
:
list
,
block_bbox
:
list
,
block_type
:
str
):
def
merge_spans_to_block
(
spans
:
list
,
block_bbox
:
list
,
block_type
:
str
):
block_spans
=
[]
block_spans
=
[]
# 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
# 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment