Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
32fd7f95
Commit
32fd7f95
authored
Mar 13, 2024
by
赵小蒙
Browse files
将对span的操作移动到ocr_span_list_modify,增加独占一行区块的位置调整逻辑
parent
86dc22ca
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
49 additions
and
32 deletions
+49
-32
magic_pdf/pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+4
-2
magic_pdf/pre_proc/ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+0
-13
magic_pdf/pre_proc/ocr_remove_spans.py
magic_pdf/pre_proc/ocr_remove_spans.py
+0
-17
magic_pdf/pre_proc/ocr_span_list_modify.py
magic_pdf/pre_proc/ocr_span_list_modify.py
+45
-0
No files found.
magic_pdf/pdf_parse_by_ocr.py
View file @
32fd7f95
...
...
@@ -22,11 +22,11 @@ from magic_pdf.pre_proc.detect_page_number import parse_pageNos
from
magic_pdf.pre_proc.ocr_cut_image
import
cut_image_and_table
from
magic_pdf.pre_proc.ocr_detect_layout
import
layout_detect
from
magic_pdf.pre_proc.ocr_dict_merge
import
(
remove_overlaps_min_spans
,
merge_spans_to_line_by_layout
,
modify_y_axis
)
from
magic_pdf.pre_proc.ocr_remove_spans
import
remove_spans_by_bboxes
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_spans_by_bboxes
,
remove_overlaps_min_spans
,
\
adjust_bbox_for_standalone_block
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
...
...
@@ -191,6 +191,8 @@ def parse_pdf_by_ocr(
spans
=
remove_overlap_between_bbox
(
spans
)
# 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
spans
=
adjust_bbox_for_standalone_block
(
spans
)
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
layout_bboxes
,
layout_tree
=
layout_detect
(
ocr_page_info
[
'subfield_dets'
],
page
,
ocr_page_info
)
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
32fd7f95
...
...
@@ -4,19 +4,6 @@ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox
calculate_overlap_area_in_bbox1_area_ratio
def remove_overlaps_min_spans(spans):
    """Remove the smaller span of each strongly overlapping pair, in place.

    Every ordered pair of unequal spans is handed to
    ``get_minbox_if_overlap_by_ratio`` together with a ratio of 0.65. When
    that helper returns a bbox, the first span still present in ``spans``
    whose ``'bbox'`` equals it is removed.

    NOTE(review): going by its name, the helper returns the smaller of the
    two boxes when their overlap exceeds the ratio and None otherwise --
    confirm against magic_pdf.libs.boxbase.

    Args:
        spans: list of span mappings, each with a ``'bbox'`` entry. Mutated.

    Returns:
        The same ``spans`` list after the removals.
    """
    # The outer snapshot is taken once, so spans removed along the way are
    # still visited as ``outer``.
    for outer in list(spans):
        # The inner snapshot is retaken per outer span and therefore reflects
        # the removals made so far.
        for inner in list(spans):
            if outer != inner:
                smaller_box = get_minbox_if_overlap_by_ratio(outer['bbox'], inner['bbox'], 0.65)
                if smaller_box is None:
                    continue
                # Drop only the first live span carrying the returned bbox.
                for candidate in spans:
                    if candidate['bbox'] == smaller_box:
                        spans.remove(candidate)
                        break
    return spans
# 将每一个line中的span从左到右排序
def
line_sort_spans_by_left_to_right
(
lines
):
line_objects
=
[]
...
...
magic_pdf/pre_proc/ocr_remove_spans.py
deleted
100644 → 0
View file @
86dc22ca
from
magic_pdf.libs.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
    """Remove, in place, the spans that overlap any of the given bboxes.

    A span is removed when ``calculate_overlap_area_in_bbox1_area_ratio``,
    called with the span's bbox as first argument and a bbox from
    ``need_remove_spans_bboxes`` as second, returns more than 0.5 for at
    least one of those bboxes.

    NOTE(review): the helper presumably returns the share of the first
    bbox's area covered by the second -- confirm in magic_pdf.libs.boxbase.

    Args:
        spans: list of span mappings, each with a ``'bbox'`` entry. Mutated.
        need_remove_spans_bboxes: bboxes marking the regions to clear.

    Returns:
        The same ``spans`` list after the removals.
    """
    # Decide first, mutate afterwards: the list is never modified while it is
    # being scanned.
    doomed = [
        span
        for span in spans
        if any(
            calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5
            for removed_bbox in need_remove_spans_bboxes
        )
    ]
    for span in doomed:
        spans.remove(span)
    return spans
magic_pdf/pre_proc/ocr_span_list_modify.py
0 → 100644
View file @
32fd7f95
from
magic_pdf.libs.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
,
get_minbox_if_overlap_by_ratio
def remove_overlaps_min_spans(spans):
    """Remove the smaller span of each strongly overlapping pair, in place.

    Every ordered pair of unequal spans is handed to
    ``get_minbox_if_overlap_by_ratio`` together with a ratio of 0.65. When
    that helper returns a bbox, the first span still present in ``spans``
    whose ``'bbox'`` equals it is removed.

    NOTE(review): going by its name, the helper returns the smaller of the
    two boxes when their overlap exceeds the ratio and None otherwise --
    confirm against magic_pdf.libs.boxbase.

    Args:
        spans: list of span mappings, each with a ``'bbox'`` entry. Mutated.

    Returns:
        The same ``spans`` list after the removals.
    """
    # The outer snapshot is taken once, so spans removed along the way are
    # still visited as ``first``.
    for first in list(spans):
        # The inner snapshot is retaken per outer span and therefore reflects
        # the removals made so far.
        for second in list(spans):
            if first != second:
                min_box = get_minbox_if_overlap_by_ratio(first['bbox'], second['bbox'], 0.65)
                if min_box is None:
                    continue
                # Drop only the first live span carrying the returned bbox.
                for live_span in spans:
                    if live_span['bbox'] == min_box:
                        spans.remove(live_span)
                        break
    return spans
def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
    """Remove, in place, the spans that overlap any of the given bboxes.

    A span is removed when ``calculate_overlap_area_in_bbox1_area_ratio``,
    called with the span's bbox as first argument and a bbox from
    ``need_remove_spans_bboxes`` as second, returns more than 0.5 for at
    least one of those bboxes.

    NOTE(review): the helper presumably returns the share of the first
    bbox's area covered by the second -- confirm in magic_pdf.libs.boxbase.

    Args:
        spans: list of span mappings, each with a ``'bbox'`` entry. Mutated.
        need_remove_spans_bboxes: bboxes marking the regions to clear.

    Returns:
        The same ``spans`` list after the removals.
    """
    # Collect the hits before touching the list so that it is never modified
    # while being scanned.
    hits = [
        span
        for span in spans
        if any(
            calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5
            for removed_bbox in need_remove_spans_bboxes
        )
    ]
    for hit in hits:
        spans.remove(hit)
    return spans
def adjust_bbox_for_standalone_block(spans):
    """Align the y0 of standalone blocks with text sitting to their left.

    For every span of type "displayed_equation", "image" or "table", each
    span of type 'text' or 'inline_equation' is examined. When the text
    span's vertical extent lies strictly inside the block's (block y0 < text
    y0 and block y1 > text y1) and the text starts further left (text x0 <
    block x0), the block's y0 is overwritten with the text's y0.

    Each adjustment raises the block's y0 value, so after several matching
    text spans the block ends up with the largest matching text y0.

    Args:
        spans: list of span mappings with 'type' and a mutable 'bbox'
            sequence indexed as [x0, y0, x1, y1]. Block bboxes are edited
            in place.

    Returns:
        The same ``spans`` list.
    """
    standalone_types = ("displayed_equation", "image", "table")
    text_types = ('text', 'inline_equation')

    for block in spans:
        if block['type'] not in standalone_types:
            continue
        for neighbour in spans:
            if neighbour['type'] not in text_types:
                continue
            block_box = block['bbox']
            text_box = neighbour['bbox']
            # The text must be vertically contained in the block ...
            inside_vertically = block_box[1] < text_box[1] and block_box[3] > text_box[3]
            # ... and begin to the left of it.
            if inside_vertically and text_box[0] < block_box[0]:
                block_box[1] = text_box[1]
    return spans
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment