Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
f5dc261d
Commit
f5dc261d
authored
Mar 13, 2024
by
liukaiwen
Browse files
Merge branch 'master' into dev-in-line-bbox
parents
1f468bed
32fd7f95
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
67 additions
and
45 deletions
+67
-45
magic_pdf/libs/draw_bbox.py
magic_pdf/libs/draw_bbox.py
+18
-13
magic_pdf/pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+4
-2
magic_pdf/pre_proc/ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+0
-13
magic_pdf/pre_proc/ocr_remove_spans.py
magic_pdf/pre_proc/ocr_remove_spans.py
+0
-17
magic_pdf/pre_proc/ocr_span_list_modify.py
magic_pdf/pre_proc/ocr_span_list_modify.py
+45
-0
No files found.
magic_pdf/libs/draw_bbox.py
View file @
f5dc261d
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
def
draw_bbox
(
i
,
bbox_list
,
page
,
rgb_config
):
def
draw_bbox
_without_number
(
i
,
bbox_list
,
page
,
rgb_config
):
new_rgb
=
[]
for
item
in
rgb_config
:
item
=
float
(
item
)
/
255
...
...
@@ -12,6 +12,19 @@ def draw_bbox(i, bbox_list, page, rgb_config):
page
.
draw_rect
(
rect_coords
,
color
=
new_rgb
,
fill
=
None
,
width
=
0.5
,
overlay
=
True
)
# Draw the rectangle
def draw_bbox_with_number(i, bbox_list, page, rgb_config):
    """Outline the bboxes stored at ``bbox_list[i]`` on ``page`` and number them.

    Args:
        i: Index into ``bbox_list`` selecting the boxes to draw.
        bbox_list: Indexable collection; ``bbox_list[i]`` is an iterable of
            4-item boxes unpacked as ``(x0, y0, x1, y1)``.
        page: Object exposing ``draw_rect`` and ``insert_text``
            (presumably a PyMuPDF page -- confirm against the caller).
        rgb_config: Iterable of colour channels; each one is converted to
            ``float`` and divided by 255 before being used as the colour.

    Returns:
        None. The drawing happens through calls on ``page``.
    """
    # Scale every colour channel by 1/255 once, up front.
    scaled_rgb = [float(channel) / 255 for channel in rgb_config]

    for label, (x0, y0, x1, y1) in enumerate(bbox_list[i], start=1):
        # Unfilled rectangle outline around the box.
        outline = fitz.Rect(x0, y0, x1, y1)
        page.draw_rect(outline, color=scaled_rgb, fill=None, width=0.5, overlay=True)
        # 1-based position of the box, written at the (x0, y0) corner.
        page.insert_text((x0, y0), str(label), fontsize=10, color=scaled_rgb)
def
draw_layout_bbox
(
pdf_info_dict
,
input_path
,
out_path
):
layout_bbox_list
=
[]
for
page
in
pdf_info_dict
.
values
():
...
...
@@ -22,13 +35,7 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
doc
=
fitz
.
open
(
input_path
)
for
i
,
page
in
enumerate
(
doc
):
# 获取当前页面的数据
page_data
=
layout_bbox_list
[
i
]
for
j
,
bbox
in
enumerate
(
page_data
):
x0
,
y0
,
x1
,
y1
=
bbox
rect_coords
=
fitz
.
Rect
(
x0
,
y0
,
x1
,
y1
)
# Define the rectangle
page
.
draw_rect
(
rect_coords
,
color
=
(
1
,
0
,
0
),
fill
=
None
,
width
=
0.5
,
overlay
=
True
)
# Draw the rectangle
page
.
insert_text
((
x0
,
y0
),
str
(
j
+
1
),
fontsize
=
10
,
color
=
(
1
,
0
,
0
))
# Insert the index at the top left corner of the rectangle
draw_bbox_with_number
(
i
,
layout_bbox_list
,
page
,
[
255
,
0
,
0
])
# Save the PDF
doc
.
save
(
f
"
{
out_path
}
/layout.pdf"
)
...
...
@@ -56,11 +63,9 @@ def draw_text_bbox(pdf_info_dict, input_path, out_path):
doc
=
fitz
.
open
(
input_path
)
for
i
,
page
in
enumerate
(
doc
):
# 获取当前页面的数据
draw_bbox
(
i
,
text_list
,
page
,
[
255
,
0
,
0
])
draw_bbox
(
i
,
inline_equation_list
,
page
,
[
0
,
255
,
0
])
draw_bbox
(
i
,
displayed_equation_list
,
page
,
[
0
,
0
,
255
])
draw_bbox_without_number
(
i
,
text_list
,
page
,
[
255
,
0
,
0
])
draw_bbox_without_number
(
i
,
inline_equation_list
,
page
,
[
0
,
255
,
0
])
draw_bbox_without_number
(
i
,
displayed_equation_list
,
page
,
[
0
,
0
,
255
])
# Save the PDF
doc
.
save
(
f
"
{
out_path
}
/text.pdf"
)
magic_pdf/pdf_parse_by_ocr.py
View file @
f5dc261d
...
...
@@ -22,12 +22,12 @@ from magic_pdf.pre_proc.detect_page_number import parse_pageNos
from
magic_pdf.pre_proc.ocr_cut_image
import
cut_image_and_table
from
magic_pdf.pre_proc.ocr_detect_layout
import
layout_detect
from
magic_pdf.pre_proc.ocr_dict_merge
import
(
remove_overlaps_min_spans
,
merge_spans_to_line_by_layout
,
modify_y_axis
,
modify_inline_equation
)
from
magic_pdf.pre_proc.ocr_remove_spans
import
remove_spans_by_bboxes
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_spans_by_bboxes
,
remove_overlaps_min_spans
,
\
adjust_bbox_for_standalone_block
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
...
...
@@ -195,6 +195,8 @@ def parse_pdf_by_ocr(
spans
=
remove_overlap_between_bbox
(
spans
)
# Extra handling for type in ["displayed_equation", "image", "table"]: if there is text to the left, adjust the span's bbox y0 so that it is no higher than the text's y0
spans
=
adjust_bbox_for_standalone_block
(
spans
)
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
layout_bboxes
,
layout_tree
=
layout_detect
(
ocr_page_info
[
'subfield_dets'
],
page
,
ocr_page_info
)
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
f5dc261d
...
...
@@ -4,19 +4,6 @@ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox
calculate_overlap_area_in_bbox1_area_ratio
# 删除重叠spans中较小的那些
def remove_overlaps_min_spans(spans):
    """Drop the smaller span of every heavily overlapping pair, in place.

    Each pair of unequal spans is passed to
    ``get_minbox_if_overlap_by_ratio`` with a ratio of 0.8. When that helper
    returns a bbox (presumably the smaller of the two boxes -- confirm
    against ``boxbase``), the first span in ``spans`` whose ``'bbox'`` equals
    it is removed.

    Args:
        spans: List of span dicts, each with a ``'bbox'`` entry.

    Returns:
        The same ``spans`` list, after the removals.
    """
    # The outer snapshot is taken once; the inner snapshot is refreshed on
    # every outer pass, so it already reflects earlier removals.
    for first in spans.copy():
        for second in spans.copy():
            if first == second:
                continue
            smaller_bbox = get_minbox_if_overlap_by_ratio(first['bbox'], second['bbox'], 0.8)
            if smaller_bbox is None:
                continue
            # Remove only the first span carrying the returned bbox.
            for candidate in spans:
                if candidate['bbox'] == smaller_bbox:
                    spans.remove(candidate)
                    break
    return spans
# 将每一个line中的span从左到右排序
def
line_sort_spans_by_left_to_right
(
lines
):
line_objects
=
[]
...
...
magic_pdf/pre_proc/ocr_remove_spans.py
deleted
100644 → 0
View file @
1f468bed
from
magic_pdf.libs.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
    """Remove, in place, every span that mostly overlaps one of the given bboxes.

    A span is dropped when ``calculate_overlap_area_in_bbox1_area_ratio``
    for its ``'bbox'`` and any bbox in ``need_remove_spans_bboxes`` is
    greater than 0.5; all other spans are kept.

    Args:
        spans: List of span dicts, each with a ``'bbox'`` entry.
        need_remove_spans_bboxes: Bboxes that mark regions to clear.

    Returns:
        The same ``spans`` list, after the removals.
    """
    # Collect first, remove afterwards, so the list is never mutated while
    # it is being scanned. ``any`` stops at the first matching bbox.
    to_drop = [
        span
        for span in spans
        if any(
            calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5
            for removed_bbox in need_remove_spans_bboxes
        )
    ]
    for span in to_drop:
        spans.remove(span)
    return spans
magic_pdf/pre_proc/ocr_span_list_modify.py
0 → 100644
View file @
f5dc261d
from
magic_pdf.libs.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
,
get_minbox_if_overlap_by_ratio
def remove_overlaps_min_spans(spans):
    """Remove the smaller span from each pair of strongly overlapping spans.

    Every pair of unequal spans is checked with
    ``get_minbox_if_overlap_by_ratio`` using a ratio of 0.65. If the helper
    returns a bbox (presumably the smaller box of the pair -- confirm against
    ``boxbase``), the first span in ``spans`` with that exact ``'bbox'`` is
    removed from the list.

    Args:
        spans: List of span dicts, each with a ``'bbox'`` entry. Mutated in place.

    Returns:
        The same ``spans`` list, after the removals.
    """
    # Iterate over snapshots so removals do not disturb the loops; the inner
    # snapshot is rebuilt per outer item and therefore sees earlier removals.
    for span_a in spans.copy():
        for span_b in spans.copy():
            if span_a == span_b:
                continue
            min_bbox = get_minbox_if_overlap_by_ratio(span_a['bbox'], span_b['bbox'], 0.65)
            if min_bbox is None:
                continue
            # Only the first span whose bbox matches is removed.
            for candidate in spans:
                if candidate['bbox'] == min_bbox:
                    spans.remove(candidate)
                    break
    return spans
def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
    """Delete spans whose bbox overlaps any of the given bboxes by more than half.

    For each span, ``calculate_overlap_area_in_bbox1_area_ratio`` is evaluated
    against the bboxes in ``need_remove_spans_bboxes``; the span is removed
    as soon as one result is greater than 0.5, otherwise it is kept.

    Args:
        spans: List of span dicts, each with a ``'bbox'`` entry. Mutated in place.
        need_remove_spans_bboxes: Bboxes to test each span against.

    Returns:
        The same ``spans`` list, after the removals.
    """

    def _should_drop(span):
        # True as soon as one bbox overlaps the span by more than 0.5.
        return any(
            calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5
            for removed_bbox in need_remove_spans_bboxes
        )

    # Decide on every span before removing anything from the list.
    doomed = [span for span in spans if _should_drop(span)]
    for span in doomed:
        spans.remove(span)
    return spans
def adjust_bbox_for_standalone_block(spans):
    """Lower the top edge of standalone blocks that have text to their left.

    For each span of type "displayed_equation", "image" or "table", every
    span of type 'text' or 'inline_equation' is examined. When the block's
    vertical extent strictly contains the text span's (block y0 < text y0
    and block y1 > text y1) and the text span starts further left
    (text x0 < block x0), the block's y0 is set to the text span's y0.

    Args:
        spans: List of span dicts with ``'type'`` and a mutable ``'bbox'``
            indexed as ``[x0, y0, x1, y1]``. Block bboxes are edited in place.

    Returns:
        The same ``spans`` list.
    """
    standalone_types = ["displayed_equation", "image", "table"]
    inline_types = ['text', 'inline_equation']

    for block in spans:
        if block['type'] not in standalone_types:
            continue
        for neighbour in spans:
            if neighbour['type'] not in inline_types:
                continue
            block_bbox = block['bbox']
            text_bbox = neighbour['bbox']
            # Is the text span's vertical extent strictly inside the block's?
            spans_text_vertically = block_bbox[1] < text_bbox[1] and block_bbox[3] > text_bbox[3]
            # Does the text span start to the left of the block?
            if spans_text_vertically and text_bbox[0] < block_bbox[0]:
                # Align the block's top edge with the text span's top edge.
                block_bbox[1] = text_bbox[1]
    return spans
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment