Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
f9310954
Unverified
Commit
f9310954
authored
Mar 13, 2024
by
myhloli
Committed by
GitHub
Mar 13, 2024
Browse files
Merge pull request #4 from myhloli/dev-in-line-bbox
Dev in line bbox
parents
32fd7f95
64d67b5c
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
106 additions
and
98 deletions
+106
-98
magic_pdf/pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+5
-3
magic_pdf/pre_proc/ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+0
-95
magic_pdf/pre_proc/ocr_span_list_modify.py
magic_pdf/pre_proc/ocr_span_list_modify.py
+101
-0
No files found.
magic_pdf/pdf_parse_by_ocr.py
View file @
f9310954
...
@@ -23,10 +23,9 @@ from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
...
@@ -23,10 +23,9 @@ from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
from
magic_pdf.pre_proc.ocr_detect_layout
import
layout_detect
from
magic_pdf.pre_proc.ocr_detect_layout
import
layout_detect
from
magic_pdf.pre_proc.ocr_dict_merge
import
(
from
magic_pdf.pre_proc.ocr_dict_merge
import
(
merge_spans_to_line_by_layout
,
merge_spans_to_line_by_layout
,
modify_y_axis
)
)
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_spans_by_bboxes
,
remove_overlaps_min_spans
,
\
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_spans_by_bboxes
,
remove_overlaps_min_spans
,
\
adjust_bbox_for_standalone_block
adjust_bbox_for_standalone_block
,
modify_y_axis
,
modify_inline_equation
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
...
@@ -184,8 +183,11 @@ def parse_pdf_by_ocr(
...
@@ -184,8 +183,11 @@ def parse_pdf_by_ocr(
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
)
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
)
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
displayed_list
=
[]
text_inline_lines
=
[]
modify_y_axis
(
spans
,
displayed_list
,
text_inline_lines
)
# 模型识别错误的行间公式, type类型转换成行内公式
# 模型识别错误的行间公式, type类型转换成行内公式
spans
=
modify_
y_axis
(
span
s
)
spans
=
modify_
inline_equation
(
spans
,
displayed_list
,
text_inline_line
s
)
# bbox去除粘连
# bbox去除粘连
spans
=
remove_overlap_between_bbox
(
spans
)
spans
=
remove_overlap_between_bbox
(
spans
)
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
f9310954
...
@@ -81,101 +81,6 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
...
@@ -81,101 +81,6 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
def modify_y_axis(spans: list):
    """Group spans into lines, align text lines vertically, fix equation types.

    The function works in three steps, all of which mutate the span dicts
    in place:

    1. Sort ``spans`` by ``bbox[1]`` (top edge) and walk them to build lines.
       A span of type ``"displayed_equation"``, ``"image"`` or ``"table"``
       always forms a line of its own; other spans join the current line
       when ``__is_overlaps_y_exceeds_threshold`` accepts them against the
       last span of that line.
    2. For every text/inline line, sort its spans by ``bbox[0]`` and set each
       span's ``bbox[1]``/``bbox[3]`` to the smallest top / largest bottom
       seen in that line.
    3. For every block-level span that crosses a text line and passes the
       overlap helper, snap its ``bbox[1]`` to the line top and retype a
       ``"displayed_equation"`` as ``"inline_equation"``.

    Args:
        spans: list of span dicts, each with a ``"type"`` string and a
            mutable 4-item ``"bbox"`` (indices 0/1/3 are used here as
            x0/y0/y1).

    Returns:
        The same ``spans`` list, sorted by top edge, with the modifications
        above applied. An empty list is returned unchanged.
    """
    # Nothing to group on an empty page; spans[0] below would raise.
    if not spans:
        return spans

    block_types = ("displayed_equation", "image", "table")
    text_types = ("text", "inline_equation")

    displayed_list = []
    # (line_spans, (y0, y1)) pairs; searched later by the block-level spans.
    text_inline_lines = []

    spans.sort(key=lambda span: span['bbox'][1])

    first = spans[0]
    current_line = [first]
    if first["type"] in block_types:
        displayed_list.append(first)
    line_first_y0 = first["bbox"][1]
    line_first_y = first["bbox"][3]

    for span in spans[1:]:
        # A block-level span, or any span following one, starts a new line.
        if span['type'] in block_types or any(
                s['type'] in block_types for s in current_line):
            if span["type"] in block_types:
                displayed_list.append(span)
            # Only lines made of text/inline spans are kept for the search.
            if len(current_line) > 1 or current_line[0]["type"] in text_types:
                text_inline_lines.append(
                    (current_line, (line_first_y0, line_first_y)))
            current_line = [span]
            line_first_y0 = span["bbox"][1]
            line_first_y = span["bbox"][3]
            continue

        if __is_overlaps_y_exceeds_threshold(
                span['bbox'], current_line[-1]['bbox']):
            # Same line: widen the line's vertical extent to cover the span.
            line_first_y0 = min(line_first_y0, span["bbox"][1])
            line_first_y = max(line_first_y, span["bbox"][3])
            current_line.append(span)
        else:
            # No sufficient vertical overlap: close the line, start another.
            text_inline_lines.append(
                (current_line, (line_first_y0, line_first_y)))
            current_line = [span]
            line_first_y0 = span["bbox"][1]
            line_first_y = span["bbox"][3]

    # Flush the last line.
    if len(current_line) > 1 or current_line[0]["type"] in text_types:
        text_inline_lines.append((current_line, (line_first_y0, line_first_y)))

    for line_spans, (line_y0, line_y1) in text_inline_lines:
        # Order each line left to right, then give all of its spans the
        # same top and bottom.
        line_spans.sort(key=lambda span: span['bbox'][0])
        for span in line_spans:
            span["bbox"][1] = line_y0
            span["bbox"][3] = line_y1

    # Convert displayed equations that really sit on a text line.
    # `j` is shared by all block-level spans: both lists are in top-to-bottom
    # order, so the search never has to move backwards.
    j = 0
    for span in displayed_list:
        span_y0, span_y = span["bbox"][1], span["bbox"][3]

        while j < len(text_inline_lines):
            y0, y1 = text_inline_lines[j][1]
            crosses_line = (
                span_y0 < y0 < span_y
                or span_y0 < y1 < span_y
                or span_y0 < y0 and span_y > y1
            )
            if crosses_line and __is_overlaps_y_exceeds_threshold(
                    span['bbox'], (0, y0, 0, y1)):
                span["bbox"][1] = y0
                if span["type"] == "displayed_equation":
                    span["type"] = "inline_equation"
                break
            # Stop when the span lies above this line, or crosses its top
            # edge without enough overlap (the helper was already evaluated
            # above with the same arguments and rejected it; assumes the
            # helper is a pure predicate).
            if span_y < y0 or span_y0 < y0 < span_y:
                break
            j += 1

    return spans
magic_pdf/pre_proc/ocr_span_list_modify.py
View file @
f9310954
...
@@ -43,3 +43,104 @@ def adjust_bbox_for_standalone_block(spans):
...
@@ -43,3 +43,104 @@ def adjust_bbox_for_standalone_block(spans):
# 调整span的y0和span2的y0一致
# 调整span的y0和span2的y0一致
sb_span
[
'bbox'
][
1
]
=
text_span
[
'bbox'
][
1
]
sb_span
[
'bbox'
][
1
]
=
text_span
[
'bbox'
][
1
]
return
spans
return
spans
def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
    """Group spans into lines and align each text line's vertical extent.

    ``spans`` is sorted in place by ``bbox[1]`` (top edge) and walked to
    build lines. A span of type ``"displayed_equation"``, ``"image"`` or
    ``"table"`` always forms a line of its own; other spans join the current
    line when ``__is_overlaps_y_exceeds_threshold`` accepts them against the
    last span of that line. Whenever a ``"text"`` span joins a line, the
    line's vertical extent is reset to that span's ``bbox[1]``/``bbox[3]``.
    Finally every collected line is sorted by ``bbox[0]`` and all of its
    spans receive the line's top and bottom.

    Args:
        spans: list of span dicts, each with a ``"type"`` string and a
            mutable 4-item ``"bbox"`` (indices 0/1/3 are used here as
            x0/y0/y1). Sorted and modified in place.
        displayed_list: output list; every displayed-equation, image and
            table span is appended to it in top-to-bottom order.
        text_inline_lines: output list; receives one
            ``(line_spans, (y0, y1))`` tuple per text/inline line.

    Returns:
        None. Results are delivered through the mutated arguments. With an
        empty ``spans`` nothing is appended to either output list.
    """
    # Nothing to group on an empty page; spans[0] below would raise.
    if not spans:
        return

    block_types = ("displayed_equation", "image", "table")
    text_types = ("text", "inline_equation")

    spans.sort(key=lambda span: span['bbox'][1])

    first = spans[0]
    current_line = [first]
    if first["type"] in block_types:
        displayed_list.append(first)
    line_first_y0 = first["bbox"][1]
    line_first_y = first["bbox"][3]

    for span in spans[1:]:
        # A block-level span, or any span following one, starts a new line.
        if span['type'] in block_types or any(
                s['type'] in block_types for s in current_line):
            if span["type"] in block_types:
                displayed_list.append(span)
            # Only lines made of text/inline spans are kept for the search.
            if len(current_line) > 1 or current_line[0]["type"] in text_types:
                text_inline_lines.append(
                    (current_line, (line_first_y0, line_first_y)))
            current_line = [span]
            line_first_y0 = span["bbox"][1]
            line_first_y = span["bbox"][3]
            continue

        if __is_overlaps_y_exceeds_threshold(
                span['bbox'], current_line[-1]['bbox']):
            # Same line. Text spans dictate the line height, so inline
            # equations end up with the height of the text around them.
            if span["type"] == "text":
                line_first_y0 = span["bbox"][1]
                line_first_y = span["bbox"][3]
            current_line.append(span)
        else:
            # No sufficient vertical overlap: close the line, start another.
            text_inline_lines.append(
                (current_line, (line_first_y0, line_first_y)))
            current_line = [span]
            line_first_y0 = span["bbox"][1]
            line_first_y = span["bbox"][3]

    # Flush the last line.
    if len(current_line) > 1 or current_line[0]["type"] in text_types:
        text_inline_lines.append((current_line, (line_first_y0, line_first_y)))

    for line_spans, (line_y0, line_y1) in text_inline_lines:
        # Order each line left to right, then give all of its spans the
        # same top and bottom.
        line_spans.sort(key=lambda span: span['bbox'][0])
        for span in line_spans:
            span["bbox"][1] = line_y0
            span["bbox"][3] = line_y1
def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
    """Retype displayed equations that actually sit on a text line.

    For each span in ``displayed_list`` the text lines are scanned for one
    that the span crosses vertically and that
    ``__is_overlaps_y_exceeds_threshold`` accepts. On a match the span's
    ``bbox[1]`` is snapped to the line top. If the span is a
    ``"displayed_equation"`` it becomes an ``"inline_equation"``, unless the
    helper also accepts it against the following text line (in which case
    the type is left alone).

    Args:
        spans: the page's span list; not inspected here, only returned.
        displayed_list: block-level spans (dicts with ``"type"`` and a
            mutable ``"bbox"``), modified in place. Presumably in
            top-to-bottom order, as the forward-only scan relies on it --
            verify against the caller.
        text_inline_lines: ``(line_spans, (y0, y1))`` tuples; only the
            ``(y0, y1)`` part is read.

    Returns:
        ``spans``, the same list object that was passed in.
    """
    # `j` is shared by all block-level spans: the search over text lines
    # only ever moves forward.
    j = 0
    for span in displayed_list:
        span_y0, span_y = span["bbox"][1], span["bbox"][3]

        while j < len(text_inline_lines):
            y0, y1 = text_inline_lines[j][1]
            crosses_line = (
                span_y0 < y0 < span_y
                or span_y0 < y1 < span_y
                or span_y0 < y0 and span_y > y1
            )
            if crosses_line and __is_overlaps_y_exceeds_threshold(
                    span['bbox'], (0, y0, 0, y1)):
                span["bbox"][1] = y0
                if span["type"] == "displayed_equation":
                    # Keep the equation displayed when it also overlaps the
                    # next text line.
                    overlaps_next_line = False
                    if j + 1 < len(text_inline_lines):
                        y0_next, y1_next = text_inline_lines[j + 1][1]
                        overlaps_next_line = __is_overlaps_y_exceeds_threshold(
                            span['bbox'], (0, y0_next, 0, y1_next))
                    if not overlaps_next_line:
                        span["type"] = "inline_equation"
                break
            # Stop when the span lies above this line, or crosses its top
            # edge without enough overlap (the helper was already evaluated
            # above with the same arguments and rejected it; assumes the
            # helper is a pure predicate).
            if span_y < y0 or span_y0 < y0 < span_y:
                break
            j += 1

    return spans
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment