Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
864e9535
Commit
864e9535
authored
Mar 08, 2024
by
赵小蒙
Browse files
span->line现基于模型的layout进行拼接
parent
0c279ffc
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
51 additions
and
24 deletions
+51
-24
demo/ocr_demo.py
demo/ocr_demo.py
+1
-1
magic_pdf/pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+3
-3
magic_pdf/pre_proc/ocr_detect_layout.py
magic_pdf/pre_proc/ocr_detect_layout.py
+2
-2
magic_pdf/pre_proc/ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+45
-18
No files found.
demo/ocr_demo.py
View file @
864e9535
...
@@ -57,4 +57,4 @@ if __name__ == '__main__':
...
@@ -57,4 +57,4 @@ if __name__ == '__main__':
# logger.info(markdown_content)
# logger.info(markdown_content)
# save_markdown(markdown_text, ocr_json_file_path)
# save_markdown(markdown_text, ocr_json_file_path)
except
Exception
as
e
:
except
Exception
as
e
:
logger
.
e
rror
(
e
)
logger
.
e
xception
(
e
)
magic_pdf/pdf_parse_by_ocr.py
View file @
864e9535
...
@@ -11,7 +11,7 @@ from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
...
@@ -11,7 +11,7 @@ from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
from
magic_pdf.pre_proc.detect_header
import
parse_headers
from
magic_pdf.pre_proc.detect_header
import
parse_headers
from
magic_pdf.pre_proc.detect_page_number
import
parse_pageNos
from
magic_pdf.pre_proc.detect_page_number
import
parse_pageNos
from
magic_pdf.pre_proc.ocr_detect_layout
import
layout_detect
from
magic_pdf.pre_proc.ocr_detect_layout
import
layout_detect
from
magic_pdf.pre_proc.ocr_dict_merge
import
merge_spans_to_line
,
remove_overlaps_min_spans
from
magic_pdf.pre_proc.ocr_dict_merge
import
remove_overlaps_min_spans
,
merge_spans_to_line_by_layout
from
magic_pdf.pre_proc.ocr_remove_spans
import
remove_spans_by_bboxes
from
magic_pdf.pre_proc.ocr_remove_spans
import
remove_spans_by_bboxes
...
@@ -151,10 +151,10 @@ def parse_pdf_by_ocr(
...
@@ -151,10 +151,10 @@ def parse_pdf_by_ocr(
# 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
# 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
layout_bboxes
=
layout_detect
(
ocr_page_info
[
'subfield_dets'
],
page
)
layout_bboxes
=
layout_detect
(
ocr_page_info
[
'subfield_dets'
],
page
,
ocr_page_info
)
# 将spans合并成line(在layout内,从上到下,从左到右)
# 将spans合并成line(在layout内,从上到下,从左到右)
lines
=
merge_spans_to_line
(
spans
,
layout_bboxes
)
lines
=
merge_spans_to_line
_by_layout
(
spans
,
layout_bboxes
)
# logger.info(lines)
# logger.info(lines)
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
...
...
magic_pdf/pre_proc/ocr_detect_layout.py
View file @
864e9535
...
@@ -66,7 +66,7 @@ def adjust_layouts(layout_bboxes):
...
@@ -66,7 +66,7 @@ def adjust_layouts(layout_bboxes):
return
layout_bboxes
return
layout_bboxes
def
layout_detect
(
layout_info
,
page
:
fitz
.
Page
):
def
layout_detect
(
layout_info
,
page
:
fitz
.
Page
,
ocr_page_info
):
"""
"""
对输入的布局信息进行解析,提取出每个子布局的边界框,并对所有子布局进行排序调整。
对输入的布局信息进行解析,提取出每个子布局的边界框,并对所有子布局进行排序调整。
...
@@ -77,7 +77,7 @@ def layout_detect(layout_info, page: fitz.Page):
...
@@ -77,7 +77,7 @@ def layout_detect(layout_info, page: fitz.Page):
list: 经过排序调整后的所有子布局边界框信息的列表,每个边界框信息为字典类型,包含'layout_bbox'字段,表示边界框的坐标信息。
list: 经过排序调整后的所有子布局边界框信息的列表,每个边界框信息为字典类型,包含'layout_bbox'字段,表示边界框的坐标信息。
"""
"""
horizontal_scale_ratio
,
vertical_scale_ratio
=
get_scale_ratio
(
layout
_info
,
page
)
horizontal_scale_ratio
,
vertical_scale_ratio
=
get_scale_ratio
(
ocr_page
_info
,
page
)
# 初始化布局边界框列表
# 初始化布局边界框列表
layout_bboxes
=
[]
layout_bboxes
=
[]
# 遍历每个子布局
# 遍历每个子布局
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
864e9535
from
magic_pdf.libs.boxbase
import
__is_overlaps_y_exceeds_threshold
,
get_minbox_if_overlap_by_ratio
from
loguru
import
logger
from
magic_pdf.libs.boxbase
import
__is_overlaps_y_exceeds_threshold
,
get_minbox_if_overlap_by_ratio
,
\
calculate_overlap_area_in_bbox1_area_ratio
# 删除重叠spans中较小的那些
# 删除重叠spans中较小的那些
...
@@ -14,6 +17,24 @@ def remove_overlaps_min_spans(spans):
...
@@ -14,6 +17,24 @@ def remove_overlaps_min_spans(spans):
return
spans
return
spans
# 将每一个line中的span从左到右排序
def line_sort_spans_by_left_to_right(lines):
    """Sort the spans of every line left to right and attach a line bbox.

    Args:
        lines: Iterable of lines; each line is a list of span dicts whose
            ``'bbox'`` entry is indexable as ``[x0, y0, x1, y1]``.

    Returns:
        A list with one ``{"bbox": [x0, y0, x1, y1], "spans": line}`` dict per
        non-empty input line, in input order. The bbox is the union of the
        bboxes of the line's spans.

    Note:
        Each line list is sorted in place, and that same list object is
        stored under ``"spans"``.
    """
    line_objects = []
    for line in lines:
        # A line without spans has no extent; min()/max() would raise
        # ValueError on it, so skip it instead of crashing the whole page.
        if not line:
            continue

        # Order the spans by their left edge (x0).
        line.sort(key=lambda span: span['bbox'][0])

        # The line bbox is the smallest box enclosing every span.
        line_bbox = [
            min(span['bbox'][0] for span in line),  # x0
            min(span['bbox'][1] for span in line),  # y0
            max(span['bbox'][2] for span in line),  # x1
            max(span['bbox'][3] for span in line),  # y1
        ]
        line_objects.append({
            "bbox": line_bbox,
            "spans": line,
        })
    return line_objects
def
merge_spans_to_line
(
spans
):
def
merge_spans_to_line
(
spans
):
# 按照y0坐标排序
# 按照y0坐标排序
spans
.
sort
(
key
=
lambda
span
:
span
[
'bbox'
][
1
])
spans
.
sort
(
key
=
lambda
span
:
span
[
'bbox'
][
1
])
...
@@ -23,7 +44,8 @@ def merge_spans_to_line(spans):
...
@@ -23,7 +44,8 @@ def merge_spans_to_line(spans):
for
span
in
spans
[
1
:]:
for
span
in
spans
[
1
:]:
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
# image和table类型,同上
# image和table类型,同上
if
span
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
or
any
(
s
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
for
s
in
current_line
):
if
span
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
or
any
(
s
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
for
s
in
current_line
):
# 则开始新行
# 则开始新行
lines
.
append
(
current_line
)
lines
.
append
(
current_line
)
current_line
=
[
span
]
current_line
=
[
span
]
...
@@ -41,20 +63,25 @@ def merge_spans_to_line(spans):
...
@@ -41,20 +63,25 @@ def merge_spans_to_line(spans):
if
current_line
:
if
current_line
:
lines
.
append
(
current_line
)
lines
.
append
(
current_line
)
# 计算每行的边界框,并对每行中的span按照x0进行排序
return
lines
line_objects
=
[]
for
line
in
lines
:
# 按照x0坐标排序
line
.
sort
(
key
=
lambda
span
:
span
[
'bbox'
][
0
])
line_bbox
=
[
min
(
span
[
'bbox'
][
0
]
for
span
in
line
),
# x0
min
(
span
[
'bbox'
][
1
]
for
span
in
line
),
# y0
max
(
span
[
'bbox'
][
2
]
for
span
in
line
),
# x1
max
(
span
[
'bbox'
][
3
]
for
span
in
line
),
# y1
]
line_objects
.
append
({
"bbox"
:
line_bbox
,
"spans"
:
line
,
})
return
line_objects
def merge_spans_to_line_by_layout(spans, layout_bboxes):
    """Merge spans into lines, one layout region at a time.

    Layouts are processed in the order given. For each layout, the spans
    whose overlap ratio with the layout bbox exceeds 0.8 are collected and
    merged into lines with ``merge_spans_to_line``. Finally every line is
    sorted left to right and wrapped with its bbox by
    ``line_sort_spans_by_left_to_right``.

    Args:
        spans: List of span dicts, each with a ``'bbox'`` entry.
        layout_bboxes: List of dicts, each with a ``'layout_bbox'`` entry.

    Returns:
        The list produced by ``line_sort_spans_by_left_to_right`` for all
        lines of all layouts, in layout order.

    Note:
        A span that passes the threshold for no layout is left out of the
        result; a span that passes it for several layouts appears once per
        such layout.
    """
    lines = []
    for item in layout_bboxes:
        layout_bbox = item['layout_bbox']

        # Pick the spans that belong to this layout.
        # NOTE(review): presumably the ratio is overlap area / span area
        # (bbox1 is the span) - confirm against magic_pdf.libs.boxbase.
        layout_spans = [
            span for span in spans
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.8
        ]

        # A layout may contain no spans. merge_spans_to_line iterates
        # spans[1:], so it presumably seeds from spans[0] and cannot take an
        # empty list - skip such layouts instead of calling it.
        if not layout_spans:
            continue

        lines.extend(merge_spans_to_line(layout_spans))

    # Sort the spans inside each line and compute the line bboxes.
    return line_sort_spans_by_left_to_right(lines)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment