Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
63969109
"docs/source/vscode:/vscode.git/clone" did not exist on "5898edba25ae7d281ad5d139d76251864eb80b8e"
Commit
63969109
authored
Mar 13, 2024
by
赵小蒙
Browse files
移动modify_y_axis在pipeline中的位置
parent
61405b8a
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
7 additions
and
10 deletions
+7
-10
magic_pdf/pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+1
-4
magic_pdf/pre_proc/ocr_detect_layout.py
magic_pdf/pre_proc/ocr_detect_layout.py
+1
-1
magic_pdf/pre_proc/ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+5
-5
No files found.
magic_pdf/pdf_parse_by_ocr.py
View file @
63969109
...
@@ -177,9 +177,6 @@ def parse_pdf_by_ocr(
...
@@ -177,9 +177,6 @@ def parse_pdf_by_ocr(
# 删除重叠spans中较小的那些
# 删除重叠spans中较小的那些
spans
=
remove_overlaps_min_spans
(
spans
)
spans
=
remove_overlaps_min_spans
(
spans
)
# 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
spans
=
modify_y_axis
(
spans
)
# 删除remove_span_block_bboxes中的bbox
# 删除remove_span_block_bboxes中的bbox
spans
=
remove_spans_by_bboxes
(
spans
,
need_remove_spans_bboxes
)
spans
=
remove_spans_by_bboxes
(
spans
,
need_remove_spans_bboxes
)
...
@@ -187,8 +184,8 @@ def parse_pdf_by_ocr(
...
@@ -187,8 +184,8 @@ def parse_pdf_by_ocr(
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
)
spans
=
cut_image_and_table
(
spans
,
page
,
page_id
,
book_name
,
save_path
)
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
# 模型识别错误的行间公式, type类型转换成行内公式
# 模型识别错误的行间公式, type类型转换成行内公式
spans
=
modify_y_axis
(
spans
)
# bbox去除粘连
# bbox去除粘连
spans
=
remove_overlap_between_bbox
(
spans
)
spans
=
remove_overlap_between_bbox
(
spans
)
...
...
magic_pdf/pre_proc/ocr_detect_layout.py
View file @
63969109
...
@@ -64,7 +64,7 @@ def adjust_layouts(layout_bboxes, page_boundry, page_id):
...
@@ -64,7 +64,7 @@ def adjust_layouts(layout_bboxes, page_boundry, page_id):
# 排序调整布局边界框列表
# 排序调整布局边界框列表
new_bboxes
=
[]
new_bboxes
=
[]
for
layout_bbox
in
layout_bboxes
:
for
layout_bbox
in
layout_bboxes
:
new_bboxes
.
append
([
layout_bbox
[
0
],
layout_bbox
[
1
],
layout_bbox
[
2
],
layout_bbox
[
3
],
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
])
new_bboxes
.
append
([
layout_bbox
[
0
],
layout_bbox
[
1
],
layout_bbox
[
2
],
layout_bbox
[
3
],
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
,
None
])
layout_bboxes
,
layout_tree
=
get_bboxes_layout
(
new_bboxes
,
page_boundry
,
page_id
)
layout_bboxes
,
layout_tree
=
get_bboxes_layout
(
new_bboxes
,
page_boundry
,
page_id
)
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
63969109
...
@@ -9,7 +9,7 @@ def remove_overlaps_min_spans(spans):
...
@@ -9,7 +9,7 @@ def remove_overlaps_min_spans(spans):
for
span1
in
spans
.
copy
():
for
span1
in
spans
.
copy
():
for
span2
in
spans
.
copy
():
for
span2
in
spans
.
copy
():
if
span1
!=
span2
:
if
span1
!=
span2
:
overlap_box
=
get_minbox_if_overlap_by_ratio
(
span1
[
'bbox'
],
span2
[
'bbox'
],
0.
5
)
overlap_box
=
get_minbox_if_overlap_by_ratio
(
span1
[
'bbox'
],
span2
[
'bbox'
],
0.
8
)
if
overlap_box
is
not
None
:
if
overlap_box
is
not
None
:
bbox_to_remove
=
next
((
span
for
span
in
spans
if
span
[
'bbox'
]
==
overlap_box
),
None
)
bbox_to_remove
=
next
((
span
for
span
in
spans
if
span
[
'bbox'
]
==
overlap_box
),
None
)
if
bbox_to_remove
is
not
None
:
if
bbox_to_remove
is
not
None
:
...
@@ -113,8 +113,8 @@ def modify_y_axis(spans: list):
...
@@ -113,8 +113,8 @@ def modify_y_axis(spans: list):
#用于给行间公式搜索
#用于给行间公式搜索
text_inline_lines
=
[]
text_inline_lines
=
[]
for
span
in
spans
[
1
:]:
for
span
in
spans
[
1
:]:
if
span
.
get
(
"content"
,
""
)
==
"78."
:
#
if span.get("content","") == "78.":
print
(
"debug"
)
#
print("debug")
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
# image和table类型,同上
# image和table类型,同上
if
span
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
or
any
(
if
span
[
'type'
]
in
[
"displayed_equation"
,
"image"
,
"table"
]
or
any
(
...
@@ -167,8 +167,8 @@ def modify_y_axis(spans: list):
...
@@ -167,8 +167,8 @@ def modify_y_axis(spans: list):
#错误行间公式转行内公式
#错误行间公式转行内公式
j
=
0
j
=
0
for
i
in
range
(
len
(
displayed_list
)):
for
i
in
range
(
len
(
displayed_list
)):
if
i
==
8
:
#
if i == 8:
print
(
"debug"
)
#
print("debug")
span
=
displayed_list
[
i
]
span
=
displayed_list
[
i
]
span_y0
,
span_y
=
span
[
"bbox"
][
1
],
span
[
"bbox"
][
3
]
span_y0
,
span_y
=
span
[
"bbox"
][
1
],
span
[
"bbox"
][
3
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment