Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
0fb9619a
Unverified
Commit
0fb9619a
authored
May 07, 2024
by
Kaiwen Liu
Committed by
GitHub
May 07, 2024
Browse files
Merge branch 'magicpdf:master' into master
parents
8c3a37ff
eebd9767
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
67 additions
and
12 deletions
+67
-12
magic_pdf/pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+2
-3
magic_pdf/pdf_parse_union_core.py
magic_pdf/pdf_parse_union_core.py
+1
-2
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+3
-0
magic_pdf/pre_proc/ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+2
-2
magic_pdf/pre_proc/remove_bbox_overlap.py
magic_pdf/pre_proc/remove_bbox_overlap.py
+59
-5
No files found.
magic_pdf/pdf_parse_by_ocr.py
View file @
0fb9619a
...
...
@@ -24,7 +24,7 @@ from magic_pdf.pre_proc.ocr_dict_merge import (
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_spans_by_bboxes
,
remove_overlaps_min_spans
,
\
adjust_bbox_for_standalone_block
,
modify_y_axis
,
modify_inline_equation
,
get_qa_need_list
,
\
remove_spans_by_bboxes_dict
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
_for_span
def
parse_pdf_by_ocr
(
...
...
@@ -158,8 +158,7 @@ def parse_pdf_by_ocr(
spans
=
modify_inline_equation
(
spans
,
displayed_list
,
text_inline_lines
)
'''bbox去除粘连'''
spans
=
remove_overlap_between_bbox
(
spans
)
spans
=
remove_overlap_between_bbox_for_span
(
spans
)
'''
对tpye=["interline_equation", "image", "table"]进行额外处理,
如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
...
...
magic_pdf/pdf_parse_union_core.py
View file @
0fb9619a
...
...
@@ -20,7 +20,6 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo
from
magic_pdf.pre_proc.ocr_dict_merge
import
sort_blocks_by_layout
,
fill_spans_in_blocks
,
fix_block_spans
,
\
fix_discarded_block
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_overlaps_min_spans
,
get_qa_need_list_v2
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
...
...
@@ -98,7 +97,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
img_blocks
=
magic_model
.
get_imgs
(
page_id
)
table_blocks
=
magic_model
.
get_tables
(
page_id
)
discarded_blocks
=
magic_model
.
get_discarded
(
page_id
)
text_blocks
=
remove_overlap_between_bbox
(
magic_model
.
get_text_blocks
(
page_id
)
)
text_blocks
=
magic_model
.
get_text_blocks
(
page_id
)
title_blocks
=
magic_model
.
get_title_blocks
(
page_id
)
inline_equations
,
interline_equations
,
interline_equation_blocks
=
magic_model
.
get_equations
(
page_id
)
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
0fb9619a
...
...
@@ -2,6 +2,7 @@ from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_ove
calculate_iou
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.ocr_content_type
import
BlockType
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox_for_block
def
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
...
...
@@ -35,6 +36,8 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
all_bboxes
=
remove_need_drop_blocks
(
all_bboxes
,
discarded_blocks
)
'''经过以上处理后,还存在大框套小框的情况,则删除小框'''
all_bboxes
=
remove_overlaps_min_blocks
(
all_bboxes
)
'''将剩余的bbox做分离处理,防止后面分layout时出错'''
all_bboxes
=
remove_overlap_between_bbox_for_block
(
all_bboxes
)
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
for
discarded
in
discarded_blocks
:
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
0fb9619a
...
...
@@ -5,7 +5,7 @@ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox
from
magic_pdf.libs.drop_tag
import
DropTag
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
from
magic_pdf.pre_proc.ocr_span_list_modify
import
modify_y_axis
,
modify_inline_equation
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox
_for_span
# 将每一个line中的span从左到右排序
...
...
@@ -168,7 +168,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
block_spans
=
modify_inline_equation
(
block_spans
,
displayed_list
,
text_inline_lines
)
'''bbox去除粘连'''
# 去粘连会影响span的bbox,导致后续fill的时候出错
# block_spans = remove_overlap_between_bbox(block_spans)
# block_spans = remove_overlap_between_bbox
_for_span
(block_spans)
block_dict
[
'spans'
]
=
block_spans
block_with_spans
.
append
(
block_dict
)
...
...
magic_pdf/pre_proc/remove_bbox_overlap.py
View file @
0fb9619a
from
magic_pdf.libs.boxbase
import
_is_in_or_part_overlap
,
_is_in
def
_remove_overlap_between_bbox
(
spans
):
def
_remove_overlap_between_bbox
_for_span
(
spans
):
res
=
[]
keeps
=
[
True
]
*
len
(
spans
)
...
...
@@ -17,7 +17,7 @@ def _remove_overlap_between_bbox(spans):
continue
for
i
in
range
(
len
(
res
)):
if
_is_in
(
v
[
"bbox"
],
res
[
i
][
"bbox"
]):
if
_is_in
(
v
[
"bbox"
],
res
[
i
][
"bbox"
]):
continue
if
_is_in_or_part_overlap
(
res
[
i
][
"bbox"
],
v
[
"bbox"
]):
ix0
,
iy0
,
ix1
,
iy1
=
res
[
i
][
"bbox"
]
...
...
@@ -34,7 +34,7 @@ def _remove_overlap_between_bbox(spans):
else
:
mid
=
(
ix0
+
x1
)
//
2
ix0
=
max
(
mid
+
0.25
,
ix0
)
x1
=
min
(
mid
-
0.25
,
x1
)
x1
=
min
(
mid
-
0.25
,
x1
)
else
:
if
y1
>=
iy1
:
mid
=
(
y0
+
iy1
)
//
2
...
...
@@ -51,5 +51,59 @@ def _remove_overlap_between_bbox(spans):
return
res
def remove_overlap_between_bbox(spans):
    """Public wrapper that forwards ``spans`` to ``_remove_overlap_between_bbox``.

    The argument is passed through untouched and the helper's return value
    is handed back to the caller as-is.
    """
    separated = _remove_overlap_between_bbox(spans)
    return separated
def _remove_overlap_between_bbox_for_block(all_bboxes):
    """Drop contained boxes, then pull apart the remaining overlapping ones.

    Only the first four items of every entry are read and rewritten; they are
    unpacked as ``x0, y0, x1, y1``.  Any further items are left untouched.

    The work is done in two passes:

    1. An entry is discarded when ``_is_in`` reports its box as lying inside
       the box of *any other* entry.  Every entry is tested against the
       original coordinates, before anything is adjusted.
       NOTE(review): if ``_is_in`` is true in both directions for a pair
       (presumably the case for identical boxes -- confirm), both entries
       are discarded.
    2. Each surviving entry is compared with the entries already accepted.
       When ``_is_in_or_part_overlap`` reports a collision, the pair is cut
       at the floor-divided midpoint of the colliding edges and each side is
       moved 0.25 away from that cut, along x when the overlap is taller
       than it is wide and along y otherwise.

    Args:
        all_bboxes: sequence of mutable sequences (slice assignment is used)
            with at least four leading coordinate items each.

    Returns:
        A new list holding the surviving entries in their original order.
        The entries are the same objects as in ``all_bboxes``; their first
        four items are modified in place.
    """
    # Pass 1: decide which entries survive before any coordinate is changed.
    kept_flags = [
        not any(
            _is_in(box[:4], other[:4])
            for other_pos, other in enumerate(all_bboxes)
            if other_pos != pos
        )
        for pos, box in enumerate(all_bboxes)
    ]

    # Pass 2: separate every survivor from the entries accepted before it.
    separated = []
    for box, keep in zip(all_bboxes, kept_flags):
        if not keep:
            continue

        for prior in separated:
            if _is_in(box[:4], prior[:4]):
                continue
            if not _is_in_or_part_overlap(prior[:4], box[:4]):
                continue

            px0, py0, px1, py1 = prior[:4]
            bx0, by0, bx1, by1 = box[:4]

            overlap_w = min(bx1, px1) - max(bx0, px0)
            overlap_h = min(by1, py1) - max(by0, py0)

            if overlap_h > overlap_w:
                # Taller than wide: separate the pair along the x axis.
                if bx1 >= px1:
                    cut = (bx0 + px1) // 2
                    px1 = min(cut - 0.25, px1)
                    bx0 = max(cut + 0.25, bx0)
                else:
                    cut = (px0 + bx1) // 2
                    px0 = max(cut + 0.25, px0)
                    bx1 = min(cut - 0.25, bx1)
            elif by1 >= py1:
                # Otherwise separate along the y axis.
                cut = (by0 + py1) // 2
                by0 = max(cut + 0.25, by0)
                py1 = min(py1, cut - 0.25)
            else:
                cut = (py0 + by1) // 2
                by1 = min(by1, cut - 0.25)
                py0 = max(cut + 0.25, py0)

            # Write back in place so later comparisons see the trimmed boxes.
            prior[:4] = [px0, py0, px1, py1]
            box[:4] = [bx0, by0, bx1, by1]

        separated.append(box)
    return separated
def remove_overlap_between_bbox_for_span(spans):
    """Public wrapper around ``_remove_overlap_between_bbox_for_span``.

    ``spans`` is forwarded unchanged and the helper's result is returned
    without further processing.
    """
    separated_spans = _remove_overlap_between_bbox_for_span(spans)
    return separated_spans
def remove_overlap_between_bbox_for_block(all_bboxes):
    """Public wrapper around ``_remove_overlap_between_bbox_for_block``.

    ``all_bboxes`` is forwarded unchanged and the helper's result is
    returned without further processing.
    """
    separated_blocks = _remove_overlap_between_bbox_for_block(all_bboxes)
    return separated_blocks
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment