Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
099f19f2
Unverified
Commit
099f19f2
authored
Nov 01, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Nov 01, 2024
Browse files
Merge pull request #834 from myhloli/dev
feat(pdf_parse): improve span filtering and add new block types
parents
73afb7d6
149132d6
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
49 additions
and
36 deletions
+49
-36
docs/output_file_en_us.md
docs/output_file_en_us.md
+3
-0
docs/output_file_zh_cn.md
docs/output_file_zh_cn.md
+8
-5
magic_pdf/libs/draw_bbox.py
magic_pdf/libs/draw_bbox.py
+2
-1
magic_pdf/libs/version.py
magic_pdf/libs/version.py
+1
-1
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+35
-29
No files found.
docs/output_file_en_us.md
View file @
099f19f2
...
@@ -175,11 +175,14 @@ Detailed explanation of second-level block types
...
@@ -175,11 +175,14 @@ Detailed explanation of second-level block types
| :----------------- | :--------------------- |
| :----------------- | :--------------------- |
| image_body | Main body of the image |
| image_body | Main body of the image |
| image_caption | Image description text |
| image_caption | Image description text |
| image_footnote | Image footnote |
| table_body | Main body of the table |
| table_body | Main body of the table |
| table_caption | Table description text |
| table_caption | Table description text |
| table_footnote | Table footnote |
| table_footnote | Table footnote |
| text | Text block |
| text | Text block |
| title | Title block |
| title | Title block |
| index | Index block |
| list | List block |
| interline_equation | Block formula |
| interline_equation | Block formula |
<br>
<br>
...
...
docs/output_file_zh_cn.md
View file @
099f19f2
...
@@ -174,12 +174,15 @@ poly 坐标的格式 \[x0, y0, x1, y1, x2, y2, x3, y3\], 分别表示左上、
...
@@ -174,12 +174,15 @@ poly 坐标的格式 \[x0, y0, x1, y1, x2, y2, x3, y3\], 分别表示左上、
| :----------------- | :------------- |
| :----------------- | :------------- |
| image_body | 图像的本体 |
| image_body | 图像的本体 |
| image_caption | 图像的描述文本 |
| image_caption | 图像的描述文本 |
| table_body | 表格本体 |
| image_footnote | 图像的脚注 |
| table_body | 表格本体 |
| table_caption | 表格的描述文本 |
| table_caption | 表格的描述文本 |
| table_footnote | 表格的脚注 |
| table_footnote | 表格的脚注 |
| text | 文本块 |
| text | 文本块 |
| title | 标题块 |
| title | 标题块 |
| interline_equation | 行间公式块 |
| index | 目录块 |
| list | 列表块 |
| interline_equation | 行间公式块 |
<br>
<br>
...
...
magic_pdf/libs/draw_bbox.py
View file @
099f19f2
...
@@ -249,7 +249,8 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
...
@@ -249,7 +249,8 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
page_dropped_list
.
append
(
span
[
'bbox'
])
page_dropped_list
.
append
(
span
[
'bbox'
])
dropped_list
.
append
(
page_dropped_list
)
dropped_list
.
append
(
page_dropped_list
)
# 构造其余useful_list
# 构造其余useful_list
for
block
in
page
[
'para_blocks'
]:
# for block in page['para_blocks']: # span直接用分段合并前的结果就可以
for
block
in
page
[
'preproc_blocks'
]:
if
block
[
'type'
]
in
[
if
block
[
'type'
]
in
[
BlockType
.
Text
,
BlockType
.
Text
,
BlockType
.
Title
,
BlockType
.
Title
,
...
...
magic_pdf/libs/version.py
View file @
099f19f2
__version__
=
"0.
8
.0"
__version__
=
"0.
9
.0"
magic_pdf/pdf_parse_union_core_v2.py
View file @
099f19f2
...
@@ -382,39 +382,44 @@ def revert_group_blocks(blocks):
...
@@ -382,39 +382,44 @@ def revert_group_blocks(blocks):
return
new_blocks
return
new_blocks
def
remove_outside_spans
(
spans
,
all_bboxes
):
def
remove_outside_spans
(
spans
,
all_bboxes
,
all_discarded_blocks
):
image_bboxes
=
[]
def
get_block_bboxes
(
blocks
,
block_type_list
):
table_bboxes
=
[
]
return
[
block
[
0
:
4
]
for
block
in
blocks
if
block
[
7
]
in
block_type_list
]
other_block_bboxes
=
[]
for
block
in
all_bboxes
:
image_bboxes
=
get_block_bboxes
(
all_bboxes
,
[
BlockType
.
ImageBody
])
block_type
=
block
[
7
]
table_bboxes
=
get_block_bboxes
(
all_bboxes
,
[
BlockType
.
TableBody
])
block_bbox
=
block
[
0
:
4
]
other_block_type
=
[
]
for
block_type
in
BlockType
.
__dict__
.
values
():
if
block_type
==
BlockType
.
ImageBody
:
if
not
isinstance
(
block_type
,
str
)
:
image_bboxes
.
append
(
block_bbox
)
continue
el
if
block_type
==
BlockType
.
TableBody
:
if
block_type
not
in
[
BlockType
.
ImageBody
,
BlockType
.
TableBody
]
:
table_bboxes
.
append
(
block_
bbox
)
other_block_type
.
append
(
block_
type
)
else
:
other_block_bboxes
=
get_block_bboxes
(
all_bboxes
,
other_block_type
)
other_block_bboxes
.
append
(
block_bbox
)
discarded_block_bboxes
=
get_block_bboxes
(
all_discarded_blocks
,
[
BlockType
.
Discarded
]
)
new_spans
=
[]
new_spans
=
[]
for
span
in
spans
:
for
span
in
spans
:
if
span
[
'type'
]
==
ContentType
.
Image
:
span_bbox
=
span
[
'bbox'
]
for
block_bbox
in
image_bboxes
:
span_type
=
span
[
'type'
]
if
calculate_overlap_area_in_bbox1_area_ratio
(
span
[
'bbox'
],
block_bbox
)
>
0.5
:
new_spans
.
append
(
span
)
if
any
(
calculate_overlap_area_in_bbox1_area_ratio
(
span_bbox
,
block_bbox
)
>
0.4
for
block_bbox
in
break
discarded_block_bboxes
):
elif
span
[
'type'
]
==
ContentType
.
Table
:
new_spans
.
append
(
span
)
for
block_bbox
in
table_bboxes
:
continue
if
calculate_overlap_area_in_bbox1_area_ratio
(
span
[
'bbox'
],
block_bbox
)
>
0.5
:
new_spans
.
append
(
span
)
if
span_type
==
ContentType
.
Image
:
break
if
any
(
calculate_overlap_area_in_bbox1_area_ratio
(
span_bbox
,
block_bbox
)
>
0.5
for
block_bbox
in
image_bboxes
):
new_spans
.
append
(
span
)
elif
span_type
==
ContentType
.
Table
:
if
any
(
calculate_overlap_area_in_bbox1_area_ratio
(
span_bbox
,
block_bbox
)
>
0.5
for
block_bbox
in
table_bboxes
):
new_spans
.
append
(
span
)
else
:
else
:
for
block_bbox
in
other_block_bboxes
:
if
any
(
calculate_overlap_area_in_bbox1_area_ratio
(
span_bbox
,
block_bbox
)
>
0.5
for
block_bbox
in
if
calculate_overlap_area_in_bbox1_area_ratio
(
span
[
'bbox'
],
block_bbox
)
>
0.5
:
other_block_bboxes
):
new_spans
.
append
(
span
)
new_spans
.
append
(
span
)
break
return
new_spans
return
new_spans
...
@@ -488,7 +493,8 @@ def parse_page_core(
...
@@ -488,7 +493,8 @@ def parse_page_core(
raise
Exception
(
'parse_mode must be txt or ocr'
)
raise
Exception
(
'parse_mode must be txt or ocr'
)
"""在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span"""
"""在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span"""
spans
=
remove_outside_spans
(
spans
,
all_bboxes
)
"""顺便删除大水印并保留abandon的span"""
spans
=
remove_outside_spans
(
spans
,
all_bboxes
,
all_discarded_blocks
)
"""删除重叠spans中置信度较低的那些"""
"""删除重叠spans中置信度较低的那些"""
spans
,
dropped_spans_by_confidence
=
remove_overlaps_low_confidence_spans
(
spans
)
spans
,
dropped_spans_by_confidence
=
remove_overlaps_low_confidence_spans
(
spans
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment