Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
1ab691fc
Unverified
Commit
1ab691fc
authored
Nov 26, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Nov 26, 2024
Browse files
Merge pull request #1095 from myhloli/dev
feat(pdf_parse): improve text extraction for vertical spans
parents
026c23eb
81635062
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
45 additions
and
3 deletions
+45
-3
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+45
-3
No files found.
magic_pdf/pdf_parse_union_core_v2.py
View file @
1ab691fc
...
@@ -164,28 +164,70 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
...
@@ -164,28 +164,70 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
def
txt_spans_extract_v2
(
pdf_page
,
spans
,
all_bboxes
,
all_discarded_blocks
,
lang
):
def
txt_spans_extract_v2
(
pdf_page
,
spans
,
all_bboxes
,
all_discarded_blocks
,
lang
):
text_blocks
=
pdf_page
.
get_text
(
'rawdict'
,
flags
=
fitz
.
TEXTFLAGS_TEXT
)[
'blocks'
]
text_blocks
_raw
=
pdf_page
.
get_text
(
'rawdict'
,
flags
=
fitz
.
TEXTFLAGS_TEXT
)[
'blocks'
]
# @todo: 拿到char之后把倾斜角度较大的先删一遍
# @todo: 拿到char之后把倾斜角度较大的先删一遍
all_pymu_chars
=
[]
all_pymu_chars
=
[]
for
block
in
text_blocks
:
for
block
in
text_blocks
_raw
:
for
line
in
block
[
'lines'
]:
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
for
span
in
line
[
'spans'
]:
all_pymu_chars
.
extend
(
span
[
'chars'
])
all_pymu_chars
.
extend
(
span
[
'chars'
])
# 计算所有span的高度的中位数
span_height_list
=
[]
for
span
in
spans
:
if
span
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]:
continue
span_height
=
span
[
'bbox'
][
3
]
-
span
[
'bbox'
][
1
]
span
[
'height'
]
=
span_height
span_height_list
.
append
(
span_height
)
if
len
(
span_height_list
)
==
0
:
return
spans
else
:
median_span_height
=
statistics
.
median
(
span_height_list
)
useful_spans
=
[]
useful_spans
=
[]
unuseful_spans
=
[]
unuseful_spans
=
[]
# 纵向span的两个特征:1. 高度超过多个line 2. 高宽比超过某个值
vertical_spans
=
[]
for
span
in
spans
:
for
span
in
spans
:
if
span
[
'type'
]
in
[
ContentType
.
InterlineEquation
,
ContentType
.
Image
,
ContentType
.
Table
]:
continue
for
block
in
all_bboxes
+
all_discarded_blocks
:
for
block
in
all_bboxes
+
all_discarded_blocks
:
if
block
[
7
]
in
[
BlockType
.
ImageBody
,
BlockType
.
TableBody
,
BlockType
.
InterlineEquation
]:
if
block
[
7
]
in
[
BlockType
.
ImageBody
,
BlockType
.
TableBody
,
BlockType
.
InterlineEquation
]:
continue
continue
if
calculate_overlap_area_in_bbox1_area_ratio
(
span
[
'bbox'
],
block
[
0
:
4
])
>
0.5
:
if
calculate_overlap_area_in_bbox1_area_ratio
(
span
[
'bbox'
],
block
[
0
:
4
])
>
0.5
:
if
block
in
all_bboxes
:
if
span
[
'height'
]
>
median_span_height
*
3
and
span
[
'height'
]
>
(
span
[
'bbox'
][
2
]
-
span
[
'bbox'
][
0
])
*
3
:
vertical_spans
.
append
(
span
)
elif
block
in
all_bboxes
:
useful_spans
.
append
(
span
)
useful_spans
.
append
(
span
)
else
:
else
:
unuseful_spans
.
append
(
span
)
unuseful_spans
.
append
(
span
)
del
span
[
'height'
]
break
break
"""垂直的span框直接用pymu的line进行填充"""
if
len
(
vertical_spans
)
>
0
:
text_blocks
=
pdf_page
.
get_text
(
'dict'
,
flags
=
fitz
.
TEXTFLAGS_TEXT
)[
'blocks'
]
all_pymu_lines
=
[]
for
block
in
text_blocks
:
for
line
in
block
[
'lines'
]:
all_pymu_lines
.
append
(
line
)
for
pymu_line
in
all_pymu_lines
:
for
span
in
vertical_spans
:
if
calculate_overlap_area_in_bbox1_area_ratio
(
pymu_line
[
'bbox'
],
span
[
'bbox'
])
>
0.5
:
for
pymu_span
in
pymu_line
[
'spans'
]:
span
[
'content'
]
+=
pymu_span
[
'text'
]
break
for
span
in
vertical_spans
:
if
len
(
span
[
'content'
])
==
0
:
spans
.
remove
(
span
)
"""水平的span框如果没有char则用ocr进行填充"""
new_spans
=
[]
new_spans
=
[]
for
span
in
useful_spans
+
unuseful_spans
:
for
span
in
useful_spans
+
unuseful_spans
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment