Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
40433aed
Commit
40433aed
authored
May 07, 2024
by
赵小蒙
Browse files
fix cross page span drawing bbox logic
parent
c8ab7913
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
20 additions
and
2 deletions
+20
-2
magic_pdf/libs/draw_bbox.py
magic_pdf/libs/draw_bbox.py
+20
-2
No files found.
magic_pdf/libs/draw_bbox.py
View file @
40433aed
from
magic_pdf.libs.Constants
import
CROSS_PAGE
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
...
@@ -148,6 +149,8 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
...
@@ -148,6 +149,8 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
image_list
=
[]
image_list
=
[]
table_list
=
[]
table_list
=
[]
dropped_list
=
[]
dropped_list
=
[]
next_page_text_list
=
[]
next_page_inline_equation_list
=
[]
for
page
in
pdf_info
:
for
page
in
pdf_info
:
page_text_list
=
[]
page_text_list
=
[]
page_inline_equation_list
=
[]
page_inline_equation_list
=
[]
...
@@ -155,6 +158,15 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
...
@@ -155,6 +158,15 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
page_image_list
=
[]
page_image_list
=
[]
page_table_list
=
[]
page_table_list
=
[]
page_dropped_list
=
[]
page_dropped_list
=
[]
# 将跨页的span放到移动到下一页的列表中
if
len
(
next_page_text_list
)
>
0
:
page_text_list
.
extend
(
next_page_text_list
)
next_page_text_list
=
[]
if
len
(
next_page_inline_equation_list
)
>
0
:
page_inline_equation_list
.
extend
(
next_page_inline_equation_list
)
next_page_inline_equation_list
=
[]
# 构造dropped_list
# 构造dropped_list
for
block
in
page
[
"discarded_blocks"
]:
for
block
in
page
[
"discarded_blocks"
]:
if
block
[
"type"
]
==
BlockType
.
Discarded
:
if
block
[
"type"
]
==
BlockType
.
Discarded
:
...
@@ -172,9 +184,15 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
...
@@ -172,9 +184,15 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
for
line
in
block
[
"lines"
]:
for
line
in
block
[
"lines"
]:
for
span
in
line
[
"spans"
]:
for
span
in
line
[
"spans"
]:
if
span
[
"type"
]
==
ContentType
.
Text
:
if
span
[
"type"
]
==
ContentType
.
Text
:
page_text_list
.
append
(
span
[
"bbox"
])
if
span
.
get
(
CROSS_PAGE
,
False
):
next_page_text_list
.
append
(
span
[
"bbox"
])
else
:
page_text_list
.
append
(
span
[
"bbox"
])
elif
span
[
"type"
]
==
ContentType
.
InlineEquation
:
elif
span
[
"type"
]
==
ContentType
.
InlineEquation
:
page_inline_equation_list
.
append
(
span
[
"bbox"
])
if
span
.
get
(
CROSS_PAGE
,
False
):
next_page_inline_equation_list
.
append
(
span
[
"bbox"
])
else
:
page_inline_equation_list
.
append
(
span
[
"bbox"
])
elif
span
[
"type"
]
==
ContentType
.
InterlineEquation
:
elif
span
[
"type"
]
==
ContentType
.
InterlineEquation
:
page_interline_equation_list
.
append
(
span
[
"bbox"
])
page_interline_equation_list
.
append
(
span
[
"bbox"
])
elif
span
[
"type"
]
==
ContentType
.
Image
:
elif
span
[
"type"
]
==
ContentType
.
Image
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment