Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
40433aed
"mmdet3d/ops/iou3d/vscode:/vscode.git/clone" did not exist on "ee5667c1ff7bbc0f4edfd661b033b61fc6dbfbec"
Commit
40433aed
authored
May 07, 2024
by
赵小蒙
Browse files
fix cross page span drawing bbox logic
parent
c8ab7913
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
20 additions
and
2 deletions
+20
-2
magic_pdf/libs/draw_bbox.py
magic_pdf/libs/draw_bbox.py
+20
-2
No files found.
magic_pdf/libs/draw_bbox.py
View file @
40433aed
from
magic_pdf.libs.Constants
import
CROSS_PAGE
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
...
...
@@ -148,6 +149,8 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
image_list
=
[]
table_list
=
[]
dropped_list
=
[]
next_page_text_list
=
[]
next_page_inline_equation_list
=
[]
for
page
in
pdf_info
:
page_text_list
=
[]
page_inline_equation_list
=
[]
...
...
@@ -155,6 +158,15 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
page_image_list
=
[]
page_table_list
=
[]
page_dropped_list
=
[]
# Carry spans flagged as cross-page on the previous page over into this page's lists
if
len
(
next_page_text_list
)
>
0
:
page_text_list
.
extend
(
next_page_text_list
)
next_page_text_list
=
[]
if
len
(
next_page_inline_equation_list
)
>
0
:
page_inline_equation_list
.
extend
(
next_page_inline_equation_list
)
next_page_inline_equation_list
=
[]
# 构造dropped_list
for
block
in
page
[
"discarded_blocks"
]:
if
block
[
"type"
]
==
BlockType
.
Discarded
:
...
...
@@ -172,8 +184,14 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
for
line
in
block
[
"lines"
]:
for
span
in
line
[
"spans"
]:
if
span
[
"type"
]
==
ContentType
.
Text
:
if
span
.
get
(
CROSS_PAGE
,
False
):
next_page_text_list
.
append
(
span
[
"bbox"
])
else
:
page_text_list
.
append
(
span
[
"bbox"
])
elif
span
[
"type"
]
==
ContentType
.
InlineEquation
:
if
span
.
get
(
CROSS_PAGE
,
False
):
next_page_inline_equation_list
.
append
(
span
[
"bbox"
])
else
:
page_inline_equation_list
.
append
(
span
[
"bbox"
])
elif
span
[
"type"
]
==
ContentType
.
InterlineEquation
:
page_interline_equation_list
.
append
(
span
[
"bbox"
])
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment