Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
fa3475a4
Unverified
Commit
fa3475a4
authored
Aug 09, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Aug 09, 2024
Browse files
Merge pull request #386 from myhloli/master
feat(draw_bbox): add model bbox drawing functionality
parents
e7b0f8be
c90ee891
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
86 additions
and
2 deletions
+86
-2
magic_pdf/libs/draw_bbox.py
magic_pdf/libs/draw_bbox.py
+66
-1
magic_pdf/libs/ocr_content_type.py
magic_pdf/libs/ocr_content_type.py
+14
-0
magic_pdf/tools/cli_dev.py
magic_pdf/tools/cli_dev.py
+2
-0
magic_pdf/tools/common.py
magic_pdf/tools/common.py
+4
-1
No files found.
magic_pdf/libs/draw_bbox.py
View file @
fa3475a4
from
magic_pdf.libs.Constants
import
CROSS_PAGE
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
,
CategoryId
from
magic_pdf.model.magic_model
import
MagicModel
def
draw_bbox_without_number
(
i
,
bbox_list
,
page
,
rgb_config
,
fill_config
):
...
...
@@ -225,3 +226,67 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
# Save the PDF
pdf_docs
.
save
(
f
"
{
out_path
}
/spans.pdf"
)
def drow_model_bbox(model_list: list, pdf_bytes, out_path):
    """Draw the model's layout detections onto the PDF and save the result.

    The PDF in ``pdf_bytes`` is opened with ``fitz``. For every page index of
    ``model_list`` the ``"layout_dets"`` entries returned by
    ``MagicModel.get_model_list`` are grouped by their ``"category_id"``, and
    each group is then drawn on the matching page with its own colour. The
    annotated document is written to ``{out_path}/model.pdf``.

    Args:
        model_list: Per-page model output; only its length is used directly
            here, the per-page data is read back through ``MagicModel``.
        pdf_bytes: Raw bytes of the PDF to annotate.
        out_path: Directory prefix under which ``model.pdf`` is saved.
    """
    # One entry per drawn category, listed in drawing order, paired with the
    # RGB triple handed to draw_bbox_with_number for that category.
    draw_plan = (
        (CategoryId.Abandon, (158, 158, 158)),
        (CategoryId.TableBody, (204, 204, 0)),
        (CategoryId.TableCaption, (255, 255, 102)),
        (CategoryId.TableFootnote, (229, 255, 204)),
        (CategoryId.ImageBody, (153, 255, 51)),
        (CategoryId.ImageCaption, (102, 178, 255)),
        (CategoryId.Title, (102, 102, 255)),
        (CategoryId.Text, (153, 0, 76)),
        (CategoryId.InterlineEquation_YOLO, (0, 255, 0)),
    )

    pdf_docs = fitz.open("pdf", pdf_bytes)
    magic_model = MagicModel(model_list, pdf_docs)

    # category id -> list with one bbox list per processed page
    boxes_per_page = {category: [] for category, _ in draw_plan}

    for page_idx in range(len(model_list)):
        page_boxes = {category: [] for category in boxes_per_page}
        detections = magic_model.get_model_list(page_idx)["layout_dets"]
        for detection in detections:
            box = detection["bbox"]
            category_id = detection["category_id"]
            # Compare with == (rather than a dict lookup) so matching follows
            # plain equality; detections of any other category are ignored.
            for category, bucket in page_boxes.items():
                if category_id == category:
                    bucket.append(box)
                    break
        for category, bucket in page_boxes.items():
            boxes_per_page[category].append(bucket)

    for page_idx, page in enumerate(pdf_docs):
        for category, rgb in draw_plan:
            # A fresh colour list is passed on every call. The trailing True
            # is presumably the fill flag, as in draw_bbox_without_number's
            # fill_config parameter -- TODO confirm against the helper.
            draw_bbox_with_number(
                page_idx, boxes_per_page[category], page, list(rgb), True
            )

    # Save the PDF
    pdf_docs.save(f"{out_path}/model.pdf")
\ No newline at end of file
magic_pdf/libs/ocr_content_type.py
View file @
fa3475a4
...
...
@@ -19,3 +19,17 @@ class BlockType:
Footnote
=
"footnote"
Discarded
=
"discarded"
class CategoryId:
    """Integer ids matched against the ``"category_id"`` of a layout detection.

    The ids are plain ``int`` class attributes so they compare equal to the
    raw numbers stored in the model output.
    """

    # Ids 0-8 are contiguous, so they are numbered by position in this tuple.
    # Keep the order intact: moving a name changes its id.
    (
        Title,
        Text,
        Abandon,
        ImageBody,
        ImageCaption,
        TableBody,
        TableCaption,
        TableFootnote,
        InterlineEquation_Layout,
    ) = range(9)

    # Ids 9-12 are not defined here; the remaining ids start at 13.
    InlineEquation, InterlineEquation_YOLO, OcrText = 13, 14, 15
magic_pdf/tools/cli_dev.py
View file @
fa3475a4
...
...
@@ -94,6 +94,7 @@ def jsonl(jsonl, method, output_dir):
jso
[
"doc_layout_result"
],
method
,
f_dump_content_list
=
True
,
f_draw_model_bbox
=
True
,
)
...
...
@@ -146,6 +147,7 @@ def pdf(pdf, json_data, output_dir, method):
model_json_list
,
method
,
f_dump_content_list
=
True
,
f_draw_model_bbox
=
True
,
)
...
...
magic_pdf/tools/common.py
View file @
fa3475a4
...
...
@@ -4,7 +4,7 @@ import copy
import
click
from
loguru
import
logger
from
magic_pdf.libs.MakeContentConfig
import
DropMode
,
MakeMode
from
magic_pdf.libs.draw_bbox
import
draw_layout_bbox
,
draw_span_bbox
from
magic_pdf.libs.draw_bbox
import
draw_layout_bbox
,
draw_span_bbox
,
drow_model_bbox
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
from
magic_pdf.pipe.TXTPipe
import
TXTPipe
...
...
@@ -37,6 +37,7 @@ def do_parse(
f_dump_orig_pdf
=
True
,
f_dump_content_list
=
False
,
f_make_md_mode
=
MakeMode
.
MM_MD
,
f_draw_model_bbox
=
False
,
):
orig_model_list
=
copy
.
deepcopy
(
model_list
)
local_image_dir
,
local_md_dir
=
prepare_env
(
output_dir
,
pdf_file_name
,
parse_method
)
...
...
@@ -73,6 +74,8 @@ def do_parse(
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
if
f_draw_span_bbox
:
draw_span_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
)
if
f_draw_model_bbox
:
drow_model_bbox
(
orig_model_list
,
pdf_bytes
,
local_md_dir
)
md_content
=
pipe
.
pipe_mk_markdown
(
image_dir
,
drop_mode
=
DropMode
.
NONE
,
md_make_mode
=
f_make_md_mode
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment