Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
a9dea5f0
Unverified
Commit
a9dea5f0
authored
Dec 19, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Dec 19, 2024
Browse files
Merge pull request #1330 from myhloli/dev
feat(demo): add demo script for PDF processing
parents
5eb9feee
d6a29162
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
58 additions
and
0 deletions
+58
-0
demo/demo.py
demo/demo.py
+58
-0
No files found.
demo/demo.py
0 → 100644
View file @
a9dea5f0
# Copyright (c) Opendatalab. All rights reserved.
"""Demo script: run the magic_pdf pipeline on a single PDF file.

The script reads ``pdf_file_name``, selects the OCR or text pipeline based on
``ds.classify()``, and writes the visualisation PDFs, the markdown file and
the content-list JSON under ``local_md_dir`` (extracted images are handed to
a writer rooted at ``local_image_dir``).
"""
import os

from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod

# args
pdf_file_name = "demo1.pdf"  # replace with the real pdf path
# Strip the directory part and only the final extension, so inputs such as
# "./docs/report.v2.pdf" yield "report.v2" rather than an empty or truncated
# name (a plain split(".")[0] would return "" for that path).
name_without_suff = os.path.splitext(os.path.basename(pdf_file_name))[0]

# prepare env
local_image_dir, local_md_dir = "output/images", "output"
# Directory name of the images folder, passed to the dump_* calls below.
image_dir = str(os.path.basename(local_image_dir))

# Create both output directories explicitly: the draw_* calls below receive
# paths inside local_md_dir, which is only implicitly created when it happens
# to be the parent of local_image_dir.
os.makedirs(local_image_dir, exist_ok=True)
os.makedirs(local_md_dir, exist_ok=True)

image_writer = FileBasedDataWriter(local_image_dir)
md_writer = FileBasedDataWriter(local_md_dir)

# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)

## inference
if ds.classify() == SupportedPdfParseMethod.OCR:
    infer_result = ds.apply(doc_analyze, ocr=True)

    ## pipeline
    pipe_result = infer_result.pipe_ocr_mode(image_writer)
else:
    infer_result = ds.apply(doc_analyze, ocr=False)

    ## pipeline
    pipe_result = infer_result.pipe_txt_mode(image_writer)

### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))

### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))

### draw spans result on each page
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))

### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)

### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment