Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
c4f252d3
"vscode:/vscode.git/clone" did not exist on "cc14e3f9eb80efa37267e46f83c79b744d7a1d75"
Unverified
Commit
c4f252d3
authored
Dec 19, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Dec 19, 2024
Browse files
Add files via upload
parent
35eb3bd2
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
58 additions
and
0 deletions
+58
-0
demo/demo.py
demo/demo.py
+58
-0
No files found.
demo/demo.py
0 → 100644
View file @
c4f252d3
# Copyright (c) Opendatalab. All rights reserved.
import os

from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod

# Demo script: read one PDF, run layout/model analysis on it, then write
# debug PDFs, a Markdown file and a content-list JSON into `local_md_dir`.

# args
pdf_file_name = "demo1.pdf"  # replace with the real pdf path
# Derive the output stem from the final path component only, and strip just
# the last extension. A plain split(".")[0] breaks on "./demo1.pdf" (empty
# stem), on "report.v2.pdf" (truncated stem) and on paths with directories
# (the directory would leak into every output file name).
name_without_suff = os.path.splitext(os.path.basename(pdf_file_name))[0]

# prepare env
local_image_dir, local_md_dir = "output/images", "output"
# Last component of the image directory ("images"); passed to the dump_*
# calls below — presumably used as the relative image path inside the
# generated documents (confirm against the magic_pdf API docs).
image_dir = os.path.basename(local_image_dir)

# Create both output directories explicitly so the script keeps working if
# the markdown directory is ever changed to not be the image dir's parent.
os.makedirs(local_image_dir, exist_ok=True)
os.makedirs(local_md_dir, exist_ok=True)

image_writer = FileBasedDataWriter(local_image_dir)
md_writer = FileBasedDataWriter(local_md_dir)

# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)

## inference
if ds.classify() == SupportedPdfParseMethod.OCR:
    # The dataset was classified as needing OCR.
    infer_result = ds.apply(doc_analyze, ocr=True)

    ## pipeline
    pipe_result = infer_result.pipe_ocr_mode(image_writer)
else:
    infer_result = ds.apply(doc_analyze, ocr=False)

    ## pipeline
    pipe_result = infer_result.pipe_txt_mode(image_writer)

### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))

### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))

### draw spans result on each page
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))

### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)

### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment