Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
91defbb0
Commit
91defbb0
authored
Jun 12, 2025
by
myhloli
Browse files
feat: enhance PDF parsing functionality with new backend options and improved output handling
parent
ae9fd9ad
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
234 additions
and
90 deletions
+234
-90
demo/batch_demo.py
demo/batch_demo.py
+0
-23
demo/demo.py
demo/demo.py
+232
-66
mineru/cli/client.py
mineru/cli/client.py
+2
-1
No files found.
demo/batch_demo.py
deleted
100644 → 0
View file @
ae9fd9ad
import
os
from
pathlib
import
Path
from
magic_pdf.data.batch_build_dataset
import
batch_build_dataset
from
magic_pdf.tools.common
import
batch_do_parse
def batch(pdf_dir, output_dir, method, lang):
    """Batch-parse every ``.pdf`` file found in *pdf_dir*.

    Args:
        pdf_dir: Directory scanned (non-recursively) for ``.pdf`` files.
        output_dir: Directory where parsing results are written; created
            if it does not exist.
        method: Parsing method forwarded to ``batch_do_parse``
            (e.g. ``'auto'``).
        lang: Language hint forwarded to ``batch_build_dataset``; may be
            an empty string.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Collect only the PDF documents from the input directory.
    doc_paths = [p for p in Path(pdf_dir).glob('*') if p.suffix == '.pdf']

    # Build the dataset with 4 worker processes/threads (second argument).
    datasets = batch_build_dataset(doc_paths, 4, lang)
    # os.environ["MINERU_MIN_BATCH_INFERENCE_SIZE"] = "200" # every 200 pages will be parsed in one batch
    # Output names are the file stems (file name without extension).
    batch_do_parse(output_dir, [str(p.stem) for p in doc_paths], datasets, method)
if __name__ == '__main__':
    # Demo entry point: parse all PDFs in ./pdfs into ./output using the
    # 'auto' method; the empty string lets the pipeline pick the language.
    batch("pdfs", "output", "auto", "")
demo/demo.py
View file @
91defbb0
# Copyright (c) Opendatalab. All rights reserved.
# Demo script: parse a single PDF with the magic_pdf pipeline and dump
# layout/span visualizations, markdown, content list, and middle JSON.
import copy
import json
import os

from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod

# args
__dir__ = os.path.dirname(os.path.abspath(__file__))
pdf_file_name = os.path.join(__dir__, "pdfs", "demo1.pdf")  # replace with the real pdf path
# NOTE(review): split('.')[0] truncates at the FIRST dot, so a name like
# "a.b.pdf" becomes "a" — acceptable for the demo fixture names used here.
name_without_extension = os.path.basename(pdf_file_name).split('.')[0]

# prepare env
local_image_dir = os.path.join(__dir__, "output", name_without_extension, "images")
local_md_dir = os.path.join(__dir__, "output", name_without_extension)
# Only the directory's base name is embedded in the markdown image links.
image_dir = str(os.path.basename(local_image_dir))

os.makedirs(local_image_dir, exist_ok=True)

image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)

# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)

## inference
# classify() decides whether the PDF needs OCR (image-based) or can be
# parsed as text; the two branches differ only in the ocr flag and the
# downstream pipeline mode.
if ds.classify() == SupportedPdfParseMethod.OCR:
    infer_result = ds.apply(doc_analyze, ocr=True)

    ## pipeline
    pipe_result = infer_result.pipe_ocr_mode(image_writer)

else:
    infer_result = ds.apply(doc_analyze, ocr=False)

    ## pipeline
    pipe_result = infer_result.pipe_txt_mode(image_writer)

### get model inference result
model_inference_result = infer_result.get_infer_res()

### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_extension}_layout.pdf"))

### draw spans result on each page
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_extension}_spans.pdf"))

### get markdown content
md_content = pipe_result.get_markdown(image_dir)

### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_extension}.md", image_dir)

### get content list content
content_list_content = pipe_result.get_content_list(image_dir)

### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_extension}_content_list.json", image_dir)

### get middle json
middle_json_content = pipe_result.get_middle_json()

### dump middle json
pipe_result.dump_middle_json(md_writer, f'{name_without_extension}_middle.json')
from
pathlib
import
Path
from
loguru
import
logger
from
mineru.cli.common
import
convert_pdf_bytes_to_bytes_by_pypdfium2
,
prepare_env
,
read_fn
from
mineru.data.data_reader_writer
import
FileBasedDataWriter
from
mineru.utils.draw_bbox
import
draw_layout_bbox
,
draw_span_bbox
from
mineru.utils.enum_class
import
MakeMode
from
mineru.backend.vlm.vlm_analyze
import
doc_analyze
as
vlm_doc_analyze
from
mineru.backend.pipeline.pipeline_analyze
import
doc_analyze
as
pipeline_doc_analyze
from
mineru.backend.pipeline.pipeline_middle_json_mkcontent
import
union_make
as
pipeline_union_make
from
mineru.backend.pipeline.model_json_to_middle_json
import
result_to_middle_json
as
pipeline_result_to_middle_json
from
mineru.backend.vlm.vlm_middle_json_mkcontent
import
union_make
as
vlm_union_make
from
mineru.utils.models_download_utils
import
auto_download_and_get_model_root_path
def do_parse(
    output_dir,  # Output directory for storing parsing results
    pdf_file_names: list[str],  # List of PDF file names to be parsed
    pdf_bytes_list: list[bytes],  # List of PDF bytes to be parsed
    p_lang_list: list[str],  # List of languages for each PDF, default is 'ch' (Chinese)
    backend="pipeline",  # The backend for parsing PDF, default is 'pipeline'
    parse_method="auto",  # The method for parsing PDF, default is 'auto'
    p_formula_enable=True,  # Enable formula parsing
    p_table_enable=True,  # Enable table parsing
    server_url=None,  # Server URL for vlm-sglang-client backend
    f_draw_layout_bbox=True,  # Whether to draw layout bounding boxes
    f_draw_span_bbox=True,  # Whether to draw span bounding boxes
    f_dump_md=True,  # Whether to dump markdown files
    f_dump_middle_json=True,  # Whether to dump middle JSON files
    f_dump_model_output=True,  # Whether to dump model output files
    f_dump_orig_pdf=True,  # Whether to dump original PDF files
    f_dump_content_list=True,  # Whether to dump content list files
    f_make_md_mode=MakeMode.MM_MD,  # The mode for making markdown content, default is MM_MD
    start_page_id=0,  # Start page ID for parsing, default is 0
    end_page_id=None,  # End page ID for parsing, default is None (parse all pages until the end of the document)
):
    """Parse a batch of PDFs and dump the requested artifacts per document.

    Two code paths, chosen by *backend*:

    * ``"pipeline"`` — all documents are analyzed in one batched call to
      ``pipeline_doc_analyze``, then artifacts are written per document.
    * anything else (``"vlm-*"`` prefixes are stripped) — each document is
      analyzed individually via ``vlm_doc_analyze``.

    Per document, the ``f_*`` flags control which outputs are written to the
    directory returned by ``prepare_env``: layout/span debug PDFs, the
    original (page-trimmed) PDF, markdown, content-list JSON, middle JSON,
    and the raw model output.

    NOTE(review): in the pipeline branch ``pdf_bytes_list`` is mutated in
    place (entries replaced by page-trimmed bytes) — callers should not
    reuse the list afterwards expecting the original bytes.
    """
    if backend == "pipeline":
        # Trim every PDF to [start_page_id, end_page_id] before batch analysis.
        # This overwrites the caller's list entries in place.
        for idx, pdf_bytes in enumerate(pdf_bytes_list):
            new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
            pdf_bytes_list[idx] = new_pdf_bytes

        # One batched analysis call for all documents; the five result lists
        # are parallel (indexed per document).
        infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(pdf_bytes_list, p_lang_list, parse_method=parse_method, formula_enable=p_formula_enable, table_enable=p_table_enable)

        for idx, model_list in enumerate(infer_results):
            # Deep-copy the raw model output before downstream steps, since
            # pipeline_result_to_middle_json receives (and may consume)
            # model_list itself.
            model_json = copy.deepcopy(model_list)
            pdf_file_name = pdf_file_names[idx]
            local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)

            images_list = all_image_lists[idx]
            pdf_doc = all_pdf_docs[idx]
            _lang = lang_list[idx]
            _ocr_enable = ocr_enabled_list[idx]
            # Convert raw model output into the "middle JSON" document model.
            middle_json = pipeline_result_to_middle_json(model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, p_formula_enable)

            pdf_info = middle_json["pdf_info"]

            pdf_bytes = pdf_bytes_list[idx]
            if f_draw_layout_bbox:
                draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")

            if f_draw_span_bbox:
                draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")

            if f_dump_orig_pdf:
                # "Original" here means the page-trimmed bytes, not the
                # untouched input file.
                md_writer.write(
                    f"{pdf_file_name}_origin.pdf",
                    pdf_bytes,
                )

            if f_dump_md:
                # Markdown references images by the directory's base name only.
                image_dir = str(os.path.basename(local_image_dir))
                md_content_str = pipeline_union_make(pdf_info, f_make_md_mode, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}.md",
                    md_content_str,
                )

            if f_dump_content_list:
                image_dir = str(os.path.basename(local_image_dir))
                content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}_content_list.json",
                    json.dumps(content_list, ensure_ascii=False, indent=4),
                )

            if f_dump_middle_json:
                md_writer.write_string(
                    f"{pdf_file_name}_middle.json",
                    json.dumps(middle_json, ensure_ascii=False, indent=4),
                )

            if f_dump_model_output:
                md_writer.write_string(
                    f"{pdf_file_name}_model.json",
                    json.dumps(model_json, ensure_ascii=False, indent=4),
                )

            logger.info(f"local output dir is {local_md_dir}")
    else:
        # VLM backends: strip the "vlm-" prefix so vlm_doc_analyze receives
        # the bare backend name (e.g. "transformers", "sglang-engine").
        if backend.startswith("vlm-"):
            backend = backend[4:]

        # Span boxes are not produced by the VLM path; force the flag off.
        f_draw_span_bbox = False
        parse_method = "vlm"

        # VLM path processes one document at a time.
        for idx, pdf_bytes in enumerate(pdf_bytes_list):
            pdf_file_name = pdf_file_names[idx]
            pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
            local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
            model_path = auto_download_and_get_model_root_path('/', 'vlm')
            middle_json, infer_result = vlm_doc_analyze(pdf_bytes, image_writer=image_writer, backend=backend, model_path=model_path, server_url=server_url)

            pdf_info = middle_json["pdf_info"]

            if f_draw_layout_bbox:
                draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")

            if f_draw_span_bbox:
                # Unreachable in practice: f_draw_span_bbox is forced False above.
                draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")

            if f_dump_orig_pdf:
                md_writer.write(
                    f"{pdf_file_name}_origin.pdf",
                    pdf_bytes,
                )

            if f_dump_md:
                image_dir = str(os.path.basename(local_image_dir))
                md_content_str = vlm_union_make(pdf_info, f_make_md_mode, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}.md",
                    md_content_str,
                )

            if f_dump_content_list:
                image_dir = str(os.path.basename(local_image_dir))
                content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}_content_list.json",
                    json.dumps(content_list, ensure_ascii=False, indent=4),
                )

            if f_dump_middle_json:
                md_writer.write_string(
                    f"{pdf_file_name}_middle.json",
                    json.dumps(middle_json, ensure_ascii=False, indent=4),
                )

            if f_dump_model_output:
                # VLM output is a sequence of text chunks; join with a
                # dashed separator line for readability.
                model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
                md_writer.write_string(
                    f"{pdf_file_name}_model_output.txt",
                    model_output,
                )

            logger.info(f"local output dir is {local_md_dir}")
def parse_doc(
    path_list: list[Path],
    output_dir,
    lang="ch",
    backend="pipeline",
    method="auto",
    server_url=None,
    start_page_id=0,  # Start page ID for parsing, default is 0
    end_page_id=None  # End page ID for parsing, default is None (parse all pages until the end of the document)
):
    """
    Parameter description:
    path_list: List of document paths to be parsed, can be PDF or image files.
    output_dir: Output directory for storing parsing results.
    lang: Language option, default is 'ch', optional values include['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']。
        Input the languages in the pdf (if known) to improve OCR accuracy.  Optional.
        Adapted only for the case where the backend is set to "pipeline"
    backend: the backend for parsing pdf:
        pipeline: More general.
        vlm-transformers: More general.
        vlm-sglang-engine: Faster(engine).
        vlm-sglang-client: Faster(client).
        without method specified, pipeline will be used by default.
    method: the method for parsing pdf:
        auto: Automatically determine the method based on the file type.
        txt: Use text extraction method.
        ocr: Use OCR method for image-based PDFs.
        Without method specified, 'auto' will be used by default.
        Adapted only for the case where the backend is set to "pipeline".
    server_url: When the backend is `sglang-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
    """
    try:
        # Build the three parallel per-document lists that do_parse expects:
        # file stems, raw bytes, and one language hint per document.
        file_name_list = [str(Path(doc_path).stem) for doc_path in path_list]
        pdf_bytes_list = [read_fn(doc_path) for doc_path in path_list]
        lang_list = [lang] * len(path_list)

        do_parse(
            output_dir=output_dir,
            pdf_file_names=file_name_list,
            pdf_bytes_list=pdf_bytes_list,
            p_lang_list=lang_list,
            backend=backend,
            parse_method=method,
            server_url=server_url,
            start_page_id=start_page_id,
            end_page_id=end_page_id
        )
    except Exception as e:
        # Best-effort demo wrapper: log the full traceback instead of
        # propagating, so one bad document does not abort the caller.
        logger.exception(e)
if __name__ == '__main__':
    # args
    __dir__ = os.path.dirname(os.path.abspath(__file__))
    pdf_files_dir = os.path.join(__dir__, "pdfs")
    output_dir = os.path.join(__dir__, "output")
    pdf_suffixes = [".pdf"]
    image_suffixes = [".png", ".jpeg", ".jpg"]

    # Gather every PDF or image file in ./pdfs (non-recursive).
    doc_path_list = []
    for doc_path in Path(pdf_files_dir).glob('*'):
        if doc_path.suffix in pdf_suffixes + image_suffixes:
            doc_path_list.append(doc_path)

    # Parse with defaults: lang='ch', backend='pipeline', method='auto'.
    parse_doc(doc_path_list, output_dir)
mineru/cli/client.py
View file @
91defbb0
...
...
@@ -41,7 +41,8 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
auto: Automatically determine the method based on the file type.
txt: Use text extraction method.
ocr: Use OCR method for image-based PDFs.
Without method specified, 'auto' will be used by default."""
,
Without method specified, 'auto' will be used by default.
Adapted only for the case where the backend is set to "pipeline"."""
,
default
=
'auto'
,
)
@
click
.
option
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment