wangsen / MinerU · Commits

Commit 66e616bd (unverified), authored Jul 05, 2025 by Xiaomeng Zhao, committed by GitHub on Jul 05, 2025.

Merge pull request #2895 from opendatalab/release-2.1.0

Release 2.1.0

Parents: 592b659e, a4c9a07b

Changes: 71 in total; showing 11 changed files with 264 additions and 491 deletions (+264, -491).
projects/multi_gpu_v2/_config_endpoint.py (+60, -0)
projects/multi_gpu_v2/client.py (+82, -0)
projects/multi_gpu_v2/server.py (+108, -0)
projects/web_api/Dockerfile (+0, -67)
projects/web_api/README.md (+0, -31)
projects/web_api/app.py (+0, -305)
projects/web_api/download_models.py (+0, -33)
projects/web_api/entrypoint.sh (+0, -5)
projects/web_api/magic-pdf.json (+0, -44)
projects/web_api/requirements.txt (+0, -5)
pyproject.toml (+14, -1)
projects/multi_gpu_v2/_config_endpoint.py (new file, 0 → 100644)
```python
import requests
import os
import logging

logging.basicConfig(level=logging.INFO)

# test connection to huggingface
TIMEOUT = 3


def config_endpoint():
    """
    Checks for connectivity to Hugging Face and sets the model source accordingly.
    If the Hugging Face endpoint is reachable, it sets MINERU_MODEL_SOURCE to 'huggingface'.
    Otherwise, it falls back to 'modelscope'.
    """
    os.environ.setdefault('MINERU_MODEL_SOURCE', 'huggingface')
    model_list_url = "https://huggingface.co/models"
    modelscope_url = "https://modelscope.cn/models"

    # Use a specific check for the Hugging Face source
    if os.environ['MINERU_MODEL_SOURCE'] == 'huggingface':
        try:
            response = requests.head(model_list_url, timeout=TIMEOUT)
            # Check for any successful status code (2xx)
            if response.ok:
                logging.info("Successfully connected to Hugging Face. Using 'huggingface' as model source.")
                return True
            else:
                logging.warning(f"Hugging Face endpoint returned a non-success status code: {response.status_code}")
        except requests.exceptions.RequestException as e:
            logging.error(f"Failed to connect to Hugging Face at {model_list_url}: {e}")

        # If any of the above checks fail, switch to modelscope
        logging.info("Falling back to 'modelscope' as model source.")
        os.environ['MINERU_MODEL_SOURCE'] = 'modelscope'
    elif os.environ['MINERU_MODEL_SOURCE'] == 'modelscope':
        try:
            response = requests.head(modelscope_url, timeout=TIMEOUT)
            if response.ok:
                logging.info("Successfully connected to ModelScope. Using 'modelscope' as model source.")
                return True
        except requests.exceptions.RequestException as e:
            logging.error(f"Failed to connect to ModelScope at {modelscope_url}: {e}")
    elif os.environ['MINERU_MODEL_SOURCE'] == 'local':
        logging.info("Using 'local' as model source.")
        return True
    else:
        logging.info(f"Using custom model source: {os.environ['MINERU_MODEL_SOURCE']}")
        return True

    return False


if __name__ == '__main__':
    print(config_endpoint())
```
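For deployments that should never probe Hugging Face, the source can be pinned before the check runs. A minimal sketch, assuming this module is importable as `_config_endpoint` from the same directory:

```python
import os
from _config_endpoint import config_endpoint

# Pin the source up front; config_endpoint() then only verifies ModelScope.
os.environ['MINERU_MODEL_SOURCE'] = 'modelscope'
if config_endpoint():
    print(f"Model source: {os.environ['MINERU_MODEL_SOURCE']}")
```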
projects/multi_gpu_v2/client.py (new file, 0 → 100644)
```python
import base64
import os

from loguru import logger
import asyncio
import aiohttp


async def mineru_parse_async(session, file_path, server_url='http://127.0.0.1:8000/predict', **options):
    """
    Asynchronous version of the parse function.
    """
    try:
        # Read and base64-encode the file (a plain synchronous read; the inputs are small)
        with open(file_path, 'rb') as f:
            file_b64 = base64.b64encode(f.read()).decode('utf-8')
        payload = {'file': file_b64, 'options': options}

        # Use the aiohttp session to send the request
        async with session.post(server_url, json=payload) as response:
            if response.status == 200:
                result = await response.json()
                logger.info(f"✅ Processed: {file_path} -> {result.get('output_dir', 'N/A')}")
                return result
            else:
                error_text = await response.text()
                logger.error(f"❌ Server error for {file_path}: {error_text}")
                return {'error': error_text}
    except Exception as e:
        logger.error(f"❌ Failed to process {file_path}: {e}")
        return {'error': str(e)}


async def main():
    """
    Main function to run all parsing tasks concurrently.
    """
    test_files = [
        '../../demo/pdfs/demo1.pdf',
        '../../demo/pdfs/demo2.pdf',
        '../../demo/pdfs/demo3.pdf',
        '../../demo/pdfs/small_ocr.pdf',
    ]
    test_files = [os.path.join(os.path.dirname(__file__), f) for f in test_files]
    existing_files = [f for f in test_files if os.path.exists(f)]
    if not existing_files:
        logger.warning("No test files found.")
        return

    # Create an aiohttp session to be reused across requests
    async with aiohttp.ClientSession() as session:
        # === Basic Processing ===
        basic_tasks = [mineru_parse_async(session, file_path) for file_path in existing_files[:2]]

        # === Custom Options ===
        custom_options = {
            'backend': 'pipeline',
            'lang': 'ch',
            'method': 'auto',
            'formula_enable': True,
            'table_enable': True,
        }
        # 'backend': 'sglang-engine' requires 24+ GB VRAM per worker
        custom_tasks = [
            mineru_parse_async(session, file_path, **custom_options)
            for file_path in existing_files[2:]
        ]

        # Start all tasks
        all_tasks = basic_tasks + custom_tasks
        all_results = await asyncio.gather(*all_tasks)
        logger.info(f"All Results: {all_results}")

    logger.info("🎉 All processing completed!")


if __name__ == '__main__':
    # Run the async main function
    asyncio.run(main())
```
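The same coroutine also works outside the batch driver for a single document. A minimal sketch, assuming the server from `server.py` is listening on port 8000 and that `sample.pdf` (a placeholder name) exists:

```python
import asyncio
import aiohttp
from client import mineru_parse_async

async def one_off():
    async with aiohttp.ClientSession() as session:
        # Keyword arguments become the 'options' object sent to the server.
        result = await mineru_parse_async(session, 'sample.pdf', lang='en', method='auto')
        print(result.get('output_dir', result))

asyncio.run(one_off())
```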
projects/multi_gpu_v2/server.py (new file, 0 → 100644)
```python
import os
import base64
import tempfile
from pathlib import Path

import litserve as ls
from fastapi import HTTPException
from loguru import logger
from mineru.cli.common import do_parse, read_fn
from mineru.utils.config_reader import get_device
from mineru.utils.model_utils import get_vram
from _config_endpoint import config_endpoint


class MinerUAPI(ls.LitAPI):
    def __init__(self, output_dir='/tmp'):
        super().__init__()
        self.output_dir = output_dir

    def setup(self, device):
        """Setup environment variables exactly like MinerU CLI does"""
        logger.info(f"Setting up on device: {device}")

        if os.getenv('MINERU_DEVICE_MODE', None) is None:
            os.environ['MINERU_DEVICE_MODE'] = device if device != 'auto' else get_device()
        device_mode = os.environ['MINERU_DEVICE_MODE']

        if os.getenv('MINERU_VIRTUAL_VRAM_SIZE', None) is None:
            if device_mode.startswith("cuda") or device_mode.startswith("npu"):
                vram = round(get_vram(device_mode))
                os.environ['MINERU_VIRTUAL_VRAM_SIZE'] = str(vram)
            else:
                os.environ['MINERU_VIRTUAL_VRAM_SIZE'] = '1'
        logger.info(f"MINERU_VIRTUAL_VRAM_SIZE: {os.environ['MINERU_VIRTUAL_VRAM_SIZE']}")

        if os.getenv('MINERU_MODEL_SOURCE', None) in ['huggingface', None]:
            config_endpoint()
        logger.info(f"MINERU_MODEL_SOURCE: {os.environ['MINERU_MODEL_SOURCE']}")

    def decode_request(self, request):
        """Decode file and options from request"""
        file_b64 = request['file']
        options = request.get('options', {})
        file_bytes = base64.b64decode(file_b64)

        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp:
            temp.write(file_bytes)
            temp_file = Path(temp.name)

        return {
            'input_path': str(temp_file),
            'backend': options.get('backend', 'pipeline'),
            'method': options.get('method', 'auto'),
            'lang': options.get('lang', 'ch'),
            'formula_enable': options.get('formula_enable', True),
            'table_enable': options.get('table_enable', True),
            'start_page_id': options.get('start_page_id', 0),
            'end_page_id': options.get('end_page_id', None),
            'server_url': options.get('server_url', None),
        }

    def predict(self, inputs):
        """Call MinerU's do_parse - same as CLI"""
        input_path = inputs['input_path']
        output_dir = Path(self.output_dir) / Path(input_path).stem
        try:
            os.makedirs(output_dir, exist_ok=True)
            file_name = Path(input_path).stem
            pdf_bytes = read_fn(Path(input_path))
            do_parse(
                output_dir=str(output_dir),
                pdf_file_names=[file_name],
                pdf_bytes_list=[pdf_bytes],
                p_lang_list=[inputs['lang']],
                backend=inputs['backend'],
                parse_method=inputs['method'],
                formula_enable=inputs['formula_enable'],
                table_enable=inputs['table_enable'],
                server_url=inputs['server_url'],
                start_page_id=inputs['start_page_id'],
                end_page_id=inputs['end_page_id'],
            )
            return str(output_dir)
        except Exception as e:
            logger.error(f"Processing failed: {e}")
            raise HTTPException(status_code=500, detail=str(e))
        finally:
            # Cleanup temp file
            if Path(input_path).exists():
                Path(input_path).unlink()

    def encode_response(self, response):
        return {'output_dir': response}


if __name__ == '__main__':
    server = ls.LitServer(
        MinerUAPI(output_dir='/tmp/mineru_output'),
        accelerator='auto',
        devices='auto',
        workers_per_device=1,
        timeout=False,
    )
    logger.info("Starting MinerU server on port 8000")
    server.run(port=8000, generate_client_file=False)
```
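Because `decode_request` expects a JSON body with a base64-encoded `file` and an optional `options` object, any HTTP client can call the endpoint, not just the bundled `client.py`. A minimal synchronous sketch with `requests` (`demo.pdf` is a placeholder path; the response shape follows `encode_response`):

```python
import base64
import requests

with open('demo.pdf', 'rb') as f:  # placeholder input file
    payload = {
        'file': base64.b64encode(f.read()).decode('utf-8'),
        'options': {'backend': 'pipeline', 'lang': 'en'},
    }

resp = requests.post('http://127.0.0.1:8000/predict', json=payload)
resp.raise_for_status()
print(resp.json()['output_dir'])  # e.g. /tmp/mineru_output/<temp file stem>
```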
projects/web_api/Dockerfile (deleted, 100644 → 0)
```dockerfile
FROM python:3.10-slim-bookworm AS base

WORKDIR /app

ENV DEBIAN_FRONTEND=noninteractive \
    LANG=C.UTF-8 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1

FROM base AS build

# Update the package list and install necessary packages
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Build Python dependencies
COPY requirements.txt .
RUN python -m venv /app/venv && \
    . /app/venv/bin/activate && \
    pip install -r requirements.txt
    # pip uninstall -y paddlepaddle && \
    # pip install -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ \
    #     paddlepaddle-gpu==3.0.0rc1

# Download models
COPY download_models.py .
RUN . /app/venv/bin/activate && \
    ./download_models.py

FROM base AS prod

# Copy Python dependencies and models from the build stage
COPY --from=build /app/venv /app/venv
COPY --from=build /opt/models /opt/models
COPY --from=build /opt/layoutreader /opt/layoutreader

# Update the package list and install necessary packages
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    libgomp1 && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Create volume for paddleocr models
# RUN mkdir -p /root/.paddleocr
# VOLUME [ "/root/.paddleocr" ]

# Copy the app and its configuration file
COPY entrypoint.sh /app/entrypoint.sh
COPY magic-pdf.json /root/magic-pdf.json
COPY app.py /app/app.py

# Expose the port that FastAPI will run on
EXPOSE 8000

# Command to run FastAPI using Uvicorn, pointing to app.py and binding to 0.0.0.0:8000
ENTRYPOINT [ "/app/entrypoint.sh" ]
CMD ["--host", "0.0.0.0", "--port", "8000"]
```
projects/web_api/README.md (deleted, 100644 → 0)
# MinerU-based PDF parsing API

- GPU Docker image build for MinerU
- FastAPI-based PDF parsing endpoint

## Build

```
docker build -t mineru-api .
```

Or build through a proxy:

```
docker build --build-arg http_proxy=http://127.0.0.1:7890 --build-arg https_proxy=http://127.0.0.1:7890 -t mineru-api .
```

## Run

```
docker run --rm -it --gpus=all -p 8000:8000 mineru-api
```

## Test

Open the interactive API docs at:

```
http://localhost:8000/docs
http://127.0.0.1:8000/docs
```
projects/web_api/app.py (deleted, 100644 → 0)
```python
import json
import os
import tempfile
from base64 import b64encode
from glob import glob
from io import StringIO
from typing import Tuple, Union

import uvicorn
from fastapi import FastAPI, Form, UploadFile
from fastapi.responses import JSONResponse
from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import DataWriter, FileBasedDataWriter
from magic_pdf.data.data_reader_writer.s3 import S3DataReader, S3DataWriter
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
from magic_pdf.data.read_api import read_local_images, read_local_office
from magic_pdf.libs.config_reader import get_bucket_name, get_s3_config
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.operators.models import InferenceResult
from magic_pdf.operators.pipes import PipeResult

model_config.__use_inside_model__ = True

app = FastAPI()

pdf_extensions = [".pdf"]
office_extensions = [".ppt", ".pptx", ".doc", ".docx"]
image_extensions = [".png", ".jpg", ".jpeg"]


class MemoryDataWriter(DataWriter):
    def __init__(self):
        self.buffer = StringIO()

    def write(self, path: str, data: bytes) -> None:
        if isinstance(data, str):
            self.buffer.write(data)
        else:
            self.buffer.write(data.decode("utf-8"))

    def write_string(self, path: str, data: str) -> None:
        self.buffer.write(data)

    def get_value(self) -> str:
        return self.buffer.getvalue()

    def close(self):
        self.buffer.close()


def init_writers(
    file_path: str = None,
    file: UploadFile = None,
    output_path: str = None,
    output_image_path: str = None,
) -> Tuple[
    Union[S3DataWriter, FileBasedDataWriter],
    Union[S3DataWriter, FileBasedDataWriter],
    bytes,
    str,
]:
    """
    Initialize writers based on path type

    Args:
        file_path: file path (local path or S3 path)
        file: Uploaded file object
        output_path: Output directory path
        output_image_path: Image output directory path

    Returns:
        Tuple[writer, image_writer, file_bytes, file_extension]: Returns the
        initialized writers, the file content, and the file extension
    """
    file_extension: str = None
    if file_path:
        is_s3_path = file_path.startswith("s3://")
        if is_s3_path:
            bucket = get_bucket_name(file_path)
            ak, sk, endpoint = get_s3_config(bucket)

            writer = S3DataWriter(
                output_path, bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
            )
            image_writer = S3DataWriter(
                output_image_path, bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
            )
            # Temporarily create a reader to fetch the file content
            temp_reader = S3DataReader(
                "", bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
            )
            file_bytes = temp_reader.read(file_path)
            file_extension = os.path.splitext(file_path)[1]
        else:
            writer = FileBasedDataWriter(output_path)
            image_writer = FileBasedDataWriter(output_image_path)
            os.makedirs(output_image_path, exist_ok=True)
            with open(file_path, "rb") as f:
                file_bytes = f.read()
            file_extension = os.path.splitext(file_path)[1]
    else:
        # Handle the uploaded file
        file_bytes = file.file.read()
        file_extension = os.path.splitext(file.filename)[1]
        writer = FileBasedDataWriter(output_path)
        image_writer = FileBasedDataWriter(output_image_path)
        os.makedirs(output_image_path, exist_ok=True)

    return writer, image_writer, file_bytes, file_extension


def process_file(
    file_bytes: bytes,
    file_extension: str,
    parse_method: str,
    image_writer: Union[S3DataWriter, FileBasedDataWriter],
) -> Tuple[InferenceResult, PipeResult]:
    """
    Process PDF file content

    Args:
        file_bytes: Binary content of file
        file_extension: file extension
        parse_method: Parse method ('ocr', 'txt', 'auto')
        image_writer: Image writer

    Returns:
        Tuple[InferenceResult, PipeResult]: Returns inference result and pipeline result
    """
    ds: Union[PymuDocDataset, ImageDataset] = None
    if file_extension in pdf_extensions:
        ds = PymuDocDataset(file_bytes)
    elif file_extension in office_extensions:
        # Office documents must go through office conversion before parsing
        temp_dir = tempfile.mkdtemp()
        with open(os.path.join(temp_dir, f"temp_file{file_extension}"), "wb") as f:
            f.write(file_bytes)
        ds = read_local_office(temp_dir)[0]
    elif file_extension in image_extensions:
        # Images must be parsed with OCR
        temp_dir = tempfile.mkdtemp()
        with open(os.path.join(temp_dir, f"temp_file{file_extension}"), "wb") as f:
            f.write(file_bytes)
        ds = read_local_images(temp_dir)[0]

    infer_result: InferenceResult = None
    pipe_result: PipeResult = None

    if parse_method == "ocr":
        infer_result = ds.apply(doc_analyze, ocr=True)
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    elif parse_method == "txt":
        infer_result = ds.apply(doc_analyze, ocr=False)
        pipe_result = infer_result.pipe_txt_mode(image_writer)
    else:  # auto
        if ds.classify() == SupportedPdfParseMethod.OCR:
            infer_result = ds.apply(doc_analyze, ocr=True)
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
        else:
            infer_result = ds.apply(doc_analyze, ocr=False)
            pipe_result = infer_result.pipe_txt_mode(image_writer)

    return infer_result, pipe_result


def encode_image(image_path: str) -> str:
    """Encode image using base64"""
    with open(image_path, "rb") as f:
        return b64encode(f.read()).decode()


@app.post(
    "/file_parse",
    tags=["projects"],
    summary="Parse files (supports local files and S3)",
)
async def file_parse(
    file: UploadFile = None,
    file_path: str = Form(None),
    parse_method: str = Form("auto"),
    is_json_md_dump: bool = Form(False),
    output_dir: str = Form("output"),
    return_layout: bool = Form(False),
    return_info: bool = Form(False),
    return_content_list: bool = Form(False),
    return_images: bool = Form(False),
):
    """
    Execute the process of converting PDF to JSON and MD, outputting MD and JSON files
    to the specified directory.

    Args:
        file: The PDF file to be parsed. Must not be specified together with
            `file_path`
        file_path: The path to the PDF file to be parsed. Must not be specified together
            with `file`
        parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If
            results are not satisfactory, try ocr
        is_json_md_dump: Whether to write parsed data to .json and .md files. Default
            to False. Different stages of data will be written to different .json files
            (3 in total), md content will be saved to .md file
        output_dir: Output directory for results. A folder named after the PDF file
            will be created to store all results
        return_layout: Whether to return parsed PDF layout. Default to False
        return_info: Whether to return parsed PDF info. Default to False
        return_content_list: Whether to return parsed PDF content list. Default to False
        return_images: Whether to return extracted images as base64 data URIs. Default
            to False
    """
    try:
        if (file is None and file_path is None) or (
            file is not None and file_path is not None
        ):
            return JSONResponse(
                content={"error": "Must provide either file or file_path"},
                status_code=400,
            )

        # Get PDF filename
        file_name = os.path.basename(file_path if file_path else file.filename).split(".")[0]
        output_path = f"{output_dir}/{file_name}"
        output_image_path = f"{output_path}/images"

        # Initialize readers/writers and get PDF content
        writer, image_writer, file_bytes, file_extension = init_writers(
            file_path=file_path,
            file=file,
            output_path=output_path,
            output_image_path=output_image_path,
        )

        # Process PDF
        infer_result, pipe_result = process_file(
            file_bytes, file_extension, parse_method, image_writer
        )

        # Use MemoryDataWriter to get results
        content_list_writer = MemoryDataWriter()
        md_content_writer = MemoryDataWriter()
        middle_json_writer = MemoryDataWriter()

        # Use PipeResult's dump method to get data
        pipe_result.dump_content_list(content_list_writer, "", "images")
        pipe_result.dump_md(md_content_writer, "", "images")
        pipe_result.dump_middle_json(middle_json_writer, "")

        # Get content
        content_list = json.loads(content_list_writer.get_value())
        md_content = md_content_writer.get_value()
        middle_json = json.loads(middle_json_writer.get_value())
        model_json = infer_result.get_infer_res()

        # If results need to be saved
        if is_json_md_dump:
            writer.write_string(
                f"{file_name}_content_list.json", content_list_writer.get_value()
            )
            writer.write_string(f"{file_name}.md", md_content)
            writer.write_string(
                f"{file_name}_middle.json", middle_json_writer.get_value()
            )
            writer.write_string(
                f"{file_name}_model.json",
                json.dumps(model_json, indent=4, ensure_ascii=False),
            )
            # Save visualization results
            pipe_result.draw_layout(os.path.join(output_path, f"{file_name}_layout.pdf"))
            pipe_result.draw_span(os.path.join(output_path, f"{file_name}_spans.pdf"))
            pipe_result.draw_line_sort(
                os.path.join(output_path, f"{file_name}_line_sort.pdf")
            )
            infer_result.draw_model(os.path.join(output_path, f"{file_name}_model.pdf"))

        # Build return data
        data = {}
        if return_layout:
            data["layout"] = model_json
        if return_info:
            data["info"] = middle_json
        if return_content_list:
            data["content_list"] = content_list
        if return_images:
            image_paths = glob(f"{output_image_path}/*.jpg")
            data["images"] = {
                os.path.basename(image_path): f"data:image/jpeg;base64,{encode_image(image_path)}"
                for image_path in image_paths
            }
        data["md_content"] = md_content  # md_content is always returned

        # Clean up memory writers
        content_list_writer.close()
        md_content_writer.close()
        middle_json_writer.close()

        return JSONResponse(data, status_code=200)

    except Exception as e:
        logger.exception(e)
        return JSONResponse(content={"error": str(e)}, status_code=500)


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8888)
```
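Since `/file_parse` takes multipart form fields, the (now removed) service could be exercised with a short `requests` call. A sketch assuming the app runs on port 8888 as above (`test.pdf` is a placeholder file):

```python
import requests

with open('test.pdf', 'rb') as f:  # placeholder input file
    resp = requests.post(
        'http://127.0.0.1:8888/file_parse',
        files={'file': f},
        data={'parse_method': 'auto', 'return_content_list': True},
    )
resp.raise_for_status()
print(resp.json()['md_content'][:200])  # markdown is always returned
```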
projects/web_api/download_models.py (deleted, 100755 → 0)
```python
#!/usr/bin/env python
from huggingface_hub import snapshot_download

if __name__ == "__main__":
    mineru_patterns = [
        # "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_hf_small_2503/*",
        "models/OCR/paddleocr_torch/*",
        # "models/TabRec/TableMaster/*",
        # "models/TabRec/StructEqTable/*",
    ]
    model_dir = snapshot_download(
        "opendatalab/PDF-Extract-Kit-1.0",
        allow_patterns=mineru_patterns,
        local_dir="/opt/",
    )

    layoutreader_pattern = [
        "*.json",
        "*.safetensors",
    ]
    layoutreader_model_dir = snapshot_download(
        "hantian/layoutreader",
        allow_patterns=layoutreader_pattern,
        local_dir="/opt/layoutreader/",
    )

    model_dir = model_dir + "/models"
    print(f"model_dir is: {model_dir}")
    print(f"layoutreader_model_dir is: {layoutreader_model_dir}")
```
projects/web_api/entrypoint.sh (deleted, 100755 → 0)
```bash
#!/usr/bin/env bash
set -euo pipefail

. /app/venv/bin/activate
exec uvicorn app:app "$@"
```
projects/web_api/magic-pdf.json (deleted, 100644 → 0)
```json
{
    "bucket_info": {
        "bucket-name-1": ["ak", "sk", "endpoint"],
        "bucket-name-2": ["ak", "sk", "endpoint"]
    },
    "models-dir": "/opt/models",
    "layoutreader-model-dir": "/opt/layoutreader",
    "device-mode": "cuda",
    "layout-config": {
        "model": "doclayout_yolo"
    },
    "formula-config": {
        "mfd_model": "yolo_v8_mfd",
        "mfr_model": "unimernet_small",
        "enable": true
    },
    "table-config": {
        "model": "rapid_table",
        "sub_model": "slanet_plus",
        "enable": true,
        "max_time": 400
    },
    "llm-aided-config": {
        "formula_aided": {
            "api_key": "your_api_key",
            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
            "model": "qwen2.5-7b-instruct",
            "enable": false
        },
        "text_aided": {
            "api_key": "your_api_key",
            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
            "model": "qwen2.5-7b-instruct",
            "enable": false
        },
        "title_aided": {
            "api_key": "your_api_key",
            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
            "model": "qwen2.5-32b-instruct",
            "enable": false
        }
    },
    "config_version": "1.2.0"
}
```
projects/web_api/requirements.txt (deleted, 100644 → 0)
```
magic-pdf[full]
fastapi
uvicorn
python-multipart
```
pyproject.toml (modified, +14, -1)
```diff
@@ -43,7 +43,7 @@ vlm = [
     "pydantic",
 ]
 sglang = [
-    "sglang[all]==0.4.7",
+    "sglang[all]>=0.4.8,<0.4.9",
 ]
 pipeline = [
     "matplotlib>=3.10,<4",
@@ -62,9 +62,20 @@ pipeline = [
     "transformers>=4.49.0,!=4.51.0,<5.0.0",
     "fast-langdetect>=0.2.3,<0.3.0",
 ]
+api = [
+    "fastapi",
+    "python-multipart",
+    "uvicorn",
+]
 gradio = [
     "gradio>=5.34,<6",
     "gradio-pdf>=0.0.22",
 ]
+core = [
+    "mineru[vlm]",
+    "mineru[pipeline]",
+    "mineru[api]",
+    "mineru[gradio]",
+]
 all = [
     "mineru[core]",
@@ -97,6 +108,8 @@ Repository = "https://github.com/opendatalab/MinerU"
 mineru = "mineru.cli:client.main"
 mineru-sglang-server = "mineru.cli.vlm_sglang_server:main"
 mineru-models-download = "mineru.cli.models_download:download_models"
+mineru-api = "mineru.cli.fast_api:main"
+mineru-gradio = "mineru.cli.gradio_app:main"

 [tool.setuptools.dynamic]
 version = { attr = "mineru.version.__version__" }
```
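With these extras in place, the optional dependency groups introduced here can be installed individually or through the umbrella extra; a couple of illustrative commands:

```
pip install "mineru[api]"     # FastAPI server dependencies only
pip install "mineru[core]"    # vlm + pipeline + api + gradio
```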