Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
919280aa
Unverified
Commit
919280aa
authored
Jul 05, 2025
by
Xiaomeng Zhao
Committed by
GitHub
Jul 05, 2025
Browse files
Merge branch 'dev' into multi_gpu_v2
parents
ea9336c0
c6881d83
Changes
77
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
62 additions
and
675 deletions
+62
-675
projects/gradio_app/examples/complex_layout_para_split_list.pdf
...ts/gradio_app/examples/complex_layout_para_split_list.pdf
+0
-0
projects/gradio_app/examples/garbled_formula.pdf
projects/gradio_app/examples/garbled_formula.pdf
+0
-0
projects/gradio_app/examples/magazine_complex_layout_images_list.pdf
...adio_app/examples/magazine_complex_layout_images_list.pdf
+0
-0
projects/gradio_app/examples/scanned.pdf
projects/gradio_app/examples/scanned.pdf
+0
-0
projects/gradio_app/requirements.txt
projects/gradio_app/requirements.txt
+0
-3
projects/multi_gpu/README.md
projects/multi_gpu/README.md
+0
-44
projects/multi_gpu/client.py
projects/multi_gpu/client.py
+0
-39
projects/multi_gpu/server.py
projects/multi_gpu/server.py
+0
-98
projects/web_api/Dockerfile
projects/web_api/Dockerfile
+0
-67
projects/web_api/README.md
projects/web_api/README.md
+0
-31
projects/web_api/app.py
projects/web_api/app.py
+0
-305
projects/web_api/download_models.py
projects/web_api/download_models.py
+0
-33
projects/web_api/entrypoint.sh
projects/web_api/entrypoint.sh
+0
-5
projects/web_api/magic-pdf.json
projects/web_api/magic-pdf.json
+0
-44
projects/web_api/requirements.txt
projects/web_api/requirements.txt
+0
-5
pyproject.toml
pyproject.toml
+14
-1
signatures/version1/cla.json
signatures/version1/cla.json
+48
-0
No files found.
projects/gradio_app/examples/complex_layout_para_split_list.pdf
deleted
100644 → 0
View file @
ea9336c0
File deleted
projects/gradio_app/examples/garbled_formula.pdf
deleted
100644 → 0
View file @
ea9336c0
File deleted
projects/gradio_app/examples/magazine_complex_layout_images_list.pdf
deleted
100644 → 0
View file @
ea9336c0
File deleted
projects/gradio_app/examples/scanned.pdf
deleted
100755 → 0
View file @
ea9336c0
File deleted
projects/gradio_app/requirements.txt
deleted
100644 → 0
View file @
ea9336c0
magic-pdf[full]>=0.8.0
gradio
gradio-pdf
\ No newline at end of file
projects/multi_gpu/README.md
deleted
100644 → 0
View file @
ea9336c0
## 项目简介
本项目提供基于 LitServe 的多 GPU 并行处理方案。LitServe 是一个简便且灵活的 AI 模型服务引擎,基于 FastAPI 构建。它为 FastAPI 增强了批处理、流式传输和 GPU 自动扩展等功能,无需为每个模型单独重建 FastAPI 服务器。
## 环境配置
请使用以下命令配置所需的环境:
```bash
pip install -U magic-pdf[full] litserve python-multipart filetype
```
## 快速使用
### 1. 启动服务端
以下示例展示了如何启动服务端,支持自定义设置:
```python
server = ls.LitServer(
    MinerUAPI(output_dir='/tmp'),  # 可自定义输出文件夹
    accelerator='cuda',  # 启用 GPU 加速
    devices='auto',  # "auto" 使用所有 GPU
    workers_per_device=1,  # 每个 GPU 启动一个服务实例
    timeout=False  # 设置为 False 以禁用超时
)
server.run(port=8000)  # 设定服务端口为 8000
```
启动服务端命令:
```bash
python server.py
```
### 2. 启动客户端
以下代码展示了客户端的使用方式,可根据需求修改配置:
```python
files = ['demo/small_ocr.pdf']  # 替换为文件路径,支持 pdf、jpg/jpeg、png、doc、docx、ppt、pptx 文件
n_jobs = np.clip(len(files), 1, 8)  # 设置并发线程数,此处最大为 8,可根据自身修改
results = Parallel(n_jobs, prefer='threads', verbose=10)(
    delayed(do_parse)(p) for p in files
)
print(results)
```
启动客户端命令:
```bash
python client.py
```
好了,你的文件会自动在多个 GPU 上并行处理!🍻🍻🍻
projects/multi_gpu/client.py
deleted
100644 → 0
View file @
ea9336c0
import
base64
import
requests
import
numpy
as
np
from
loguru
import
logger
from
joblib
import
Parallel
,
delayed
def to_b64(file_path):
    """Read a file and return its contents as a base64-encoded string.

    Args:
        file_path: Path of the file to read (opened in binary mode).

    Returns:
        The file contents, base64-encoded and decoded to a UTF-8 ``str``.

    Raises:
        Exception: If the file cannot be read or encoded. The message carries
            the offending path, and the original error is chained as
            ``__cause__`` so the root traceback is not lost.
    """
    try:
        with open(file_path, 'rb') as f:
            return base64.b64encode(f.read()).decode('utf-8')
    except Exception as e:
        # Keep the original exception type and message format for callers,
        # but chain the cause so the underlying failure stays visible.
        raise Exception(f'File: {file_path} - Info: {e}') from e
def do_parse(file_path, url='http://127.0.0.1:8000/predict', **kwargs):
    """Send one file to the parsing server and return the decoded reply.

    Args:
        file_path: Path of the local file to upload.
        url: Endpoint that receives the POST request.
        **kwargs: Extra options forwarded under the ``'kwargs'`` key of the
            JSON payload.

    Returns:
        The decoded JSON body of a 200 response with ``'file_path'`` added to
        it, or ``None`` when anything fails (the failure is logged rather than
        raised).
    """
    try:
        payload = {'file': to_b64(file_path), 'kwargs': kwargs}
        reply = requests.post(url, json=payload)
        # Any status other than 200 is reported as a failure carrying the
        # response text; it is caught and logged just below.
        if reply.status_code != 200:
            raise Exception(reply.text)
        result = reply.json()
        result['file_path'] = file_path
        return result
    except Exception as err:
        logger.error(f'File: {file_path} - Info: {err}')
    return None
if __name__ == '__main__':
    # Files to submit; one do_parse call is issued per entry.
    files = ['demo/small_ocr.pdf']
    # One job per file, clamped to the range [1, 8].
    n_jobs = np.clip(len(files), 1, 8)
    runner = Parallel(n_jobs, prefer='threads', verbose=10)
    tasks = (delayed(do_parse)(p) for p in files)
    results = runner(tasks)
    print(results)
projects/multi_gpu/server.py
deleted
100644 → 0
View file @
ea9336c0
import
os
import
uuid
import
shutil
import
tempfile
import
gc
import
fitz
import
torch
import
base64
import
filetype
import
litserve
as
ls
from
pathlib
import
Path
from
fastapi
import
HTTPException
class MinerUAPI(ls.LitAPI):
    """LitServe API that converts an uploaded document to PDF and parses it.

    Requests carry a base64-encoded file under ``'file'`` and optional parse
    options under ``'kwargs'``; the response reports the directory the results
    were written to.
    """

    def __init__(self, output_dir='/tmp'):
        """Remember the base directory under which results are written.

        Args:
            output_dir: Base output directory; stored as a ``Path``.
        """
        # NOTE(review): the base-class initializer is not called here, as in
        # the original; confirm whether the installed litserve requires it.
        self.output_dir = Path(output_dir)

    def setup(self, device):
        """Restrict the worker to its device and load the models.

        Args:
            device: Device string; for ``'cuda...'`` values the part after the
                last ``':'`` is written to ``CUDA_VISIBLE_DEVICES``.

        Raises:
            RuntimeError: If more than one CUDA device is still visible after
                ``CUDA_VISIBLE_DEVICES`` has been set.
        """
        if device.startswith('cuda'):
            os.environ['CUDA_VISIBLE_DEVICES'] = device.split(':')[-1]
            # Presumably a count above one means CUDA was initialised before
            # the variable was set, so the restriction did not take effect.
            if torch.cuda.device_count() > 1:
                raise RuntimeError("Remove any CUDA actions before setting 'CUDA_VISIBLE_DEVICES'.")

        # Imported only after the environment variable has been set above.
        from magic_pdf.tools.cli import do_parse, convert_file_to_pdf
        from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton

        self.do_parse = do_parse
        self.convert_file_to_pdf = convert_file_to_pdf

        # Request both model variants up front.
        # NOTE(review): the meaning of the two positional flags is defined by
        # ModelSingleton.get_model -- confirm there.
        model_manager = ModelSingleton()
        model_manager.get_model(True, False)
        model_manager.get_model(False, False)
        print(f'Model initialization complete on {device}!')

    def decode_request(self, request):
        """Extract the PDF bytes and the parse options from a request.

        Args:
            request: Mapping with a base64 ``'file'`` entry and an optional
                ``'kwargs'`` mapping.

        Returns:
            A ``(pdf_bytes, opts)`` tuple; ``opts`` always contains
            ``'debug_able'`` (default ``False``) and ``'parse_method'``
            (default ``'auto'``).
        """
        file = request['file']
        file = self.cvt2pdf(file)
        opts = request.get('kwargs', {})
        opts.setdefault('debug_able', False)
        opts.setdefault('parse_method', 'auto')
        return file, opts

    def predict(self, inputs):
        """Parse one document and return the directory holding its results.

        Args:
            inputs: Sequence whose first item is the PDF bytes and whose
                second item is the keyword options passed to ``do_parse``.

        Returns:
            ``self.output_dir`` joined with a freshly generated UUID name.

        Raises:
            HTTPException: Status 500 with the error text when parsing fails;
                the partially written result directory is removed first.
        """
        # Computed before the ``try`` so the cleanup in ``except`` can never
        # hit an unbound name.
        pdf_name = str(uuid.uuid4())
        output_dir = self.output_dir.joinpath(pdf_name)
        try:
            self.do_parse(self.output_dir, pdf_name, inputs[0], [], **inputs[1])
            return output_dir
        except Exception as e:
            shutil.rmtree(output_dir, ignore_errors=True)
            raise HTTPException(status_code=500, detail=str(e)) from e
        finally:
            self.clean_memory()

    def encode_response(self, response):
        """Wrap the result directory in the response mapping."""
        return {'output_dir': response}

    def clean_memory(self):
        """Release cached CUDA memory (when CUDA is available) and run the GC."""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
        gc.collect()

    def cvt2pdf(self, file_base64):
        """Decode a base64 file and return it as PDF bytes.

        PDFs are returned unchanged, ``jpg``/``png`` images are converted with
        PyMuPDF, and ``doc``/``docx``/``ppt``/``pptx`` files are converted via
        ``convert_file_to_pdf`` inside a temporary directory.

        Args:
            file_base64: Base64-encoded file content.

        Returns:
            The document as PDF bytes.

        Raises:
            HTTPException: Status 500 when decoding or conversion fails, or
                when the detected format is not supported.
        """
        # Stays None unless an office conversion needs scratch space, so the
        # ``finally`` clause is safe whatever fails first.
        temp_dir = None
        try:
            file_bytes = base64.b64decode(file_base64)
            file_ext = filetype.guess_extension(file_bytes)

            if file_ext == 'pdf':
                return file_bytes
            if file_ext in ('jpg', 'png'):
                with fitz.open(stream=file_bytes, filetype=file_ext) as f:
                    return f.convert_to_pdf()
            if file_ext in ('doc', 'docx', 'ppt', 'pptx'):
                temp_dir = Path(tempfile.mkdtemp())
                temp_file = temp_dir.joinpath('tmpfile')
                temp_file.write_bytes(file_bytes)
                self.convert_file_to_pdf(temp_file, temp_dir)
                # The converted file is read from next to the input, with a
                # ``.pdf`` suffix.
                return temp_file.with_suffix('.pdf').read_bytes()
            raise Exception('Unsupported file format')
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e
        finally:
            if temp_dir is not None:
                shutil.rmtree(temp_dir, ignore_errors=True)
if __name__ == '__main__':
    # Results are written below /tmp.
    api = MinerUAPI(output_dir='/tmp')
    server = ls.LitServer(
        api,
        accelerator='cuda',
        devices='auto',
        workers_per_device=1,
        timeout=False,
    )
    server.run(port=8000)
projects/web_api/Dockerfile
deleted
100644 → 0
View file @
ea9336c0
FROM
python:3.10-slim-bookworm AS base
WORKDIR
/app
ENV
DEBIAN_FRONTEND=noninteractive \
LANG=C.UTF-8 \
PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1 \
PIP_NO_CACHE_DIR=1
FROM
base AS build
# Update the package list and install necessary packages
RUN
apt-get update
&&
\
apt-get
install
-y
--no-install-recommends
\
build-essential
&&
\
apt-get clean
&&
\
rm
-rf
/var/lib/apt/lists/
*
# Build Python dependencies
COPY
requirements.txt .
RUN
python
-m
venv /app/venv
&&
\
.
/app/venv/bin/activate
&&
\
pip
install
-r
requirements.txt
# pip uninstall -y paddlepaddle && \
# pip install -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ \
# paddlepaddle-gpu==3.0.0rc1
# Download models
COPY
download_models.py .
RUN
.
/app/venv/bin/activate
&&
\
./download_models.py
FROM
base AS prod
# Copy Python dependencies and models from the build stage
COPY
--from=build /app/venv /app/venv
COPY
--from=build /opt/models /opt/models
COPY
--from=build /opt/layoutreader /opt/layoutreader
# Update the package list and install necessary packages
RUN
apt-get update
&&
\
apt-get
install
-y
--no-install-recommends
\
libgl1
\
libglib2.0-0
\
libgomp1
&&
\
apt-get clean
&&
\
rm
-rf
/var/lib/apt/lists/
*
# Create volume for paddleocr models
# RUN mkdir -p /root/.paddleocr
# VOLUME [ "/root/.paddleocr" ]
# Copy the app and its configuration file
COPY
entrypoint.sh /app/entrypoint.sh
COPY
magic-pdf.json /root/magic-pdf.json
COPY
app.py /app/app.py
# Expose the port that FastAPI will run on
EXPOSE
8000
# Command to run FastAPI using Uvicorn, pointing to app.py and binding to 0.0.0.0:8000
ENTRYPOINT
[ "/app/entrypoint.sh" ]
CMD
["--host", "0.0.0.0", "--port", "8000"]
projects/web_api/README.md
deleted
100644 → 0
View file @
ea9336c0
# 基于MinerU的PDF解析API
- MinerU的GPU镜像构建
- 基于FastAPI的PDF解析接口
## 构建方式
```
docker build -t mineru-api .
```
或者使用代理:
```
docker build --build-arg http_proxy=http://127.0.0.1:7890 --build-arg https_proxy=http://127.0.0.1:7890 -t mineru-api .
```
## 启动命令
```
docker run --rm -it --gpus=all -p 8000:8000 mineru-api
```
## 测试参数
访问地址:
```
http://localhost:8000/docs
http://127.0.0.1:8000/docs
```
\ No newline at end of file
projects/web_api/app.py
deleted
100644 → 0
View file @
ea9336c0
import
json
import
os
from
base64
import
b64encode
from
glob
import
glob
from
io
import
StringIO
import
tempfile
from
typing
import
Tuple
,
Union
import
uvicorn
from
fastapi
import
FastAPI
,
HTTPException
,
UploadFile
from
fastapi.responses
import
JSONResponse
from
loguru
import
logger
from
magic_pdf.data.read_api
import
read_local_images
,
read_local_office
import
magic_pdf.model
as
model_config
from
magic_pdf.config.enums
import
SupportedPdfParseMethod
from
magic_pdf.data.data_reader_writer
import
DataWriter
,
FileBasedDataWriter
from
magic_pdf.data.data_reader_writer.s3
import
S3DataReader
,
S3DataWriter
from
magic_pdf.data.dataset
import
ImageDataset
,
PymuDocDataset
from
magic_pdf.libs.config_reader
import
get_bucket_name
,
get_s3_config
from
magic_pdf.model.doc_analyze_by_custom_model
import
doc_analyze
from
magic_pdf.operators.models
import
InferenceResult
from
magic_pdf.operators.pipes
import
PipeResult
from
fastapi
import
Form
# Set this flag on the magic_pdf.model module at import time.
model_config.__use_inside_model__ = True

app = FastAPI()

# File extensions (leading dot included) grouped by input category.
pdf_extensions = [".pdf"]
office_extensions = [".ppt", ".pptx", ".doc", ".docx"]
image_extensions = [".png", ".jpg", ".jpeg"]
class MemoryDataWriter(DataWriter):
    """DataWriter that collects everything written into one in-memory text buffer.

    The ``path`` argument of the write methods is accepted but not used.
    """

    def __init__(self):
        self.buffer = StringIO()

    def write(self, path: str, data: bytes) -> None:
        """Append ``data``; ``str`` is written as-is, anything else is UTF-8 decoded first."""
        text = data if isinstance(data, str) else data.decode("utf-8")
        self.buffer.write(text)

    def write_string(self, path: str, data: str) -> None:
        """Append ``data`` to the buffer."""
        self.buffer.write(data)

    def get_value(self) -> str:
        """Return everything written so far."""
        return self.buffer.getvalue()

    def close(self):
        """Close the underlying buffer."""
        self.buffer.close()
def init_writers(
    file_path: str = None,
    file: UploadFile = None,
    output_path: str = None,
    output_image_path: str = None,
) -> Tuple[
    Union[S3DataWriter, FileBasedDataWriter],
    Union[S3DataWriter, FileBasedDataWriter],
    bytes,
    str,
]:
    """
    Initialize writers based on path type and load the input file.

    Args:
        file_path: file path (local path or S3 path starting with "s3://")
        file: Uploaded file object, used when file_path is not given
        output_path: Output directory path
        output_image_path: Image output directory path

    Returns:
        Tuple[writer, image_writer, file_bytes, file_extension]: the result
        writer, the image writer, the raw file content and the file extension
        as returned by os.path.splitext (leading dot included).

    Raises:
        ValueError: If neither file_path nor file is provided.
    """
    if not file_path and file is None:
        raise ValueError("Either file_path or file must be provided")

    if file_path and file_path.startswith("s3://"):
        bucket = get_bucket_name(file_path)
        ak, sk, endpoint = get_s3_config(bucket)
        writer = S3DataWriter(
            output_path, bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
        )
        image_writer = S3DataWriter(
            output_image_path, bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
        )
        # Temporary reader used only to fetch the file content.
        temp_reader = S3DataReader(
            "", bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
        )
        file_bytes = temp_reader.read(file_path)
        file_extension = os.path.splitext(file_path)[1]
        return writer, image_writer, file_bytes, file_extension

    # Local path and uploaded file share the same file-based writers.
    writer = FileBasedDataWriter(output_path)
    image_writer = FileBasedDataWriter(output_image_path)
    os.makedirs(output_image_path, exist_ok=True)

    if file_path:
        with open(file_path, "rb") as f:
            file_bytes = f.read()
        source_name = file_path
    else:
        # Handle the uploaded file.
        file_bytes = file.file.read()
        source_name = file.filename

    file_extension = os.path.splitext(source_name)[1]
    return writer, image_writer, file_bytes, file_extension
def _load_dataset(file_bytes: bytes, file_extension: str):
    """Build the dataset matching ``file_extension`` from raw file bytes."""
    extension = (file_extension or "").lower()
    if extension in pdf_extensions:
        return PymuDocDataset(file_bytes)

    if extension in office_extensions:
        # Office documents go through the office reader.
        read_dir = read_local_office
    elif extension in image_extensions:
        # Images go through the image reader.
        read_dir = read_local_images
    else:
        raise ValueError(f"Unsupported file extension: {file_extension!r}")

    # The readers take a directory, so stage the bytes in a scratch directory
    # that is removed again once the dataset has been built.
    # NOTE(review): assumes the readers load the file content eagerly, so the
    # directory is no longer needed afterwards -- confirm against read_api.
    with tempfile.TemporaryDirectory() as temp_dir:
        # ``extension`` already starts with a dot; do not add another one.
        temp_path = os.path.join(temp_dir, f"temp_file{extension}")
        with open(temp_path, "wb") as f:
            f.write(file_bytes)
        return read_dir(temp_dir)[0]


def process_file(
    file_bytes: bytes,
    file_extension: str,
    parse_method: str,
    image_writer: Union[S3DataWriter, FileBasedDataWriter],
) -> Tuple[InferenceResult, PipeResult]:
    """
    Process file content (PDF, office document or image)

    Args:
        file_bytes: Binary content of file
        file_extension: file extension including the leading dot (e.g. ".pdf");
            matched case-insensitively
        parse_method: Parse method ('ocr', 'txt', 'auto'); any value other than
            'ocr' or 'txt' is treated as 'auto'
        image_writer: Image writer

    Returns:
        Tuple[InferenceResult, PipeResult]: Returns inference result and pipeline result

    Raises:
        ValueError: If file_extension is not one of the supported extensions.
    """
    ds = _load_dataset(file_bytes, file_extension)

    if parse_method == "ocr":
        use_ocr = True
    elif parse_method == "txt":
        use_ocr = False
    else:
        # auto: let the dataset decide which mode fits the document.
        use_ocr = ds.classify() == SupportedPdfParseMethod.OCR

    infer_result = ds.apply(doc_analyze, ocr=use_ocr)
    if use_ocr:
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    else:
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    return infer_result, pipe_result
def encode_image(image_path: str) -> str:
    """Return the content of the file at ``image_path`` as base64 text."""
    with open(image_path, "rb") as image_file:
        raw = image_file.read()
    encoded = b64encode(raw)
    return encoded.decode()
@app.post(
    "/file_parse",
    tags=["projects"],
    summary="Parse files (supports local files and S3)",
)
async def file_parse(
    file: UploadFile = None,
    file_path: str = Form(None),
    parse_method: str = Form("auto"),
    is_json_md_dump: bool = Form(False),
    output_dir: str = Form("output"),
    return_layout: bool = Form(False),
    return_info: bool = Form(False),
    return_content_list: bool = Form(False),
    return_images: bool = Form(False),
):
    """
    Execute the process of converting PDF to JSON and MD, outputting MD and JSON files
    to the specified directory.

    Args:
        file: The PDF file to be parsed. Must not be specified together with
            `file_path`
        file_path: The path to the PDF file to be parsed. Must not be specified together
            with `file`
        parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If
            results are not satisfactory, try ocr
        is_json_md_dump: Whether to write parsed data to .json and .md files. Default
            to False. Different stages of data will be written to different .json files
            (3 in total), md content will be saved to .md file
        output_dir: Output directory for results. A folder named after the PDF file
            will be created to store all results
        return_layout: Whether to return parsed PDF layout. Default to False
        return_info: Whether to return parsed PDF info. Default to False
        return_content_list: Whether to return parsed PDF content list. Default to False
        return_images: Whether to return the extracted .jpg images as base64 data
            URLs keyed by file name. Default to False

    Returns:
        JSONResponse with status 200 and the requested data (`md_content` is always
        included), status 400 when `file`/`file_path` are not supplied exactly one at
        a time, or status 500 with the error text when processing fails.
    """
    try:
        if (file is None and file_path is None) or (
            file is not None and file_path is not None
        ):
            return JSONResponse(
                content={"error": "Must provide either file or file_path"},
                status_code=400,
            )

        # NOTE(review): file_path and output_dir come straight from the request
        # and are used for filesystem/S3 access without validation -- restrict
        # them before exposing this endpoint to untrusted clients.

        # Get the file name without its final extension. splitext keeps inner
        # dots, so "report.v2.pdf" maps to "report.v2" and does not collide
        # with "report.v1.pdf".
        source_name = os.path.basename(file_path if file_path else file.filename)
        file_name = os.path.splitext(source_name)[0]
        output_path = f"{output_dir}/{file_name}"
        output_image_path = f"{output_path}/images"

        # Initialize readers/writers and get file content
        writer, image_writer, file_bytes, file_extension = init_writers(
            file_path=file_path,
            file=file,
            output_path=output_path,
            output_image_path=output_image_path,
        )

        # Process the file
        infer_result, pipe_result = process_file(
            file_bytes, file_extension, parse_method, image_writer
        )

        # Use MemoryDataWriter to capture the dumps; the buffers are closed even
        # when one of the dump calls raises.
        content_list_writer = MemoryDataWriter()
        md_content_writer = MemoryDataWriter()
        middle_json_writer = MemoryDataWriter()
        try:
            pipe_result.dump_content_list(content_list_writer, "", "images")
            pipe_result.dump_md(md_content_writer, "", "images")
            pipe_result.dump_middle_json(middle_json_writer, "")

            content_list_text = content_list_writer.get_value()
            md_content = md_content_writer.get_value()
            middle_json_text = middle_json_writer.get_value()
        finally:
            content_list_writer.close()
            md_content_writer.close()
            middle_json_writer.close()

        # Get content
        content_list = json.loads(content_list_text)
        middle_json = json.loads(middle_json_text)
        model_json = infer_result.get_infer_res()

        # If results need to be saved
        if is_json_md_dump:
            writer.write_string(f"{file_name}_content_list.json", content_list_text)
            writer.write_string(f"{file_name}.md", md_content)
            writer.write_string(f"{file_name}_middle.json", middle_json_text)
            writer.write_string(
                f"{file_name}_model.json",
                json.dumps(model_json, indent=4, ensure_ascii=False),
            )
            # Save visualization results
            pipe_result.draw_layout(os.path.join(output_path, f"{file_name}_layout.pdf"))
            pipe_result.draw_span(os.path.join(output_path, f"{file_name}_spans.pdf"))
            pipe_result.draw_line_sort(
                os.path.join(output_path, f"{file_name}_line_sort.pdf")
            )
            infer_result.draw_model(os.path.join(output_path, f"{file_name}_model.pdf"))

        # Build return data
        data = {}
        if return_layout:
            data["layout"] = model_json
        if return_info:
            data["info"] = middle_json
        if return_content_list:
            data["content_list"] = content_list
        if return_images:
            image_paths = glob(f"{output_image_path}/*.jpg")
            data["images"] = {
                os.path.basename(
                    image_path
                ): f"data:image/jpeg;base64,{encode_image(image_path)}"
                for image_path in image_paths
            }
        data["md_content"] = md_content  # md_content is always returned

        return JSONResponse(data, status_code=200)

    except Exception as e:
        logger.exception(e)
        return JSONResponse(content={"error": str(e)}, status_code=500)
if __name__ == "__main__":
    # Direct launch: listen on every interface, port 8888.
    bind_host, bind_port = "0.0.0.0", 8888
    uvicorn.run(app, host=bind_host, port=bind_port)
projects/web_api/download_models.py
deleted
100755 → 0
View file @
ea9336c0
#!/usr/bin/env python
from
huggingface_hub
import
snapshot_download
if __name__ == "__main__":
    # Patterns selecting which parts of the PDF-Extract-Kit repository are
    # fetched; the commented entries are currently not downloaded.
    mineru_patterns = [
        # "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_hf_small_2503/*",
        "models/OCR/paddleocr_torch/*",
        # "models/TabRec/TableMaster/*",
        # "models/TabRec/StructEqTable/*",
    ]
    layoutreader_pattern = [
        "*.json",
        "*.safetensors",
    ]

    model_dir = snapshot_download(
        "opendatalab/PDF-Extract-Kit-1.0",
        allow_patterns=mineru_patterns,
        local_dir="/opt/",
    )
    layoutreader_model_dir = snapshot_download(
        "hantian/layoutreader",
        allow_patterns=layoutreader_pattern,
        local_dir="/opt/layoutreader/",
    )

    # Report the "models" subfolder of the first download.
    model_dir += "/models"
    print(f"model_dir is: {model_dir}")
    print(f"layoutreader_model_dir is: {layoutreader_model_dir}")
projects/web_api/entrypoint.sh
deleted
100755 → 0
View file @
ea9336c0
#!/usr/bin/env bash
set
-euo
pipefail
.
/app/venv/bin/activate
exec
uvicorn app:app
"
$@
"
projects/web_api/magic-pdf.json
deleted
100644 → 0
View file @
ea9336c0
{
"bucket_info"
:{
"bucket-name-1"
:[
"ak"
,
"sk"
,
"endpoint"
],
"bucket-name-2"
:[
"ak"
,
"sk"
,
"endpoint"
]
},
"models-dir"
:
"/opt/models"
,
"layoutreader-model-dir"
:
"/opt/layoutreader"
,
"device-mode"
:
"cuda"
,
"layout-config"
:
{
"model"
:
"doclayout_yolo"
},
"formula-config"
:
{
"mfd_model"
:
"yolo_v8_mfd"
,
"mfr_model"
:
"unimernet_small"
,
"enable"
:
true
},
"table-config"
:
{
"model"
:
"rapid_table"
,
"sub_model"
:
"slanet_plus"
,
"enable"
:
true
,
"max_time"
:
400
},
"llm-aided-config"
:
{
"formula_aided"
:
{
"api_key"
:
"your_api_key"
,
"base_url"
:
"https://dashscope.aliyuncs.com/compatible-mode/v1"
,
"model"
:
"qwen2.5-7b-instruct"
,
"enable"
:
false
},
"text_aided"
:
{
"api_key"
:
"your_api_key"
,
"base_url"
:
"https://dashscope.aliyuncs.com/compatible-mode/v1"
,
"model"
:
"qwen2.5-7b-instruct"
,
"enable"
:
false
},
"title_aided"
:
{
"api_key"
:
"your_api_key"
,
"base_url"
:
"https://dashscope.aliyuncs.com/compatible-mode/v1"
,
"model"
:
"qwen2.5-32b-instruct"
,
"enable"
:
false
}
},
"config_version"
:
"1.2.0"
}
projects/web_api/requirements.txt
deleted
100644 → 0
View file @
ea9336c0
magic-pdf[full]
fastapi
uvicorn
python-multipart
pyproject.toml
View file @
919280aa
...
@@ -43,7 +43,7 @@ vlm = [
...
@@ -43,7 +43,7 @@ vlm = [
"pydantic"
,
"pydantic"
,
]
]
sglang
=
[
sglang
=
[
"sglang[all]
==
0.4.
7
"
,
"sglang[all]
>=0.4.8,<
0.4.
9
"
,
]
]
pipeline
=
[
pipeline
=
[
"matplotlib>=3.10,<4"
,
"matplotlib>=3.10,<4"
,
...
@@ -62,9 +62,20 @@ pipeline = [
...
@@ -62,9 +62,20 @@ pipeline = [
"transformers>=4.49.0,!=4.51.0,<5.0.0"
,
"transformers>=4.49.0,!=4.51.0,<5.0.0"
,
"fast-langdetect>=0.2.3,<0.3.0"
,
"fast-langdetect>=0.2.3,<0.3.0"
,
]
]
api
=
[
"fastapi"
,
"python-multipart"
,
"uvicorn"
,
]
gradio
=
[
"gradio>=5.34,<6"
,
"gradio-pdf>=0.0.22"
,
]
core
=
[
core
=
[
"mineru[vlm]"
,
"mineru[vlm]"
,
"mineru[pipeline]"
,
"mineru[pipeline]"
,
"mineru[api]"
,
"mineru[gradio]"
,
]
]
all
=
[
all
=
[
"mineru[core]"
,
"mineru[core]"
,
...
@@ -97,6 +108,8 @@ Repository = "https://github.com/opendatalab/MinerU"
...
@@ -97,6 +108,8 @@ Repository = "https://github.com/opendatalab/MinerU"
mineru
=
"mineru.cli:client.main"
mineru
=
"mineru.cli:client.main"
mineru-sglang-server
=
"mineru.cli.vlm_sglang_server:main"
mineru-sglang-server
=
"mineru.cli.vlm_sglang_server:main"
mineru-models-download
=
"mineru.cli.models_download:download_models"
mineru-models-download
=
"mineru.cli.models_download:download_models"
mineru-api
=
"mineru.cli.fast_api:main"
mineru-gradio
=
"mineru.cli.gradio_app:main"
[tool.setuptools.dynamic]
[tool.setuptools.dynamic]
version
=
{
attr
=
"mineru.version.__version__"
}
version
=
{
attr
=
"mineru.version.__version__"
}
...
...
signatures/version1/cla.json
View file @
919280aa
...
@@ -335,6 +335,54 @@
...
@@ -335,6 +335,54 @@
"created_at"
:
"2025-06-18T06:34:06Z"
,
"created_at"
:
"2025-06-18T06:34:06Z"
,
"repoId"
:
765083837
,
"repoId"
:
765083837
,
"pullRequestNo"
:
2719
"pullRequestNo"
:
2719
},
{
"name"
:
"yuanjua"
,
"id"
:
80858000
,
"comment_id"
:
2983805144
,
"created_at"
:
"2025-06-18T11:27:23Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
2727
},
{
"name"
:
"QIN2DIM"
,
"id"
:
62018067
,
"comment_id"
:
2992279796
,
"created_at"
:
"2025-06-20T17:04:59Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
2758
},
{
"name"
:
"herryqg"
,
"id"
:
107988674
,
"comment_id"
:
2995155194
,
"created_at"
:
"2025-06-23T06:49:59Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
2763
},
{
"name"
:
"zhanluxianshen"
,
"id"
:
161462588
,
"comment_id"
:
3002955644
,
"created_at"
:
"2025-06-25T03:59:03Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
2787
},
{
"name"
:
"ZhiweiXu-102307"
,
"id"
:
192890785
,
"comment_id"
:
3015529289
,
"created_at"
:
"2025-06-28T15:37:58Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
2826
},
{
"name"
:
"hzwzwzw"
,
"id"
:
20764045
,
"comment_id"
:
3017877153
,
"created_at"
:
"2025-06-30T05:44:13Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
2831
}
}
]
]
}
}
\ No newline at end of file
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment