Unverified Commit 919280aa authored by Xiaomeng Zhao, committed by GitHub

Merge branch 'dev' into multi_gpu_v2

parents ea9336c0 c6881d83
magic-pdf[full]>=0.8.0
gradio
gradio-pdf
## Project Overview
This project provides a multi-GPU parallel processing solution based on LitServe. LitServe is a simple and flexible serving engine for AI models, built on FastAPI. It adds features such as batching, streaming, and GPU autoscaling on top of FastAPI, so there is no need to rebuild a FastAPI server for every model.
## Environment Setup
Set up the required environment with the following command:
```bash
pip install -U magic-pdf[full] litserve python-multipart filetype
```
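Since the server below uses `devices='auto'`, every GPU visible to torch will get its own worker. A quick way to confirm what will be picked up, assuming torch is already installed as a magic-pdf dependency:
```python
import torch

# Each GPU listed here will run `workers_per_device` server instances.
print(f'CUDA available: {torch.cuda.is_available()}')
print(f'Visible GPUs: {torch.cuda.device_count()}')
```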
## Quick Start
### 1. Start the server
The following example shows how to start the server; the settings can be customized:
```python
server = ls.LitServer(
    MinerUAPI(output_dir='/tmp'),  # customize the output directory
    accelerator='cuda',            # enable GPU acceleration
    devices='auto',                # 'auto' uses all available GPUs
    workers_per_device=1,          # one server instance per GPU
    timeout=False                  # False disables request timeouts
)
server.run(port=8000)              # serve on port 8000
```
Start the server with:
```bash
python server.py
```
### 2. Start the client
The following code shows how to use the client; adjust the configuration to your needs:
```python
files = ['demo/small_ocr.pdf']      # replace with your own paths; pdf, jpg/jpeg, png, doc, docx, ppt, and pptx files are supported
n_jobs = np.clip(len(files), 1, 8)  # number of concurrent threads, capped at 8 here; adjust as needed
results = Parallel(n_jobs, prefer='threads', verbose=10)(
    delayed(do_parse)(p) for p in files
)
print(results)
```
Start the client with:
```bash
python client.py
```
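`do_parse` forwards any extra keyword arguments to the server, where `decode_request` merges them over its defaults (`parse_method='auto'`, `debug_able=False`). For example, to force OCR parsing for every file (a small variation on the loop above):
```python
# parse_method travels in the request's 'kwargs' field and overrides
# the 'auto' default set in MinerUAPI.decode_request.
results = Parallel(n_jobs, prefer='threads', verbose=10)(
    delayed(do_parse)(p, parse_method='ocr') for p in files
)
```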
And that's it: your files will be processed in parallel across multiple GPUs! 🍻🍻🍻
import base64
import requests
import numpy as np
from loguru import logger
from joblib import Parallel, delayed


def to_b64(file_path):
    """Read a file and return its base64-encoded contents."""
    try:
        with open(file_path, 'rb') as f:
            return base64.b64encode(f.read()).decode('utf-8')
    except Exception as e:
        raise Exception(f'File: {file_path} - Info: {e}')


def do_parse(file_path, url='http://127.0.0.1:8000/predict', **kwargs):
    """Send one file to the server; extra kwargs are forwarded as parse options."""
    try:
        response = requests.post(url, json={
            'file': to_b64(file_path),
            'kwargs': kwargs
        })
        if response.status_code == 200:
            output = response.json()
            output['file_path'] = file_path
            return output
        else:
            raise Exception(response.text)
    except Exception as e:
        logger.error(f'File: {file_path} - Info: {e}')


if __name__ == '__main__':
    files = ['demo/small_ocr.pdf']
    n_jobs = np.clip(len(files), 1, 8)
    results = Parallel(n_jobs, prefer='threads', verbose=10)(
        delayed(do_parse)(p) for p in files
    )
    print(results)
import os
import uuid
import shutil
import tempfile
import gc
import fitz
import torch
import base64
import filetype
import litserve as ls
from pathlib import Path
from fastapi import HTTPException


class MinerUAPI(ls.LitAPI):
    def __init__(self, output_dir='/tmp'):
        self.output_dir = Path(output_dir)

    def setup(self, device):
        # Pin this worker to a single GPU before any CUDA context exists,
        # so each worker loads its models onto its own device.
        if device.startswith('cuda'):
            os.environ['CUDA_VISIBLE_DEVICES'] = device.split(':')[-1]
            if torch.cuda.device_count() > 1:
                raise RuntimeError("Remove any CUDA actions before setting 'CUDA_VISIBLE_DEVICES'.")

        from magic_pdf.tools.cli import do_parse, convert_file_to_pdf
        from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton

        self.do_parse = do_parse
        self.convert_file_to_pdf = convert_file_to_pdf

        # Warm up both the OCR and the text model pipelines once per worker.
        model_manager = ModelSingleton()
        model_manager.get_model(True, False)
        model_manager.get_model(False, False)
        print(f'Model initialization complete on {device}!')

    def decode_request(self, request):
        file = self.cvt2pdf(request['file'])
        opts = request.get('kwargs', {})
        opts.setdefault('debug_able', False)
        opts.setdefault('parse_method', 'auto')
        return file, opts

    def predict(self, inputs):
        output_dir = None
        try:
            pdf_name = str(uuid.uuid4())
            output_dir = self.output_dir.joinpath(pdf_name)
            self.do_parse(self.output_dir, pdf_name, inputs[0], [], **inputs[1])
            return output_dir
        except Exception as e:
            # Guard against output_dir being unset if the failure happened early.
            if output_dir is not None:
                shutil.rmtree(output_dir, ignore_errors=True)
            raise HTTPException(status_code=500, detail=str(e))
        finally:
            self.clean_memory()

    def encode_response(self, response):
        # Cast the Path to str so the response serializes cleanly to JSON.
        return {'output_dir': str(response)}

    def clean_memory(self):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
        gc.collect()

    def cvt2pdf(self, file_base64):
        # Normalize any supported upload to PDF bytes: PDFs pass through,
        # images are converted with PyMuPDF, office files with LibreOffice.
        try:
            temp_dir = Path(tempfile.mkdtemp())
            temp_file = temp_dir.joinpath('tmpfile')
            file_bytes = base64.b64decode(file_base64)
            file_ext = filetype.guess_extension(file_bytes)

            if file_ext in ['pdf', 'jpg', 'png', 'doc', 'docx', 'ppt', 'pptx']:
                if file_ext == 'pdf':
                    return file_bytes
                elif file_ext in ['jpg', 'png']:
                    with fitz.open(stream=file_bytes, filetype=file_ext) as f:
                        return f.convert_to_pdf()
                else:
                    temp_file.write_bytes(file_bytes)
                    self.convert_file_to_pdf(temp_file, temp_dir)
                    return temp_file.with_suffix('.pdf').read_bytes()
            else:
                raise Exception('Unsupported file format')
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)


if __name__ == '__main__':
    server = ls.LitServer(
        MinerUAPI(output_dir='/tmp'),
        accelerator='cuda',
        devices='auto',
        workers_per_device=1,
        timeout=False
    )
    server.run(port=8000)
FROM python:3.10-slim-bookworm AS base
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive \
    LANG=C.UTF-8 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1
FROM base AS build
# Update the package list and install necessary packages
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Build Python dependencies
COPY requirements.txt .
RUN python -m venv /app/venv && \
    . /app/venv/bin/activate && \
    pip install -r requirements.txt
# pip uninstall -y paddlepaddle && \
# pip install -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ \
#     paddlepaddle-gpu==3.0.0rc1
# Download models
COPY download_models.py .
RUN . /app/venv/bin/activate && \
    ./download_models.py
FROM base AS prod
# Copy Python dependencies and models from the build stage
COPY --from=build /app/venv /app/venv
COPY --from=build /opt/models /opt/models
COPY --from=build /opt/layoutreader /opt/layoutreader
# Update the package list and install necessary packages
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libgl1 \
        libglib2.0-0 \
        libgomp1 && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Create volume for paddleocr models
# RUN mkdir -p /root/.paddleocr
# VOLUME [ "/root/.paddleocr" ]
# Copy the app and its configuration file
COPY entrypoint.sh /app/entrypoint.sh
COPY magic-pdf.json /root/magic-pdf.json
COPY app.py /app/app.py
# Expose the port that FastAPI will run on
EXPOSE 8000
# Command to run FastAPI using Uvicorn, pointing to app.py and binding to 0.0.0.0:8000
ENTRYPOINT [ "/app/entrypoint.sh" ]
CMD ["--host", "0.0.0.0", "--port", "8000"]
# MinerU-based PDF Parsing API
- GPU image build for MinerU
- FastAPI-based PDF parsing endpoint
## Build
```
docker build -t mineru-api .
```
Or build through a proxy:
```
docker build --build-arg http_proxy=http://127.0.0.1:7890 --build-arg https_proxy=http://127.0.0.1:7890 -t mineru-api .
```
## Startup
```
docker run --rm -it --gpus=all -p 8000:8000 mineru-api
```
## Testing
Open the interactive API docs at either address:
```
http://localhost:8000/docs
http://127.0.0.1:8000/docs
```
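Besides the interactive docs, the endpoint can be called directly. Below is a minimal sketch using Python's `requests` library; the field names follow the `/file_parse` handler in app.py, and `demo/small_ocr.pdf` is a placeholder path:
```python
import requests

# Upload a local file and ask for the parsed content list in the response.
with open('demo/small_ocr.pdf', 'rb') as f:
    response = requests.post(
        'http://127.0.0.1:8000/file_parse',
        files={'file': f},
        data={'parse_method': 'auto', 'return_content_list': 'true'},
    )

result = response.json()
print(result['md_content'])  # the markdown content is always returned
```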
import json
import os
import tempfile
from base64 import b64encode
from glob import glob
from io import StringIO
from typing import Tuple, Union

import uvicorn
from fastapi import FastAPI, Form, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import DataWriter, FileBasedDataWriter
from magic_pdf.data.data_reader_writer.s3 import S3DataReader, S3DataWriter
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
from magic_pdf.data.read_api import read_local_images, read_local_office
from magic_pdf.libs.config_reader import get_bucket_name, get_s3_config
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.operators.models import InferenceResult
from magic_pdf.operators.pipes import PipeResult

model_config.__use_inside_model__ = True

app = FastAPI()
pdf_extensions = [".pdf"]
office_extensions = [".ppt", ".pptx", ".doc", ".docx"]
image_extensions = [".png", ".jpg", ".jpeg"]


class MemoryDataWriter(DataWriter):
    """DataWriter that collects output in an in-memory string buffer."""

    def __init__(self):
        self.buffer = StringIO()

    def write(self, path: str, data: bytes) -> None:
        if isinstance(data, str):
            self.buffer.write(data)
        else:
            self.buffer.write(data.decode("utf-8"))

    def write_string(self, path: str, data: str) -> None:
        self.buffer.write(data)

    def get_value(self) -> str:
        return self.buffer.getvalue()

    def close(self):
        self.buffer.close()
def init_writers(
    file_path: str = None,
    file: UploadFile = None,
    output_path: str = None,
    output_image_path: str = None,
) -> Tuple[
    Union[S3DataWriter, FileBasedDataWriter],
    Union[S3DataWriter, FileBasedDataWriter],
    bytes,
    str,
]:
    """
    Initialize writers based on path type.

    Args:
        file_path: File path (local path or S3 path)
        file: Uploaded file object
        output_path: Output directory path
        output_image_path: Image output directory path

    Returns:
        Tuple[writer, image_writer, file_bytes, file_extension]: The initialized
        writers, the file content, and the file extension
    """
    file_extension: str = None
    if file_path:
        is_s3_path = file_path.startswith("s3://")
        if is_s3_path:
            bucket = get_bucket_name(file_path)
            ak, sk, endpoint = get_s3_config(bucket)

            writer = S3DataWriter(
                output_path, bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
            )
            image_writer = S3DataWriter(
                output_image_path, bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
            )
            # Create a temporary reader to fetch the file content
            temp_reader = S3DataReader(
                "", bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
            )
            file_bytes = temp_reader.read(file_path)
            file_extension = os.path.splitext(file_path)[1]
        else:
            writer = FileBasedDataWriter(output_path)
            image_writer = FileBasedDataWriter(output_image_path)
            os.makedirs(output_image_path, exist_ok=True)
            with open(file_path, "rb") as f:
                file_bytes = f.read()
            file_extension = os.path.splitext(file_path)[1]
    else:
        # Handle an uploaded file
        file_bytes = file.file.read()
        file_extension = os.path.splitext(file.filename)[1]
        writer = FileBasedDataWriter(output_path)
        image_writer = FileBasedDataWriter(output_image_path)
        os.makedirs(output_image_path, exist_ok=True)

    return writer, image_writer, file_bytes, file_extension
def process_file(
    file_bytes: bytes,
    file_extension: str,
    parse_method: str,
    image_writer: Union[S3DataWriter, FileBasedDataWriter],
) -> Tuple[InferenceResult, PipeResult]:
    """
    Process file content.

    Args:
        file_bytes: Binary content of the file
        file_extension: File extension (including the leading dot)
        parse_method: Parse method ('ocr', 'txt', 'auto')
        image_writer: Image writer

    Returns:
        Tuple[InferenceResult, PipeResult]: The inference result and pipeline result
    """
    ds: Union[PymuDocDataset, ImageDataset] = None
    if file_extension in pdf_extensions:
        ds = PymuDocDataset(file_bytes)
    elif file_extension in office_extensions:
        # Office documents must be converted before parsing;
        # file_extension already includes the leading dot
        temp_dir = tempfile.mkdtemp()
        with open(os.path.join(temp_dir, f"temp_file{file_extension}"), "wb") as f:
            f.write(file_bytes)
        ds = read_local_office(temp_dir)[0]
    elif file_extension in image_extensions:
        # Images are parsed via OCR
        temp_dir = tempfile.mkdtemp()
        with open(os.path.join(temp_dir, f"temp_file{file_extension}"), "wb") as f:
            f.write(file_bytes)
        ds = read_local_images(temp_dir)[0]

    infer_result: InferenceResult = None
    pipe_result: PipeResult = None

    if parse_method == "ocr":
        infer_result = ds.apply(doc_analyze, ocr=True)
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    elif parse_method == "txt":
        infer_result = ds.apply(doc_analyze, ocr=False)
        pipe_result = infer_result.pipe_txt_mode(image_writer)
    else:  # auto
        if ds.classify() == SupportedPdfParseMethod.OCR:
            infer_result = ds.apply(doc_analyze, ocr=True)
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
        else:
            infer_result = ds.apply(doc_analyze, ocr=False)
            pipe_result = infer_result.pipe_txt_mode(image_writer)

    return infer_result, pipe_result
def encode_image(image_path: str) -> str:
    """Encode an image file as base64."""
    with open(image_path, "rb") as f:
        return b64encode(f.read()).decode()
@app.post(
    "/file_parse",
    tags=["projects"],
    summary="Parse files (supports local files and S3)",
)
async def file_parse(
    file: UploadFile = None,
    file_path: str = Form(None),
    parse_method: str = Form("auto"),
    is_json_md_dump: bool = Form(False),
    output_dir: str = Form("output"),
    return_layout: bool = Form(False),
    return_info: bool = Form(False),
    return_content_list: bool = Form(False),
    return_images: bool = Form(False),
):
    """
    Execute the process of converting PDF to JSON and MD, outputting MD and JSON files
    to the specified directory.

    Args:
        file: The PDF file to be parsed. Must not be specified together with
            `file_path`
        file_path: The path to the PDF file to be parsed. Must not be specified
            together with `file`
        parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If
            results are not satisfactory, try ocr
        is_json_md_dump: Whether to write parsed data to .json and .md files. Default
            to False. Different stages of data will be written to different .json files
            (3 in total), md content will be saved to .md file
        output_dir: Output directory for results. A folder named after the PDF file
            will be created to store all results
        return_layout: Whether to return parsed PDF layout. Default to False
        return_info: Whether to return parsed PDF info. Default to False
        return_content_list: Whether to return parsed PDF content list. Default to False
        return_images: Whether to return images as base64-encoded data URIs. Default
            to False
    """
    try:
        if (file is None and file_path is None) or (
            file is not None and file_path is not None
        ):
            return JSONResponse(
                content={"error": "Must provide either file or file_path"},
                status_code=400,
            )

        # Get the file name without extension
        file_name = os.path.basename(file_path if file_path else file.filename).split(
            "."
        )[0]
        output_path = f"{output_dir}/{file_name}"
        output_image_path = f"{output_path}/images"

        # Initialize readers/writers and get the file content
        writer, image_writer, file_bytes, file_extension = init_writers(
            file_path=file_path,
            file=file,
            output_path=output_path,
            output_image_path=output_image_path,
        )

        # Process the file
        infer_result, pipe_result = process_file(
            file_bytes, file_extension, parse_method, image_writer
        )

        # Use MemoryDataWriter to collect results in memory
        content_list_writer = MemoryDataWriter()
        md_content_writer = MemoryDataWriter()
        middle_json_writer = MemoryDataWriter()

        # Use PipeResult's dump methods to get the data
        pipe_result.dump_content_list(content_list_writer, "", "images")
        pipe_result.dump_md(md_content_writer, "", "images")
        pipe_result.dump_middle_json(middle_json_writer, "")

        # Get the content
        content_list = json.loads(content_list_writer.get_value())
        md_content = md_content_writer.get_value()
        middle_json = json.loads(middle_json_writer.get_value())
        model_json = infer_result.get_infer_res()

        # Persist the results if requested
        if is_json_md_dump:
            writer.write_string(
                f"{file_name}_content_list.json", content_list_writer.get_value()
            )
            writer.write_string(f"{file_name}.md", md_content)
            writer.write_string(
                f"{file_name}_middle.json", middle_json_writer.get_value()
            )
            writer.write_string(
                f"{file_name}_model.json",
                json.dumps(model_json, indent=4, ensure_ascii=False),
            )
            # Save visualization results
            pipe_result.draw_layout(os.path.join(output_path, f"{file_name}_layout.pdf"))
            pipe_result.draw_span(os.path.join(output_path, f"{file_name}_spans.pdf"))
            pipe_result.draw_line_sort(
                os.path.join(output_path, f"{file_name}_line_sort.pdf")
            )
            infer_result.draw_model(os.path.join(output_path, f"{file_name}_model.pdf"))

        # Build the response payload
        data = {}
        if return_layout:
            data["layout"] = model_json
        if return_info:
            data["info"] = middle_json
        if return_content_list:
            data["content_list"] = content_list
        if return_images:
            image_paths = glob(f"{output_image_path}/*.jpg")
            data["images"] = {
                os.path.basename(
                    image_path
                ): f"data:image/jpeg;base64,{encode_image(image_path)}"
                for image_path in image_paths
            }
        data["md_content"] = md_content  # md_content is always returned

        # Clean up the in-memory writers
        content_list_writer.close()
        md_content_writer.close()
        middle_json_writer.close()

        return JSONResponse(data, status_code=200)

    except Exception as e:
        logger.exception(e)
        return JSONResponse(content={"error": str(e)}, status_code=500)


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8888)
#!/usr/bin/env python
from huggingface_hub import snapshot_download

if __name__ == "__main__":
    mineru_patterns = [
        # "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_hf_small_2503/*",
        "models/OCR/paddleocr_torch/*",
        # "models/TabRec/TableMaster/*",
        # "models/TabRec/StructEqTable/*",
    ]
    # Download the PDF-Extract-Kit models into /opt/models
    model_dir = snapshot_download(
        "opendatalab/PDF-Extract-Kit-1.0",
        allow_patterns=mineru_patterns,
        local_dir="/opt/",
    )

    layoutreader_pattern = [
        "*.json",
        "*.safetensors",
    ]
    layoutreader_model_dir = snapshot_download(
        "hantian/layoutreader",
        allow_patterns=layoutreader_pattern,
        local_dir="/opt/layoutreader/",
    )

    model_dir = model_dir + "/models"
    print(f"model_dir is: {model_dir}")
    print(f"layoutreader_model_dir is: {layoutreader_model_dir}")
#!/usr/bin/env bash
set -euo pipefail
. /app/venv/bin/activate
exec uvicorn app:app "$@"
{
    "bucket_info": {
        "bucket-name-1": ["ak", "sk", "endpoint"],
        "bucket-name-2": ["ak", "sk", "endpoint"]
    },
    "models-dir": "/opt/models",
    "layoutreader-model-dir": "/opt/layoutreader",
    "device-mode": "cuda",
    "layout-config": {
        "model": "doclayout_yolo"
    },
    "formula-config": {
        "mfd_model": "yolo_v8_mfd",
        "mfr_model": "unimernet_small",
        "enable": true
    },
    "table-config": {
        "model": "rapid_table",
        "sub_model": "slanet_plus",
        "enable": true,
        "max_time": 400
    },
    "llm-aided-config": {
        "formula_aided": {
            "api_key": "your_api_key",
            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
            "model": "qwen2.5-7b-instruct",
            "enable": false
        },
        "text_aided": {
            "api_key": "your_api_key",
            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
            "model": "qwen2.5-7b-instruct",
            "enable": false
        },
        "title_aided": {
            "api_key": "your_api_key",
            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
            "model": "qwen2.5-32b-instruct",
            "enable": false
        }
    },
    "config_version": "1.2.0"
}
magic-pdf[full]
fastapi
uvicorn
python-multipart
@@ -43,7 +43,7 @@ vlm = [
     "pydantic",
 ]
 sglang = [
-    "sglang[all]==0.4.7",
+    "sglang[all]>=0.4.8,<0.4.9",
 ]
 pipeline = [
     "matplotlib>=3.10,<4",
@@ -62,9 +62,20 @@ pipeline = [
     "transformers>=4.49.0,!=4.51.0,<5.0.0",
     "fast-langdetect>=0.2.3,<0.3.0",
 ]
+api = [
+    "fastapi",
+    "python-multipart",
+    "uvicorn",
+]
+gradio = [
+    "gradio>=5.34,<6",
+    "gradio-pdf>=0.0.22",
+]
 core = [
     "mineru[vlm]",
     "mineru[pipeline]",
+    "mineru[api]",
+    "mineru[gradio]",
 ]
 all = [
     "mineru[core]",
@@ -97,6 +108,8 @@ Repository = "https://github.com/opendatalab/MinerU"
 mineru = "mineru.cli:client.main"
 mineru-sglang-server = "mineru.cli.vlm_sglang_server:main"
 mineru-models-download = "mineru.cli.models_download:download_models"
+mineru-api = "mineru.cli.fast_api:main"
+mineru-gradio = "mineru.cli.gradio_app:main"

 [tool.setuptools.dynamic]
 version = {attr = "mineru.version.__version__"}
...
@@ -335,6 +335,54 @@
       "created_at": "2025-06-18T06:34:06Z",
       "repoId": 765083837,
       "pullRequestNo": 2719
+    },
+    {
+      "name": "yuanjua",
+      "id": 80858000,
+      "comment_id": 2983805144,
+      "created_at": "2025-06-18T11:27:23Z",
+      "repoId": 765083837,
+      "pullRequestNo": 2727
+    },
+    {
+      "name": "QIN2DIM",
+      "id": 62018067,
+      "comment_id": 2992279796,
+      "created_at": "2025-06-20T17:04:59Z",
+      "repoId": 765083837,
+      "pullRequestNo": 2758
+    },
+    {
+      "name": "herryqg",
+      "id": 107988674,
+      "comment_id": 2995155194,
+      "created_at": "2025-06-23T06:49:59Z",
+      "repoId": 765083837,
+      "pullRequestNo": 2763
+    },
+    {
+      "name": "zhanluxianshen",
+      "id": 161462588,
+      "comment_id": 3002955644,
+      "created_at": "2025-06-25T03:59:03Z",
+      "repoId": 765083837,
+      "pullRequestNo": 2787
+    },
+    {
+      "name": "ZhiweiXu-102307",
+      "id": 192890785,
+      "comment_id": 3015529289,
+      "created_at": "2025-06-28T15:37:58Z",
+      "repoId": 765083837,
+      "pullRequestNo": 2826
+    },
+    {
+      "name": "hzwzwzw",
+      "id": 20764045,
+      "comment_id": 3017877153,
+      "created_at": "2025-06-30T05:44:13Z",
+      "repoId": 765083837,
+      "pullRequestNo": 2831
     }
   ]
 }
\ No newline at end of file