Unverified Commit a1c0d535 authored by quincyqiang's avatar quincyqiang Committed by GitHub
Browse files

features@add mineru gpu&web_api (#568)

* features@add mineru gpu&web_api

* features@update api
parent f07c2673
# GPU-enabled MinerU (magic-pdf) image that serves the FastAPI app in app.py through Uvicorn.
# NOTE(review): `latest` is a moving tag, so rebuilds are not reproducible. Pin the Ubuntu
# release this image was validated on (left unchanged here because that release is not
# recorded in this file).
FROM ubuntu:latest
# ENV http_proxy http://127.0.0.1:7890
# ENV https_proxy http://127.0.0.1:7890
# Suppress interactive apt prompts during the build only; ARG (unlike ENV) keeps the
# setting out of the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive
ENV LANG=C.UTF-8
# ADD sources.list /etc/apt
# RUN apt-get clean
# Build toolchain. The apt lists are removed in the same layer that downloads them so they
# do not persist in the image.
RUN apt-get -q update \
    && apt-get -q install -y --no-install-recommends \
        apt-utils \
        bats \
        build-essential \
    && rm -rf /var/lib/apt/lists/*
# Debugging / networking utilities.
RUN apt-get update \
    && apt-get install -y \
        vim \
        net-tools \
        procps \
        lsof \
        curl \
        wget \
        iputils-ping \
        telnet \
        lrzsz \
        git \
    && rm -rf /var/lib/apt/lists/*
# Python 3.10 from the deadsnakes PPA plus the shared libraries installed alongside it
# (libgl1, libglib2.0-0).
RUN apt-get update \
    && apt-get install -y \
        software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y \
        python3.10 \
        python3.10-venv \
        python3.10-distutils \
        python3-pip \
        wget \
        git \
        libgl1 \
        libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*
# RUN unset http_proxy && unset https_proxy
# Set Python 3.10 as the default python3
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
# Create a virtual environment for MinerU
RUN python3 -m venv /opt/mineru_venv
# Use the Tsinghua PyPI mirror as the default package index
RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
# Activate the virtual environment and install the Python packages.
# 'magic-pdf[full]' is quoted so the shell never treats the brackets as a glob pattern.
RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
    pip install --upgrade pip && \
    pip install 'magic-pdf[full]' --extra-index-url https://myhloli.github.io/wheels/ --no-cache-dir"
RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
    pip install fastapi uvicorn python-multipart --no-cache-dir"
# Swap the paddlepaddle package for the CUDA 11.8 GPU build
RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
    pip uninstall paddlepaddle -y"
RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
    python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ --no-cache-dir"
# Copy the configuration template, model weights, PaddleOCR cache and the API application.
# COPY is used instead of ADD: these are plain local files/directories and need none of
# ADD's archive-extraction or URL-fetching behaviour.
COPY magic-pdf.template.json /root/magic-pdf.json
COPY models /opt/models
COPY .paddleocr /root/.paddleocr
COPY app.py /root/app.py
WORKDIR /root
# Set the models directory in the configuration file (adjust the path as needed)
RUN sed -i 's|/tmp/models|/opt/models|g' /root/magic-pdf.json
# Create the models directory
# RUN mkdir -p /opt/models
# Set the entry point to activate the virtual environment and run the command line tool
# ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\" && python3 app.py", "--"]
# Expose the port that FastAPI will run on
EXPOSE 8000
# Run FastAPI with Uvicorn, pointing at app.py and binding to 0.0.0.0:8000.
# `exec` replaces the wrapper shell so Uvicorn itself receives SIGTERM on `docker stop`.
CMD ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec uvicorn app:app --host 0.0.0.0 --port 8000"]
\ No newline at end of file
基于MinerU的PDF解析API
- MinerU的GPU镜像构建
- 基于FastAPI的PDF解析接口
支持一键启动,已经打包到镜像中,自带模型权重,支持GPU推理加速,GPU速度相比CPU每页解析要快几十倍不等
## 启动命令:
```docker run -itd --name=mineru_server --gpus=all -p 8888:8000 quincyqiang/mineru:0.1-models```
![](https://i-blog.csdnimg.cn/direct/bcff4f524ea5400db14421ba7cec4989.png)
具体截图请见博客:https://blog.csdn.net/yanqianglifei/article/details/141979684
## 启动日志:
![](https://i-blog.csdnimg.cn/direct/4eb5657567e4415eba912179dca5c8aa.png)
## 输入参数:
访问地址:
http://localhost:8888/docs
http://127.0.0.1:8888/docs
![](https://i-blog.csdnimg.cn/direct/8b3a2bc5908042268e8cc69756e331a2.png)
## 解析效果:
![](https://i-blog.csdnimg.cn/direct/a54dcae834ae48d498fb595aca4212c3.png)
## 镜像地址:
> 阿里云地址:docker pull registry.cn-beijing.aliyuncs.com/quincyqiang/mineru:0.1-models
> dockerhub地址:docker pull quincyqiang/mineru:0.1-models
import copy
import json
import os
from tempfile import NamedTemporaryFile
import magic_pdf.model as model_config
import uvicorn
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
from loguru import logger
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
# Flag checked by pdf_parse_main: when no model JSON is supplied, the built-in model
# analysis (pipe.pipe_analyze) only runs if this is truthy.
model_config.__use_inside_model__ = True
# FastAPI application object; the /pdf_parse route in this file is registered on it.
app = FastAPI()
def json_md_dump(
        pipe,
        md_writer,
        pdf_name,
        content_list,
        md_content,
):
    """
    Persist the parsing artefacts of one PDF through ``md_writer``.

    Four files are written, in this order:
    ``{pdf_name}_model.json``, ``{pdf_name}_middle.json``,
    ``{pdf_name}_content_list.json`` and ``{pdf_name}.md``.

    :param pipe: Pipeline object exposing ``model_list`` and ``pdf_mid_data``
    :param md_writer: Writer exposing ``write(content=..., path=...)``
    :param pdf_name: Base name used as the prefix of every output file
    :param content_list: JSON-serialisable content list
    :param md_content: Markdown text, written as-is
    """
    # The three JSON artefacts share the same serialisation settings. Serialising does
    # not mutate its input, so no defensive copy of pipe.model_list is needed.
    json_outputs = (
        ("_model.json", pipe.model_list),              # model results
        ("_middle.json", pipe.pdf_mid_data),           # intermediate results
        ("_content_list.json", content_list),          # text content results
    )
    for suffix, payload in json_outputs:
        md_writer.write(
            content=json.dumps(payload, ensure_ascii=False, indent=4),
            path=f"{pdf_name}{suffix}"
        )

    # Write results to .md file
    md_writer.write(
        content=md_content,
        path=f"{pdf_name}.md"
    )
@app.post("/pdf_parse", tags=["projects"], summary="Parse PDF file")
async def pdf_parse_main(
        pdf_file: UploadFile = File(...),
        parse_method: str = 'auto',
        model_json_path: str = None,
        is_json_md_dump: bool = True,
        output_dir: str = "output"
):
    """
    Execute the process of converting PDF to JSON and MD, outputting MD and JSON files to the specified directory

    :param pdf_file: The PDF file to be parsed
    :param parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If results are not satisfactory, try ocr
    :param model_json_path: Path to existing model data file. If empty, use built-in model. PDF and model_json must correspond
    :param is_json_md_dump: Whether to write parsed data to .json and .md files. Default is True. Different stages of data will be written to different .json files (3 in total), md content will be saved to .md file
    :param output_dir: Output directory for results. A folder named after the PDF file will be created to store all results
    :return: JSONResponse with keys ``layout``, ``info``, ``content_list`` and ``md_content``
        on success (200); ``{"error": ...}`` with status 400 for an invalid parse method or
        missing model input, and status 500 for any unexpected exception
    """
    # NOTE(review): model_json_path and output_dir are caller-supplied filesystem paths and
    # are used as-is; restrict them if this endpoint is reachable by untrusted clients.

    # Initialised before the try block so the finally clause can reliably tell whether a
    # temporary file was created.
    temp_pdf_path = None
    try:
        # Read the upload once; the same bytes feed the temp file and the pipeline.
        pdf_bytes = await pdf_file.read()

        # Create a temporary file to store the uploaded PDF. The path is recorded before
        # writing so the file is still removed if the write itself fails.
        with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(pdf_bytes)

        # Strip only the final extension so names such as "report.v2.pdf" keep "report.v2";
        # fall back to the temp file's stem when the client sent no usable filename.
        pdf_name = os.path.splitext(os.path.basename(pdf_file.filename or ""))[0]
        if not pdf_name:
            pdf_name = os.path.splitext(os.path.basename(temp_pdf_path))[0]

        if output_dir:
            output_path = os.path.join(output_dir, pdf_name)
        else:
            output_path = os.path.join(os.path.dirname(temp_pdf_path), pdf_name)

        output_image_path = os.path.join(output_path, 'images')

        # Get parent path of images for relative path in .md and content_list.json
        image_path_parent = os.path.basename(output_image_path)

        if model_json_path:
            # Read original JSON data of PDF file parsed by model, list type
            with open(model_json_path, "r", encoding="utf-8") as model_file:
                model_json = json.load(model_file)
        else:
            model_json = []

        # Execute parsing steps
        image_writer, md_writer = DiskReaderWriter(output_image_path), DiskReaderWriter(output_path)

        # Choose parsing method
        if parse_method == "auto":
            jso_useful_key = {"_pdf_type": "", "model_list": model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        elif parse_method == "txt":
            pipe = TXTPipe(pdf_bytes, model_json, image_writer)
        elif parse_method == "ocr":
            pipe = OCRPipe(pdf_bytes, model_json, image_writer)
        else:
            logger.error("Unknown parse method, only auto, ocr, txt allowed")
            return JSONResponse(content={"error": "Invalid parse method"}, status_code=400)

        # Execute classification
        pipe.pipe_classify()

        # If no model data is provided, use built-in model for parsing
        if not model_json:
            if model_config.__use_inside_model__:
                pipe.pipe_analyze()  # Parse
            else:
                logger.error("Need model list input")
                return JSONResponse(content={"error": "Model list input required"}, status_code=400)

        # Execute parsing
        pipe.pipe_parse()

        # Save results in text and md format
        content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none")
        md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")

        if is_json_md_dump:
            json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)

        # The response is serialised immediately, so no defensive copy of model_list is needed.
        data = {
            "layout": pipe.model_list,
            "info": pipe.pdf_mid_data,
            "content_list": content_list,
            'md_content': md_content,
        }
        return JSONResponse(data, status_code=200)

    except Exception as e:
        logger.exception(e)
        return JSONResponse(content={"error": str(e)}, status_code=500)
    finally:
        # Clean up the temporary file
        if temp_pdf_path is not None:
            os.unlink(temp_pdf_path)
# if __name__ == '__main__':
# uvicorn.run(app, host="0.0.0.0", port=8888)
\ No newline at end of file
{
"bucket_info":{
"bucket-name-1":["ak", "sk", "endpoint"],
"bucket-name-2":["ak", "sk", "endpoint"]
},
"models-dir":"/opt/models",
"device-mode":"cuda",
"table-config": {
"model": "TableMaster",
"is_table_recog_enable": false,
"max_time": 400
}
}
{
"bucket_info":{
"bucket-name-1":["ak", "sk", "endpoint"],
"bucket-name-2":["ak", "sk", "endpoint"]
},
"models-dir":"/tmp/models",
"device-mode":"cuda",
"table-config": {
"model": "TableMaster",
"is_table_recog_enable": false,
"max_time": 400
}
}
\ No newline at end of file
deb http://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse
\ No newline at end of file
# Start the 0.1 image with /bin/bash as the command (this replaces the image's default command).
# NOTE: this command and the next one use the same container name and host port, so only
# one of the two containers can exist at a time.
docker run -itd --name=mineru_server --gpus=all -p 8888:8000 quincyqiang/mineru:0.1-models /bin/bash
# Start the 0.3 image with its default command; host port 8888 maps to container port 8000.
docker run -itd --name=mineru_server --gpus=all -p 8888:8000 quincyqiang/mineru:0.3-models
# Log in to the Aliyun (cn-beijing) container registry.
docker login --username=1185918903@qq.com registry.cn-beijing.aliyuncs.com
# Re-tag the local 0.3 image for the Aliyun registry and push it.
docker tag quincyqiang/mineru:0.3-models registry.cn-beijing.aliyuncs.com/quincyqiang/gomate:0.3-models
docker push registry.cn-beijing.aliyuncs.com/quincyqiang/gomate:0.3-models
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment