features@add mineru gpu&web_api (#568)

* features@add mineru gpu&web_api * features@update api

features@add mineru gpu&web_api (#568)
* features@add mineru gpu&web_api * features@update api
a1c0d535 · quincyqiang · GitHub · f07c2673 · a1c0d535 · a1c0d535
Unverified Commit a1c0d535 authored Sep 09, 2024 by quincyqiang Committed by GitHub Sep 09, 2024
9 changed files
--- a/web_api/Dockerfile
+++ b/web_api/Dockerfile
+# Image for the MinerU PDF-parsing web API: installs Python 3.10, creates a
+# virtualenv with magic-pdf and paddlepaddle-gpu, then serves app.py with Uvicorn.
+#
+# Use the official Ubuntu base image
+# NOTE(review): `latest` is a moving tag, so a rebuild may pick up a different
+# Ubuntu release — consider pinning a specific version for reproducible builds.
+FROM ubuntu:latest
+
+# Optional build-time HTTP(S) proxy, currently disabled
+# ENV http_proxy http://127.0.0.1:7890
+# ENV https_proxy http://127.0.0.1:7890
+
+# Set environment variables to non-interactive to avoid prompts during installation
+ENV DEBIAN_FRONTEND=noninteractive
+# Legacy space-separated ENV form; sets LANG to C.UTF-8
+ENV LANG C.UTF-8
+
+# Optional replacement of the apt sources with the bundled sources.list, currently disabled
+# ADD sources.list /etc/apt
+# RUN apt-get clean
+
+
+
+# Update the package list and install necessary packages
+# NOTE(review): this layer and the next one do not remove /var/lib/apt/lists,
+# so the package index stays in those layers; only the third apt step cleans it.
+RUN apt-get -q update \
+    && apt-get -q install -y --no-install-recommends \
+        apt-utils \
+        bats \
+        build-essential
+# Interactive debugging / networking utilities.
+# wget and git are listed again in the Python install step below.
+RUN apt-get update && apt-get install -y vim net-tools procps lsof curl wget iputils-ping telnet lrzsz git
+
+# Add the deadsnakes PPA, install Python 3.10 with venv/distutils support and pip,
+# plus libgl1 and libglib2.0-0 (presumably needed by image-processing
+# dependencies — confirm), then drop the apt package lists.
+RUN apt-get update && \
+    apt-get install -y \
+        software-properties-common && \
+    add-apt-repository ppa:deadsnakes/ppa && \
+    apt-get update && \
+    apt-get install -y \
+        python3.10 \
+        python3.10-venv \
+        python3.10-distutils \
+        python3-pip \
+        wget \
+        git \
+        libgl1 \
+        libglib2.0-0 \
+        && rm -rf /var/lib/apt/lists/*
+        
+# RUN unset http_proxy && unset https_proxy
+
+# Set Python 3.10 as the default python3
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
+
+# Create a virtual environment for MinerU
+RUN python3 -m venv /opt/mineru_venv
+# Point pip at the Tsinghua PyPI mirror.
+# NOTE(review): this runs the system pip, outside the virtualenv; presumably the
+# setting lands in a user-level config that the venv's pip also reads — confirm.
+RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+# Activate the virtual environment and install necessary Python packages
+RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
+    pip install --upgrade pip && \
+    pip install magic-pdf[full] --extra-index-url https://myhloli.github.io/wheels/ --no-cache-dir"
+
+
+# Web-serving dependencies used by app.py
+RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
+    pip install fastapi uvicorn python-multipart --no-cache-dir"
+
+# Remove the paddlepaddle package from the venv (presumably the CPU build pulled
+# in by magic-pdf[full] — confirm) before installing the GPU build below.
+RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
+    pip uninstall  paddlepaddle -y"
+
+# Install paddlepaddle-gpu 3.0.0b1 from the PaddlePaddle cu118 package index
+RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
+    python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ --no-cache-dir"
+
+# Copy the configuration file template and set up the model directory
+# The build context must contain models/, .paddleocr/ and app.py next to this Dockerfile.
+COPY magic-pdf.template.json /root/magic-pdf.json
+ADD models /opt/models
+ADD .paddleocr /root/.paddleocr 
+ADD app.py /root/app.py
+
+WORKDIR /root
+
+# Set the models directory in the configuration file (adjust the path as needed)
+# Rewrites every /tmp/models occurrence in the copied config to /opt/models,
+# the directory the models were added to above.
+RUN sed -i 's|/tmp/models|/opt/models|g' /root/magic-pdf.json
+
+# Create the models directory
+# RUN mkdir -p /opt/models
+
+# Set the entry point to activate the virtual environment and run the command line tool
+# (disabled; the CMD below is what starts the service)
+# ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\" && python3 app.py", "--"]
+
+
+# Expose the port that FastAPI will run on
+EXPOSE 8000
+
+# Command to run FastAPI using Uvicorn, pointing to app.py and binding to 0.0.0.0:8000
+CMD ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && uvicorn app:app --host 0.0.0.0 --port 8000"]
\ No newline at end of file
--- a/web_api/README.md
+++ b/web_api/README.md
+基于MinerU的PDF解析API
+
+    - MinerU的GPU镜像构建
+    - 基于FastAPI的PDF解析接口
+
+支持一键启动，已经打包到镜像中，自带模型权重，支持GPU推理加速，GPU速度相比CPU每页解析要快几十倍不等
+
+
+##  启动命令：
+
+
+```docker run -itd --name=mineru_server --gpus=all -p 8888:8000 quincyqiang/mineru:0.1-models```
+
+![](https://i-blog.csdnimg.cn/direct/bcff4f524ea5400db14421ba7cec4989.png)
+
+具体截图请见博客：https://blog.csdn.net/yanqianglifei/article/details/141979684
+
+
+##   启动日志：
+
+![](https://i-blog.csdnimg.cn/direct/4eb5657567e4415eba912179dca5c8aa.png)
+
+##  输入参数：
+
+访问地址：
+
+    http://localhost:8888/docs
+
+    http://127.0.0.1:8888/docs
+
+![](https://i-blog.csdnimg.cn/direct/8b3a2bc5908042268e8cc69756e331a2.png)
+
+##  解析效果：
+
+![](https://i-blog.csdnimg.cn/direct/a54dcae834ae48d498fb595aca4212c3.png)
+
+
+
+##   镜像地址：
+
+> 阿里云地址：docker pull registry.cn-beijing.aliyuncs.com/quincyqiang/mineru:0.1-models
+
+> dockerhub地址：docker pull quincyqiang/mineru:0.1-models
+
--- a/web_api/app.py
+++ b/web_api/app.py
+import copy
+import json
+import os
+from tempfile import NamedTemporaryFile
+
+import magic_pdf.model as model_config
+import uvicorn
+from fastapi import FastAPI, File, UploadFile, Form
+from fastapi.responses import JSONResponse
+from loguru import logger
+from magic_pdf.pipe.OCRPipe import OCRPipe
+from magic_pdf.pipe.TXTPipe import TXTPipe
+from magic_pdf.pipe.UNIPipe import UNIPipe
+from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
+
+model_config.__use_inside_model__ = True
+
+app = FastAPI()
+
+def json_md_dump(
+        pipe,
+        md_writer,
+        pdf_name,
+        content_list,
+        md_content,
+):
+    """
+    Persist the parsing artefacts of one PDF through ``md_writer``.
+
+    Four files are written, in this order, each named after ``pdf_name``:
+    ``<pdf_name>_model.json`` (a deep copy of ``pipe.model_list``),
+    ``<pdf_name>_middle.json`` (``pipe.pdf_mid_data``),
+    ``<pdf_name>_content_list.json`` (``content_list``) and
+    ``<pdf_name>.md`` (``md_content`` written verbatim).
+
+    :param pipe: object exposing ``model_list`` and ``pdf_mid_data``
+    :param md_writer: object exposing ``write(content=..., path=...)``
+    :param pdf_name: base name used as the prefix of every output file
+    :param content_list: JSON-serialisable content list
+    :param md_content: Markdown text
+    """
+    def _to_json(payload):
+        # Shared JSON formatting: keep non-ASCII characters, 4-space indent
+        return json.dumps(payload, ensure_ascii=False, indent=4)
+
+    # Each entry is (file-name suffix, lazy content producer); producers run
+    # one at a time inside the loop so every payload is rendered right before
+    # its own write.
+    artefacts = (
+        ("_model.json", lambda: _to_json(copy.deepcopy(pipe.model_list))),
+        ("_middle.json", lambda: _to_json(pipe.pdf_mid_data)),
+        ("_content_list.json", lambda: _to_json(content_list)),
+        (".md", lambda: md_content),
+    )
+    for suffix, render in artefacts:
+        md_writer.write(
+            content=render(),
+            path=f"{pdf_name}{suffix}"
+        )
+
+@app.post("/pdf_parse", tags=["projects"], summary="Parse PDF file")
+async def pdf_parse_main(
+        pdf_file: UploadFile = File(...),
+        parse_method: str = 'auto',
+        model_json_path: str = None,
+        is_json_md_dump: bool = True,
+        output_dir: str = "output"
+):
+    """
+    Execute the process of converting PDF to JSON and MD, outputting MD and JSON files to the specified directory
+    :param pdf_file: The PDF file to be parsed
+    :param parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If results are not satisfactory, try ocr
+    :param model_json_path: Path to existing model data file. If empty, use built-in model. PDF and model_json must correspond
+    :param is_json_md_dump: Whether to write parsed data to .json and .md files. Default is True. Different stages of data will be written to different .json files (3 in total), md content will be saved to .md file
+    :param output_dir: Output directory for results. A folder named after the PDF file will be created to store all results
+    :return: JSONResponse. 200 with keys ``layout``, ``info``, ``content_list`` and ``md_content``;
+        400 with an ``error`` key for an unknown parse method or missing model data;
+        500 with an ``error`` key when any exception is raised while parsing
+    """
+    # NOTE(review): output_dir and model_json_path are server-side filesystem paths taken
+    # directly from the request; validate or restrict them before exposing this endpoint
+    # to untrusted clients.
+    # NOTE(review): the pipe.* calls below are plain synchronous calls inside an async
+    # handler, so they run on the event loop for the whole duration of the parse.
+    temp_pdf_path = None
+    try:
+        # Read the upload once and reuse the bytes instead of re-reading the temp file
+        pdf_bytes = await pdf_file.read()
+
+        # Create a temporary file to store the uploaded PDF; its directory is the
+        # fallback output location when output_dir is empty
+        with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
+            # Record the path first so the file is removed even if the write fails
+            temp_pdf_path = temp_pdf.name
+            temp_pdf.write(pdf_bytes)
+
+        # Strip any client-side directory part (including Windows separators) and only
+        # the final extension, so "report.v2.pdf" yields "report.v2" rather than "report"
+        upload_name = (pdf_file.filename or "").replace("\\", "/")
+        pdf_name = os.path.splitext(os.path.basename(upload_name))[0] or "document"
+
+        if output_dir:
+            output_path = os.path.join(output_dir, pdf_name)
+        else:
+            output_path = os.path.join(os.path.dirname(temp_pdf_path), pdf_name)
+
+        output_image_path = os.path.join(output_path, 'images')
+
+        # Get parent path of images for relative path in .md and content_list.json
+        image_path_parent = os.path.basename(output_image_path)
+
+        if model_json_path:
+            # Read original JSON data of PDF file parsed by model, list type
+            with open(model_json_path, "r", encoding="utf-8") as model_json_file:
+                model_json = json.load(model_json_file)
+        else:
+            model_json = []
+
+        # Execute parsing steps
+        image_writer, md_writer = DiskReaderWriter(output_image_path), DiskReaderWriter(output_path)
+
+        # Choose parsing method
+        if parse_method == "auto":
+            jso_useful_key = {"_pdf_type": "", "model_list": model_json}
+            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
+        elif parse_method == "txt":
+            pipe = TXTPipe(pdf_bytes, model_json, image_writer)
+        elif parse_method == "ocr":
+            pipe = OCRPipe(pdf_bytes, model_json, image_writer)
+        else:
+            logger.error("Unknown parse method, only auto, ocr, txt allowed")
+            return JSONResponse(content={"error": "Invalid parse method"}, status_code=400)
+
+        # Execute classification
+        pipe.pipe_classify()
+
+        # If no model data is provided, use built-in model for parsing
+        if not model_json:
+            if model_config.__use_inside_model__:
+                pipe.pipe_analyze()  # Parse
+            else:
+                logger.error("Need model list input")
+                return JSONResponse(content={"error": "Model list input required"}, status_code=400)
+
+        # Execute parsing
+        pipe.pipe_parse()
+
+        # Save results in text and md format
+        content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none")
+        md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")
+
+        if is_json_md_dump:
+            json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)
+
+        # The response body is built from these objects as-is; no copy is taken
+        data = {
+            "layout": pipe.model_list,
+            "info": pipe.pdf_mid_data,
+            "content_list": content_list,
+            "md_content": md_content,
+        }
+        return JSONResponse(data, status_code=200)
+
+    except Exception as e:
+        logger.exception(e)
+        return JSONResponse(content={"error": str(e)}, status_code=500)
+    finally:
+        # Clean up the temporary file
+        if temp_pdf_path is not None:
+            os.unlink(temp_pdf_path)
+
+# if __name__ == '__main__':
+#     uvicorn.run(app, host="0.0.0.0", port=8888)
\ No newline at end of file
--- a/web_api/magic-pdf.json
+++ b/web_api/magic-pdf.json
+{
+    "bucket_info":{
+        "bucket-name-1":["ak", "sk", "endpoint"],
+        "bucket-name-2":["ak", "sk", "endpoint"]
+    },
+    "models-dir":"/opt/models",
+    "device-mode":"cuda",
+    "table-config": {
+        "model": "TableMaster",
+        "is_table_recog_enable": false,
+        "max_time": 400
+    }
+}
--- a/web_api/magic-pdf.template.json
+++ b/web_api/magic-pdf.template.json
+{
+    "bucket_info":{
+        "bucket-name-1":["ak", "sk", "endpoint"],
+        "bucket-name-2":["ak", "sk", "endpoint"]
+    },
+    "models-dir":"/tmp/models",
+    "device-mode":"cuda",
+    "table-config": {
+        "model": "TableMaster",
+        "is_table_recog_enable": false,
+        "max_time": 400
+    }
+}
\ No newline at end of file
--- a/web_api/requirements.txt
+++ b/web_api/requirements.txt
--- a/web_api/small_ocr.pdf
+++ b/web_api/small_ocr.pdf
--- a/web_api/sources.list
+++ b/web_api/sources.list
+deb http://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse
+deb-src http://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse
+deb http://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse
+deb-src http://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse
+deb http://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse
+deb-src http://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse
+deb http://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse
+deb-src http://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse
+deb http://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse
+deb-src http://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse
\ No newline at end of file
--- a/web_api/start_mineru.sh
+++ b/web_api/start_mineru.sh
+# Commands for starting the MinerU API container and publishing the image.
+# NOTE(review): both `docker run` lines use --name=mineru_server; Docker rejects a
+# second container with a name that is already in use, so treat them as alternatives
+# rather than running the file top to bottom.
+
+# Start the 0.1-models image detached with /bin/bash as the command (overriding the
+# image's default command); host port 8888 maps to container port 8000.
+docker run -itd --name=mineru_server --gpus=all -p 8888:8000 quincyqiang/mineru:0.1-models /bin/bash
+
+# Start the 0.3-models image detached with its default command, all GPUs attached,
+# host port 8888 mapped to container port 8000.
+docker run -itd --name=mineru_server --gpus=all -p 8888:8000 quincyqiang/mineru:0.3-models
+
+# Log in to the Aliyun Beijing registry, then retag and push the 0.3-models image.
+# NOTE(review): the image is pushed under quincyqiang/gomate, while the README's pull
+# command uses quincyqiang/mineru on the same registry — confirm which is intended.
+docker login --username=1185918903@qq.com registry.cn-beijing.aliyuncs.com
+docker tag quincyqiang/mineru:0.3-models registry.cn-beijing.aliyuncs.com/quincyqiang/gomate:0.3-models
+docker push registry.cn-beijing.aliyuncs.com/quincyqiang/gomate:0.3-models
\ No newline at end of file