Unverified Commit 6d571e2e authored by Kaiwen Liu's avatar Kaiwen Liu Committed by GitHub
Browse files

Merge pull request #7 from opendatalab/dev

Dev
parents a3358878 37c335ae
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
REM Use sphinx-build from PATH unless SPHINXBUILD is already set.
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
REM Probe for sphinx-build; errorlevel 9009 means "command not found".
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)
REM No build target given: fall through to Sphinx's own help screen.
if "%1" == "" goto help
REM Delegate the requested target (html, latex, ...) to sphinx-build -M.
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd
......@@ -6,5 +6,4 @@
- [gradio_app](./gradio_app/README.md): Build a web app based on gradio
- [web_demo](./web_demo/README.md): MinerU online [demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/) localized deployment version
- [web_api](./web_api/README.md): Web API based on FastAPI
- [multi_gpu](./multi_gpu/README.md): Multi-GPU parallel processing based on LitServe
......@@ -6,4 +6,4 @@
- [gradio_app](./gradio_app/README_zh-CN.md): 基于 Gradio 的 Web 应用
- [web_demo](./web_demo/README_zh-CN.md): MinerU在线[demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/)本地化部署版本
- [web_api](./web_api/README.md): 基于 FastAPI 的 Web API
- [multi_gpu](./multi_gpu/README.md): 基于 LitServe 的多 GPU 并行处理
......@@ -3,10 +3,12 @@
import base64
import os
import time
import uuid
import zipfile
from pathlib import Path
import re
import pymupdf
from loguru import logger
from magic_pdf.libs.hash_utils import compute_sha256
......@@ -23,7 +25,7 @@ def read_fn(path):
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr):
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
os.makedirs(output_dir, exist_ok=True)
try:
......@@ -42,6 +44,10 @@ def parse_pdf(doc_path, output_dir, end_page_id, is_ocr):
parse_method,
False,
end_page_id=end_page_id,
layout_model=layout_mode,
formula_enable=formula_enable,
table_enable=table_enable,
lang=language,
)
return local_md_dir, file_name
except Exception as e:
......@@ -93,9 +99,10 @@ def replace_image_with_base64(markdown_text, image_dir_path):
return re.sub(pattern, replace, markdown_text)
def to_markdown(file_path, end_pages, is_ocr):
def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
# 获取识别的md文件以及压缩包文件路径
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr)
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
layout_mode, formula_enable, table_enable, language)
archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
if zip_archive_success == 0:
......@@ -138,24 +145,71 @@ with open("header.html", "r") as file:
header = file.read()
# Language codes offered in the UI dropdown, grouped by writing system.
latin_lang = [
    'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
    'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
    'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
    'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german',
]
arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [
    'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
    'dar', 'inh', 'che', 'lbe', 'lez', 'tab',
]
devanagari_lang = [
    'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
    'sa', 'bgc',
]
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
# The leading empty string is the default "unspecified" dropdown choice.
all_lang = ['', *other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang]
def to_pdf(file_path):
    """Return a path to a PDF version of the uploaded file.

    If *file_path* already points at a PDF it is returned unchanged.
    Otherwise the document is converted with PyMuPDF and the converted
    bytes are written next to the original under a fresh UUID-based name,
    whose path is returned.
    """
    with pymupdf.open(file_path) as doc:
        if doc.is_pdf:
            return file_path
        converted_bytes = doc.convert_to_pdf()
    # Place the converted copy in the same directory as the source file,
    # using a UUID so concurrent uploads never collide.
    target_path = os.path.join(os.path.dirname(file_path), f'{uuid.uuid4()}.pdf')
    with open(target_path, 'wb') as out_file:
        out_file.write(converted_bytes)
    return target_path
if __name__ == "__main__":
with gr.Blocks() as demo:
gr.HTML(header)
with gr.Row():
with gr.Column(variant='panel', scale=5):
pdf_show = gr.Markdown()
file = gr.File(label="Please upload a PDF or image", file_types=[".pdf", ".png", ".jpeg", "jpg"])
max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
with gr.Row() as bu_flow:
is_ocr = gr.Checkbox(label="Force enable OCR")
with gr.Row():
layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="Layout model", value="layoutlmv3")
language = gr.Dropdown(all_lang, label="Language", value="")
with gr.Row():
formula_enable = gr.Checkbox(label="Enable formula recognition", value=True)
is_ocr = gr.Checkbox(label="Force enable OCR", value=False)
table_enable = gr.Checkbox(label="Enable table recognition(test)", value=False)
with gr.Row():
change_bu = gr.Button("Convert")
clear_bu = gr.ClearButton([pdf_show], value="Clear")
pdf_show = PDF(label="Please upload pdf", interactive=True, height=800)
clear_bu = gr.ClearButton(value="Clear")
pdf_show = PDF(label="PDF preview", interactive=True, height=800)
with gr.Accordion("Examples:"):
example_root = os.path.join(os.path.dirname(__file__), "examples")
gr.Examples(
examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
_.endswith("pdf")],
inputs=pdf_show,
inputs=pdf_show
)
with gr.Column(variant='panel', scale=5):
......@@ -166,7 +220,9 @@ if __name__ == "__main__":
latex_delimiters=latex_delimiters, line_breaks=True)
with gr.Tab("Markdown text"):
md_text = gr.TextArea(lines=45, show_copy_button=True)
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr], outputs=[md, md_text, output_file, pdf_show])
clear_bu.add([md, pdf_show, md_text, output_file, is_ocr])
file.upload(fn=to_pdf, inputs=file, outputs=pdf_show)
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
outputs=[md, md_text, output_file, pdf_show])
clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr, table_enable, language])
demo.launch()
\ No newline at end of file
demo.launch(server_name="0.0.0.0")
\ No newline at end of file
No preview for this file type
## 项目简介
本项目提供基于 LitServe 的多 GPU 并行处理方案。LitServe 是一个简便且灵活的 AI 模型服务引擎,基于 FastAPI 构建。它为 FastAPI 增强了批处理、流式传输和 GPU 自动扩展等功能,无需为每个模型单独重建 FastAPI 服务器。
## 环境配置
请使用以下命令配置所需的环境:
```bash
pip install -U litserve python-multipart filetype
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118
```
## 快速使用
### 1. 启动服务端
以下示例展示了如何启动服务端,支持自定义设置:
```python
server = ls.LitServer(
MinerUAPI(output_dir='/tmp'), # 可自定义输出文件夹
accelerator='cuda', # 启用 GPU 加速
devices='auto', # "auto" 使用所有 GPU
workers_per_device=1, # 每个 GPU 启动一个服务实例
timeout=False # 设置为 False 以禁用超时
)
server.run(port=8000) # 设定服务端口为 8000
```
启动服务端命令:
```bash
python server.py
```
### 2. 启动客户端
以下代码展示了客户端的使用方式,可根据需求修改配置:
```python
files = ['demo/small_ocr.pdf'] # 替换为文件路径,支持 jpg/jpeg、png、pdf 文件
n_jobs = np.clip(len(files), 1, 8) # 设置并发线程数,此处最大为 8,可根据自身修改
results = Parallel(n_jobs, prefer='threads', verbose=10)(
delayed(do_parse)(p) for p in files
)
print(results)
```
启动客户端命令:
```bash
python client.py
```
好了,你的文件会自动在多个 GPU 上并行处理!🍻🍻🍻
import base64
import requests
import numpy as np
from loguru import logger
from joblib import Parallel, delayed
def to_b64(file_path):
    """Read *file_path* and return its contents as a base64-encoded str.

    Any failure (missing file, permission error, ...) is re-raised as a
    generic Exception tagged with the offending path.
    """
    try:
        with open(file_path, 'rb') as fh:
            return base64.b64encode(fh.read()).decode('utf-8')
    except Exception as e:
        raise Exception(f'File: {file_path} - Info: {e}')
def do_parse(file_path, url='http://127.0.0.1:8000/predict', **kwargs):
    """POST *file_path* (base64-encoded) to the MinerU server.

    Extra keyword arguments are forwarded to the server as parse options.
    Returns the server's JSON reply augmented with 'file_path', or None if
    anything fails (the error is logged — best-effort semantics).
    """
    try:
        payload = {'file': to_b64(file_path), 'kwargs': kwargs}
        resp = requests.post(url, json=payload)
        if resp.status_code != 200:
            # Non-2xx: surface the server's message through the log below.
            raise Exception(resp.text)
        result = resp.json()
        result['file_path'] = file_path
        return result
    except Exception as e:
        logger.error(f'File: {file_path} - Info: {e}')
if __name__ == '__main__':
    # Files to submit; jpg/jpeg, png and pdf are accepted by the server.
    files = ['small_ocr.pdf']
    # One thread per file, capped at 8 concurrent requests.
    worker_count = np.clip(len(files), 1, 8)
    runner = Parallel(worker_count, prefer='threads', verbose=10)
    outputs = runner(delayed(do_parse)(path) for path in files)
    print(outputs)
import os
import fitz
import torch
import base64
import litserve as ls
from uuid import uuid4
from fastapi import HTTPException
from filetype import guess_extension
from magic_pdf.tools.common import do_parse
from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
class MinerUAPI(ls.LitAPI):
    """LitServe endpoint that parses uploaded documents with magic-pdf.

    Each worker pins itself to a single GPU in ``setup``, pre-loads both the
    OCR and the non-OCR model bundles, then serves one base64-encoded file
    per request, writing results under ``output_dir``.
    """

    def __init__(self, output_dir='/tmp'):
        # Initialize the LitAPI base class before adding our own state;
        # the original skipped this, leaving framework internals unset.
        super().__init__()
        self.output_dir = output_dir

    def setup(self, device):
        """Pin this worker to *device* and warm up the magic-pdf models."""
        if device.startswith('cuda'):
            os.environ['CUDA_VISIBLE_DEVICES'] = device.split(':')[-1]
            if torch.cuda.device_count() > 1:
                # CUDA was initialized before the visibility mask was set,
                # so the mask had no effect — fail fast rather than run on
                # the wrong GPU.
                raise RuntimeError("Remove any CUDA actions before setting 'CUDA_VISIBLE_DEVICES'.")

        model_manager = ModelSingleton()
        model_manager.get_model(True, False)   # OCR pipeline
        model_manager.get_model(False, False)  # non-OCR (txt) pipeline
        print(f'Model initialization complete on {device}!')

    def decode_request(self, request):
        """Turn the JSON payload into (pdf_bytes, parse_options)."""
        file = self.to_pdf(request['file'])
        opts = request.get('kwargs', {})
        opts.setdefault('debug_able', False)
        opts.setdefault('parse_method', 'auto')
        return file, opts

    def predict(self, inputs):
        """Run magic-pdf's do_parse and return the per-request job name.

        Raises HTTPException(500) on any parse failure; GPU memory is
        released after every request regardless of outcome.
        """
        try:
            # A fresh UUID names the output sub-directory for this request.
            do_parse(self.output_dir, pdf_name := str(uuid4()), inputs[0], [], **inputs[1])
            return pdf_name
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
        finally:
            self.clean_memory()

    def encode_response(self, response):
        # NOTE(review): the value is the per-request job name, not a full
        # path; clients are expected to join it with the server's output_dir.
        return {'output_dir': response}

    def clean_memory(self):
        """Release cached CUDA memory and trigger a garbage-collection pass."""
        import gc
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
        gc.collect()

    def to_pdf(self, file_base64):
        """Decode a base64 upload and return its bytes as a PDF.

        PDF inputs pass through unchanged; other formats (detected via
        filetype sniffing) are converted with PyMuPDF. Any failure becomes
        an HTTPException(500).
        """
        try:
            file_bytes = base64.b64decode(file_base64)
            file_ext = guess_extension(file_bytes)
            with fitz.open(stream=file_bytes, filetype=file_ext) as f:
                if f.is_pdf:
                    return f.tobytes()
                return f.convert_to_pdf()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
if __name__ == '__main__':
    # Launch the LitServe server: one MinerUAPI worker per visible GPU.
    server = ls.LitServer(
        MinerUAPI(output_dir='/tmp'),  # directory where parse results are written
        accelerator='cuda',            # run inference on GPU
        devices='auto',                # use every available GPU
        workers_per_device=1,          # one server instance per GPU
        timeout=False                  # disable request timeout; parsing can be slow
    )
    server.run(port=8000)  # serve on port 8000
......@@ -5,7 +5,6 @@ PyMuPDF>=1.24.9
loguru>=0.6.0
numpy>=1.21.6,<2.0.0
fast-langdetect==0.2.0
wordninja>=2.0.0
scikit-learn>=1.0.2
pdfminer.six==20231228
unimernet==0.2.1
......@@ -15,4 +14,5 @@ paddleocr==2.7.3
paddlepaddle==3.0.0b1
pypandoc
struct-eqtable==0.1.0
doclayout-yolo==0.0.2
detectron2
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment