Unverified Commit 6d571e2e authored by Kaiwen Liu's avatar Kaiwen Liu Committed by GitHub
Browse files

Merge pull request #7 from opendatalab/dev

Dev
parents a3358878 37c335ae
@ECHO OFF
REM Run from the directory that contains this script; popd at :end restores the caller's directory.
pushd %~dp0
REM Command file for Sphinx documentation
REM Fall back to "sphinx-build" when the caller has not set SPHINXBUILD.
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
REM Probe the build command with all output discarded; only its exit code is inspected below.
%SPHINXBUILD% >NUL 2>NUL
REM errorlevel 9009 is treated as "command not found": print install hints and exit with status 1.
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)
REM With no argument, show the Sphinx help; otherwise pass the first argument as the -M target.
if "%1" == "" goto help
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd
...@@ -6,5 +6,4 @@ ...@@ -6,5 +6,4 @@
- [gradio_app](./gradio_app/README.md): Build a web app based on gradio - [gradio_app](./gradio_app/README.md): Build a web app based on gradio
- [web_demo](./web_demo/README.md): MinerU online [demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/) localized deployment version - [web_demo](./web_demo/README.md): MinerU online [demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/) localized deployment version
- [web_api](./web_api/README.md): Web API Based on FastAPI - [web_api](./web_api/README.md): Web API Based on FastAPI
- [multi_gpu](./multi_gpu/README.md): Multi-GPU parallel processing based on LitServe
...@@ -6,4 +6,4 @@ ...@@ -6,4 +6,4 @@
- [gradio_app](./gradio_app/README_zh-CN.md): 基于 Gradio 的 Web 应用 - [gradio_app](./gradio_app/README_zh-CN.md): 基于 Gradio 的 Web 应用
- [web_demo](./web_demo/README_zh-CN.md): MinerU在线[demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/)本地化部署版本 - [web_demo](./web_demo/README_zh-CN.md): MinerU在线[demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/)本地化部署版本
- [web_api](./web_api/README.md): 基于 FastAPI 的 Web API - [web_api](./web_api/README.md): 基于 FastAPI 的 Web API
- [multi_gpu](./multi_gpu/README.md): 基于 LitServe 的多 GPU 并行处理
...@@ -3,10 +3,12 @@ ...@@ -3,10 +3,12 @@
import base64 import base64
import os import os
import time import time
import uuid
import zipfile import zipfile
from pathlib import Path from pathlib import Path
import re import re
import pymupdf
from loguru import logger from loguru import logger
from magic_pdf.libs.hash_utils import compute_sha256 from magic_pdf.libs.hash_utils import compute_sha256
...@@ -23,7 +25,7 @@ def read_fn(path): ...@@ -23,7 +25,7 @@ def read_fn(path):
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN) return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr): def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
try: try:
...@@ -42,6 +44,10 @@ def parse_pdf(doc_path, output_dir, end_page_id, is_ocr): ...@@ -42,6 +44,10 @@ def parse_pdf(doc_path, output_dir, end_page_id, is_ocr):
parse_method, parse_method,
False, False,
end_page_id=end_page_id, end_page_id=end_page_id,
layout_model=layout_mode,
formula_enable=formula_enable,
table_enable=table_enable,
lang=language,
) )
return local_md_dir, file_name return local_md_dir, file_name
except Exception as e: except Exception as e:
...@@ -93,9 +99,10 @@ def replace_image_with_base64(markdown_text, image_dir_path): ...@@ -93,9 +99,10 @@ def replace_image_with_base64(markdown_text, image_dir_path):
return re.sub(pattern, replace, markdown_text) return re.sub(pattern, replace, markdown_text)
def to_markdown(file_path, end_pages, is_ocr): def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
# 获取识别的md文件以及压缩包文件路径 # 获取识别的md文件以及压缩包文件路径
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr) local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
layout_mode, formula_enable, table_enable, language)
archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip") archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path) zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
if zip_archive_success == 0: if zip_archive_success == 0:
...@@ -138,24 +145,71 @@ with open("header.html", "r") as file: ...@@ -138,24 +145,71 @@ with open("header.html", "r") as file:
header = file.read() header = file.read()
# Language codes offered by the "Language" dropdown, kept in separate
# lists that are concatenated into ``all_lang`` below.
latin_lang = [
    'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es',
    'et', 'fr', 'ga', 'hr', 'hu', 'id', 'is', 'it',
    'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
    'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk',
    'sl', 'sq', 'sv', 'sw', 'tl', 'tr', 'uz', 'vi',
    'french', 'german',
]
arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [
    'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady',
    'kbd', 'ava', 'dar', 'inh', 'che', 'lbe', 'lez', 'tab',
]
devanagari_lang = [
    'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah',
    'sck', 'new', 'gom', 'sa', 'bgc',
]
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']

# The leading empty string is the "nothing selected" entry; the remaining
# codes follow in the order: other, latin, arabic, cyrillic, devanagari.
all_lang = (
    [""]
    + other_lang
    + latin_lang
    + arabic_lang
    + cyrillic_lang
    + devanagari_lang
)
def to_pdf(file_path):
    """Return a path to a PDF version of ``file_path``.

    If the document opened by pymupdf is already a PDF, the input path is
    returned unchanged. Otherwise the document is converted to PDF bytes,
    written to a new ``<uuid4>.pdf`` file in the same directory as the
    input, and the path of that new file is returned.
    """
    with pymupdf.open(file_path) as doc:
        # Already a PDF: nothing to convert.
        if doc.is_pdf:
            return file_path

        converted = doc.convert_to_pdf()

        # Use a random UUID as the file name and place the file next to
        # the original upload.
        target_path = os.path.join(
            os.path.dirname(file_path),
            f"{uuid.uuid4()}.pdf",
        )

        # Persist the converted bytes to disk.
        with open(target_path, 'wb') as out_file:
            out_file.write(converted)

        return target_path
if __name__ == "__main__": if __name__ == "__main__":
with gr.Blocks() as demo: with gr.Blocks() as demo:
gr.HTML(header) gr.HTML(header)
with gr.Row(): with gr.Row():
with gr.Column(variant='panel', scale=5): with gr.Column(variant='panel', scale=5):
pdf_show = gr.Markdown() file = gr.File(label="Please upload a PDF or image", file_types=[".pdf", ".png", ".jpeg", "jpg"])
max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages") max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
with gr.Row() as bu_flow: with gr.Row():
is_ocr = gr.Checkbox(label="Force enable OCR") layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="Layout model", value="layoutlmv3")
language = gr.Dropdown(all_lang, label="Language", value="")
with gr.Row():
formula_enable = gr.Checkbox(label="Enable formula recognition", value=True)
is_ocr = gr.Checkbox(label="Force enable OCR", value=False)
table_enable = gr.Checkbox(label="Enable table recognition(test)", value=False)
with gr.Row():
change_bu = gr.Button("Convert") change_bu = gr.Button("Convert")
clear_bu = gr.ClearButton([pdf_show], value="Clear") clear_bu = gr.ClearButton(value="Clear")
pdf_show = PDF(label="Please upload pdf", interactive=True, height=800) pdf_show = PDF(label="PDF preview", interactive=True, height=800)
with gr.Accordion("Examples:"): with gr.Accordion("Examples:"):
example_root = os.path.join(os.path.dirname(__file__), "examples") example_root = os.path.join(os.path.dirname(__file__), "examples")
gr.Examples( gr.Examples(
examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
_.endswith("pdf")], _.endswith("pdf")],
inputs=pdf_show, inputs=pdf_show
) )
with gr.Column(variant='panel', scale=5): with gr.Column(variant='panel', scale=5):
...@@ -166,7 +220,9 @@ if __name__ == "__main__": ...@@ -166,7 +220,9 @@ if __name__ == "__main__":
latex_delimiters=latex_delimiters, line_breaks=True) latex_delimiters=latex_delimiters, line_breaks=True)
with gr.Tab("Markdown text"): with gr.Tab("Markdown text"):
md_text = gr.TextArea(lines=45, show_copy_button=True) md_text = gr.TextArea(lines=45, show_copy_button=True)
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr], outputs=[md, md_text, output_file, pdf_show]) file.upload(fn=to_pdf, inputs=file, outputs=pdf_show)
clear_bu.add([md, pdf_show, md_text, output_file, is_ocr]) change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
outputs=[md, md_text, output_file, pdf_show])
clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr, table_enable, language])
demo.launch() demo.launch(server_name="0.0.0.0")
\ No newline at end of file \ No newline at end of file
No preview for this file type
## 项目简介
本项目提供基于 LitServe 的多 GPU 并行处理方案。LitServe 是一个简便且灵活的 AI 模型服务引擎,基于 FastAPI 构建。它为 FastAPI 增强了批处理、流式传输和 GPU 自动扩展等功能,无需为每个模型单独重建 FastAPI 服务器。
## 环境配置
请使用以下命令配置所需的环境:
```bash
pip install -U litserve python-multipart filetype
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118
```
## 快速使用
### 1. 启动服务端
以下示例展示了如何启动服务端,支持自定义设置:
```python
server = ls.LitServer(
MinerUAPI(output_dir='/tmp'), # 可自定义输出文件夹
accelerator='cuda', # 启用 GPU 加速
devices='auto', # "auto" 使用所有 GPU
workers_per_device=1, # 每个 GPU 启动一个服务实例
timeout=False # 设置为 False 以禁用超时
)
server.run(port=8000) # 设定服务端口为 8000
```
启动服务端命令:
```bash
python server.py
```
### 2. 启动客户端
以下代码展示了客户端的使用方式,可根据需求修改配置:
```python
files = ['demo/small_ocr.pdf'] # 替换为文件路径,支持 jpg/jpeg、png、pdf 文件
n_jobs = np.clip(len(files), 1, 8) # 设置并发线程数,此处最大为 8,可根据自身需求修改
results = Parallel(n_jobs, prefer='threads', verbose=10)(
delayed(do_parse)(p) for p in files
)
print(results)
```
启动客户端命令:
```bash
python client.py
```
好了,你的文件会自动在多个 GPU 上并行处理!🍻🍻🍻
import base64
import requests
import numpy as np
from loguru import logger
from joblib import Parallel, delayed
def to_b64(file_path):
    """Return the contents of ``file_path`` as a base64-encoded string.

    The file is read in binary mode and the base64 output is decoded as
    UTF-8 so it can be embedded in a JSON payload.

    Raises:
        Exception: if the file cannot be read or encoded. The message names
            the file, and the original error is attached as ``__cause__``.
    """
    try:
        with open(file_path, 'rb') as f:
            return base64.b64encode(f.read()).decode('utf-8')
    except Exception as e:
        # Chain explicitly so the underlying I/O error is reported as the
        # direct cause rather than as incidental context.
        raise Exception(f'File: {file_path} - Info: {e}') from e
def do_parse(file_path, url='http://127.0.0.1:8000/predict', **kwargs):
    """POST one file to the prediction endpoint and return the JSON reply.

    The file is sent base64-encoded under ``'file'`` and any extra keyword
    arguments are forwarded under ``'kwargs'``. On an HTTP 200 reply the
    decoded JSON is returned with an added ``'file_path'`` entry.

    Any failure (unreadable file, request error, non-200 status) is logged
    via ``logger.error`` and the function then returns ``None`` implicitly;
    nothing is raised to the caller.
    """
    try:
        payload = {
            'file': to_b64(file_path),
            'kwargs': kwargs
        }
        response = requests.post(url, json=payload)

        # Treat every non-200 status as a failure; the response body becomes
        # the error text that is logged below.
        if response.status_code != 200:
            raise Exception(response.text)

        result = response.json()
        result['file_path'] = file_path
        return result
    except Exception as e:
        logger.error(f'File: {file_path} - Info: {e}')
if __name__ == '__main__':
    # Files to submit to the server.
    files = ['small_ocr.pdf']

    # One worker thread per file, clamped to the range [1, 8].
    n_jobs = np.clip(len(files), 1, 8)

    # Lazily describe one do_parse call per file, then run them on threads.
    tasks = (delayed(do_parse)(p) for p in files)
    runner = Parallel(n_jobs, prefer='threads', verbose=10)
    results = runner(tasks)

    print(results)
import os
import fitz
import torch
import base64
import litserve as ls
from uuid import uuid4
from fastapi import HTTPException
from filetype import guess_extension
from magic_pdf.tools.common import do_parse
from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
class MinerUAPI(ls.LitAPI):
    """LitServe API that parses base64-encoded documents with magic-pdf."""

    def __init__(self, output_dir='/tmp'):
        # Directory handed to do_parse as its first argument.
        self.output_dir = output_dir

    def setup(self, device):
        """Restrict the worker to its device and pre-load the models.

        For a ``cuda...`` device, ``CUDA_VISIBLE_DEVICES`` is set to the text
        after the last ``':'``. If torch then still reports more than one
        device, a ``RuntimeError`` is raised.
        """
        if device.startswith('cuda'):
            gpu_index = device.split(':')[-1]
            os.environ['CUDA_VISIBLE_DEVICES'] = gpu_index
            if torch.cuda.device_count() > 1:
                raise RuntimeError("Remove any CUDA actions before setting 'CUDA_VISIBLE_DEVICES'.")

        # Request both model variants up front.
        # NOTE(review): the first flag presumably selects OCR vs. non-OCR and
        # the second disables logging - confirm against ModelSingleton.get_model.
        manager = ModelSingleton()
        for first_flag in (True, False):
            manager.get_model(first_flag, False)
        print(f'Model initialization complete on {device}!')

    def decode_request(self, request):
        """Turn a request dict into ``(pdf_bytes, options)``.

        ``request['file']`` is converted with :meth:`to_pdf`. The options come
        from ``request['kwargs']`` (an empty dict when absent) with
        ``debug_able=False`` and ``parse_method='auto'`` filled in when the
        caller did not supply them.
        """
        pdf_bytes = self.to_pdf(request['file'])
        options = request.get('kwargs', {})
        defaults = (('debug_able', False), ('parse_method', 'auto'))
        for key, value in defaults:
            options.setdefault(key, value)
        return pdf_bytes, options

    def predict(self, inputs):
        """Run ``do_parse`` on the decoded request and return the job name.

        A fresh UUID string is used as the PDF name and is the return value.
        Any exception is re-raised as an HTTP 500 carrying the error text, and
        :meth:`clean_memory` runs whether or not parsing succeeded.
        """
        try:
            pdf_name = str(uuid4())
            do_parse(self.output_dir, pdf_name, inputs[0], [], **inputs[1])
            return pdf_name
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
        finally:
            self.clean_memory()

    def encode_response(self, response):
        """Wrap the value returned by :meth:`predict` under ``'output_dir'``."""
        payload = {'output_dir': response}
        return payload

    def clean_memory(self):
        """Empty the CUDA cache (when CUDA is available) and run the GC."""
        import gc
        cuda_ready = torch.cuda.is_available()
        if cuda_ready:
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
        gc.collect()

    def to_pdf(self, file_base64):
        """Decode a base64 payload and return it as PDF bytes.

        The decoded bytes are opened with fitz using the extension guessed by
        ``filetype``. PDF input is returned via ``tobytes()``; anything else is
        converted with ``convert_to_pdf()``. Any failure becomes an HTTP 500
        carrying the error text.
        """
        try:
            raw_bytes = base64.b64decode(file_base64)
            guessed_ext = guess_extension(raw_bytes)
            with fitz.open(stream=raw_bytes, filetype=guessed_ext) as doc:
                return doc.tobytes() if doc.is_pdf else doc.convert_to_pdf()
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))
if __name__ == '__main__':
    # API instance; parsed output goes under /tmp.
    api = MinerUAPI(output_dir='/tmp')

    # GPU serving on automatically selected devices, one worker per device,
    # with the request timeout turned off.
    server = ls.LitServer(
        api,
        accelerator='cuda',
        devices='auto',
        workers_per_device=1,
        timeout=False,
    )

    # Listen on port 8000.
    server.run(port=8000)
...@@ -5,7 +5,6 @@ PyMuPDF>=1.24.9 ...@@ -5,7 +5,6 @@ PyMuPDF>=1.24.9
loguru>=0.6.0 loguru>=0.6.0
numpy>=1.21.6,<2.0.0 numpy>=1.21.6,<2.0.0
fast-langdetect==0.2.0 fast-langdetect==0.2.0
wordninja>=2.0.0
scikit-learn>=1.0.2 scikit-learn>=1.0.2
pdfminer.six==20231228 pdfminer.six==20231228
unimernet==0.2.1 unimernet==0.2.1
...@@ -15,4 +14,5 @@ paddleocr==2.7.3 ...@@ -15,4 +14,5 @@ paddleocr==2.7.3
paddlepaddle==3.0.0b1 paddlepaddle==3.0.0b1
pypandoc pypandoc
struct-eqtable==0.1.0 struct-eqtable==0.1.0
doclayout-yolo==0.0.2
detectron2 detectron2
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment