Unverified Commit f559fd9c authored by utopia2077's avatar utopia2077 Committed by shniubobo
Browse files

refactor(web_api): adapt to new pipeline API changes

parent e9203f91
# Use the official Ubuntu base image # Use the official Ubuntu base image
FROM ubuntu:latest FROM ubuntu:22.04
# ENV http_proxy http://127.0.0.1:7890
# ENV https_proxy http://127.0.0.1:7890
# Set environment variables to non-interactive to avoid prompts during installation # Set environment variables to non-interactive to avoid prompts during installation
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ENV LANG C.UTF-8 ENV LANG C.UTF-8
# ADD sources.list /etc/apt
# RUN apt-get clean
# Update the package list and install necessary packages # Update the package list and install necessary packages
RUN apt-get -q update \ RUN apt-get -q update && \
&& apt-get -q install -y --no-install-recommends \ apt-get -q install -y --no-install-recommends \
apt-utils \ build-essential \
bats \ software-properties-common \
build-essential # gpg \
RUN apt-get update && apt-get install -y vim net-tools procps lsof curl wget iputils-ping telnet lrzsz git # && add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update \
RUN apt-get update && \ && apt-get install -y \
apt-get install -y \
software-properties-common && \
add-apt-repository ppa:deadsnakes/ppa && \
apt-get update && \
apt-get install -y \
python3.10 \ python3.10 \
python3.10-venv \ python3.10-venv \
python3.10-distutils \ python3.10-distutils \
...@@ -35,41 +22,31 @@ RUN apt-get update && \ ...@@ -35,41 +22,31 @@ RUN apt-get update && \
git \ git \
libgl1 \ libgl1 \
libglib2.0-0 \ libglib2.0-0 \
&& rm -rf /var/lib/apt/lists/* && apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# RUN unset http_proxy && unset https_proxy
# Set Python 3.10 as the default python3 # Set Python 3.10 as the default python3
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
# Create a virtual environment for MinerU # Create a virtual environment for MinerU and install packages
RUN python3 -m venv /opt/mineru_venv RUN python3 -m venv /opt/mineru_venv && \
RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple pip config set global.index-url https://mirrors.aliyun.com/pypi/simple && \
# Activate the virtual environment and install necessary Python packages /bin/bash -c "source /opt/mineru_venv/bin/activate && \
RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
pip install --upgrade pip && \ pip install --upgrade pip && \
pip install magic-pdf[full] --extra-index-url https://myhloli.github.io/wheels/ --no-cache-dir" pip install magic-pdf[full] --extra-index-url https://myhloli.github.io/wheels/ --no-cache-dir && \
pip install fastapi uvicorn python-multipart --no-cache-dir && \
pip uninstall paddlepaddle -y && \
RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \ pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ --no-cache-dir"
pip install fastapi uvicorn python-multipart --no-cache-dir"
RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
pip uninstall paddlepaddle -y"
RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ --no-cache-dir"
# Copy the configuration file template and set up the model directory # Copy the configuration file template and set up the model directory
COPY magic-pdf.template.json /root/magic-pdf.json COPY models/models /opt/models
ADD models /opt/models COPY layoutreader /opt/layoutreader
ADD .paddleocr /root/.paddleocr COPY .paddleocr /root/.paddleocr
ADD app.py /root/app.py COPY app.py /root/app.py
COPY magic-pdf.json /root/magic-pdf.json
WORKDIR /root WORKDIR /root
# Set the models directory in the configuration file (adjust the path as needed)
RUN sed -i 's|/tmp/models|/opt/models|g' /root/magic-pdf.json
# Create the models directory # Create the models directory
# RUN mkdir -p /opt/models # RUN mkdir -p /opt/models
......
...@@ -42,3 +42,7 @@ ...@@ -42,3 +42,7 @@
> dockerhub地址:docker pull quincyqiang/mineru:0.1-models > dockerhub地址:docker pull quincyqiang/mineru:0.1-models
## 构建方式:
1. 拷贝`hantian/layoutreader`,`opendatalab/PDF-Extract-Kit-1.0`,`paddleocr`模型到当前目录。
2. `docker build --build-arg http_proxy=http://127.0.0.1:7890 --build-arg https_proxy=http://127.0.0.1:7890 -t mineru-api .`
import copy
import json import json
import os import os
from tempfile import NamedTemporaryFile from io import StringIO
from typing import Tuple, Union
import uvicorn import uvicorn
from fastapi import FastAPI, File, UploadFile from fastapi import FastAPI, HTTPException, UploadFile
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from loguru import logger from loguru import logger
import magic_pdf.model as model_config import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import FileBasedDataWriter from magic_pdf.data.data_reader_writer import DataWriter, FileBasedDataWriter
from magic_pdf.data.data_reader_writer.s3 import S3DataReader, S3DataWriter
from magic_pdf.data.dataset import PymuDocDataset from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.libs.config_reader import get_bucket_name, get_s3_config
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.operators.models import InferenceResult from magic_pdf.operators.models import InferenceResult
from magic_pdf.operators.pipes import PipeResult
model_config.__use_inside_model__ = True model_config.__use_inside_model__ = True
app = FastAPI() app = FastAPI()
def json_md_dump(
    model_json,
    middle_json,
    md_writer,
    pdf_name,
    content_list,
    md_content,
):
    """Write every parse artifact of one PDF through ``md_writer``.

    Four entries are written, in this order, each named after ``pdf_name``:
    ``<pdf_name>_model.json``, ``<pdf_name>_middle.json``,
    ``<pdf_name>_content_list.json`` and ``<pdf_name>.md``.

    Args:
        model_json: JSON-serializable model results.
        middle_json: JSON-serializable intermediate results.
        md_writer: Object exposing ``write_string(path, data)``.
        pdf_name: Base name used as the prefix of every written entry.
        content_list: JSON-serializable text content results.
        md_content: Markdown text, written as-is.
    """
    json_artifacts = (
        ('model', model_json),
        ('middle', middle_json),
        ('content_list', content_list),
    )
    for suffix, payload in json_artifacts:
        # json.dumps only reads its argument, so the payload is serialized
        # directly instead of being deep-copied first.
        md_writer.write_string(
            f'{pdf_name}_{suffix}.json',
            json.dumps(payload, ensure_ascii=False, indent=4),
        )

    # The markdown body is already a string; no serialization step needed.
    md_writer.write_string(
        f'{pdf_name}.md',
        md_content,
    )
@app.post('/pdf_parse', tags=['projects'], summary='Parse PDF file')
async def pdf_parse_main(
pdf_file: UploadFile = File(...),
parse_method: str = 'auto',
model_json_path: str = None,
is_json_md_dump: bool = True,
output_dir: str = 'output',
):
"""Execute the process of converting PDF to JSON and MD, outputting MD and
JSON files to the specified directory.
:param pdf_file: The PDF file to be parsed
:param parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If results are not satisfactory, try ocr
:param model_json_path: Path to existing model data file. If empty, use built-in model. PDF and model_json must correspond
:param is_json_md_dump: Whether to write parsed data to .json and .md files. Default is True. Different stages of data will be written to different .json files (3 in total), md content will be saved to .md file # noqa E501
:param output_dir: Output directory for results. A folder named after the PDF file will be created to store all results
"""
try:
# Create a temporary file to store the uploaded PDF
with NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
temp_pdf.write(await pdf_file.read())
temp_pdf_path = temp_pdf.name
pdf_name = os.path.basename(pdf_file.filename).split('.')[0] class MemoryDataWriter(DataWriter):
def __init__(self):
self.buffer = StringIO()
if output_dir: def write(self, path: str, data: bytes) -> None:
output_path = os.path.join(output_dir, pdf_name) if isinstance(data, str):
self.buffer.write(data)
else: else:
output_path = os.path.join(os.path.dirname(temp_pdf_path), pdf_name) self.buffer.write(data.decode('utf-8'))
output_image_path = os.path.join(output_path, 'images') def write_string(self, path: str, data: str) -> None:
self.buffer.write(data)
# Get parent path of images for relative path in .md and content_list.json def get_value(self) -> str:
image_path_parent = os.path.basename(output_image_path) return self.buffer.getvalue()
pdf_bytes = open(temp_pdf_path, 'rb').read() # Read binary data of PDF file def close(self):
self.buffer.close()
if model_json_path: def init_writers(
# Read original JSON data of PDF file parsed by model, list type pdf_path: str = None,
model_json = json.loads(open(model_json_path, 'r', encoding='utf-8').read()) pdf_file: UploadFile = None,
else: output_path: str = None,
model_json = [] output_image_path: str = None,
) -> Tuple[Union[S3DataWriter, FileBasedDataWriter], Union[S3DataWriter, FileBasedDataWriter], bytes]:
# Execute parsing steps """
image_writer, md_writer = FileBasedDataWriter( Initialize writers based on path type
output_image_path
), FileBasedDataWriter(output_path)
ds = PymuDocDataset(pdf_bytes)
# Choose parsing method
if parse_method == 'auto':
if ds.classify() == SupportedPdfParseMethod.OCR:
parse_method = 'ocr'
else:
parse_method = 'txt'
if parse_method not in ['txt', 'ocr']:
logger.error('Unknown parse method, only auto, ocr, txt allowed')
return JSONResponse(
content={'error': 'Invalid parse method'}, status_code=400
)
if len(model_json) == 0:
if parse_method == 'ocr':
infer_result = ds.apply(doc_analyze, ocr=True)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
Args:
pdf_path: PDF file path (local path or S3 path)
pdf_file: Uploaded PDF file object
output_path: Output directory path
output_image_path: Image output directory path
Returns:
Tuple[writer, image_writer, pdf_bytes]: Returns initialized writer tuple and PDF file content
"""
if pdf_path:
is_s3_path = pdf_path.startswith('s3://')
if is_s3_path:
bucket = get_bucket_name(pdf_path)
ak, sk, endpoint = get_s3_config(bucket)
writer = S3DataWriter(output_path, bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint)
image_writer = S3DataWriter(output_image_path, bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint)
# 临时创建reader读取文件内容
temp_reader = S3DataReader("", bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint)
pdf_bytes = temp_reader.read(pdf_path)
else: else:
infer_result = InferenceResult(model_json, ds) writer = FileBasedDataWriter(output_path)
image_writer = FileBasedDataWriter(output_image_path)
if len(model_json) == 0 and not model_config.__use_inside_model__: os.makedirs(output_image_path, exist_ok=True)
logger.error('Need model list input') with open(pdf_path, 'rb') as f:
return JSONResponse( pdf_bytes = f.read()
content={'error': 'Model list input required'}, status_code=400 else:
) # 处理上传的文件
if parse_method == 'ocr': pdf_bytes = pdf_file.file.read()
pipe_res = infer_result.pipe_ocr_mode(image_writer) writer = FileBasedDataWriter(output_path)
else: image_writer = FileBasedDataWriter(output_image_path)
pipe_res = infer_result.pipe_txt_mode(image_writer) os.makedirs(output_image_path, exist_ok=True)
return writer, image_writer, pdf_bytes
def process_pdf(
    pdf_bytes: bytes,
    parse_method: str,
    image_writer: Union[S3DataWriter, FileBasedDataWriter]
) -> Tuple[InferenceResult, PipeResult]:
    """
    Run model inference and the matching pipeline step on a PDF.

    Args:
        pdf_bytes: Binary content of the PDF file
        parse_method: 'ocr' or 'txt' forces that mode; any other value
            (normally 'auto') defers to the dataset's own classification
        image_writer: Writer handed to the pipeline step

    Returns:
        Tuple[InferenceResult, PipeResult]: The inference result and the
        pipeline result produced from it
    """
    dataset = PymuDocDataset(pdf_bytes)

    # Decide the mode once; classify() is only consulted when the caller
    # did not force a mode.
    if parse_method == 'ocr':
        use_ocr = True
    elif parse_method == 'txt':
        use_ocr = False
    else:  # auto
        use_ocr = dataset.classify() == SupportedPdfParseMethod.OCR

    if use_ocr:
        infer_result = dataset.apply(doc_analyze, ocr=True)
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    else:
        infer_result = dataset.apply(doc_analyze, ocr=False)
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    return infer_result, pipe_result
content_list = pipe_res.get_content_list(image_path_parent, drop_mode='none')
md_content = pipe_res.get_markdown(image_path_parent, drop_mode='none')
@app.post('/pdf_parse', tags=['projects'], summary='Parse PDF files (supports local files and S3)')
async def pdf_parse(
pdf_file: UploadFile = None,
pdf_path: str = None,
parse_method: str = 'auto',
is_json_md_dump: bool = True,
output_dir: str = 'output',
return_layout: bool = False,
return_info: bool = False,
return_content_list: bool = False,
):
try:
if pdf_file is None and pdf_path is None:
raise HTTPException(status_code=400, detail="Must provide either pdf_file or pdf_path")
# Get PDF filename
pdf_name = os.path.basename(pdf_path if pdf_path else pdf_file.filename).split('.')[0]
output_path = f"{output_dir}/{pdf_name}"
output_image_path = f"{output_path}/images"
# Initialize readers/writers and get PDF content
writer, image_writer, pdf_bytes = init_writers(
pdf_path=pdf_path,
pdf_file=pdf_file,
output_path=output_path,
output_image_path=output_image_path
)
# Process PDF
infer_result, pipe_result = process_pdf(pdf_bytes, parse_method, image_writer)
# Use MemoryDataWriter to get results
content_list_writer = MemoryDataWriter()
md_content_writer = MemoryDataWriter()
middle_json_writer = MemoryDataWriter()
# Use PipeResult's dump method to get data
pipe_result.dump_content_list(content_list_writer, "", "images")
pipe_result.dump_md(md_content_writer, "", "images")
pipe_result.dump_middle_json(middle_json_writer, "")
# Get content
content_list = json.loads(content_list_writer.get_value())
md_content = md_content_writer.get_value()
middle_json = json.loads(middle_json_writer.get_value())
model_json = infer_result.get_infer_res()
# If results need to be saved
if is_json_md_dump: if is_json_md_dump:
json_md_dump(infer_result._infer_res, pipe_res._pipe_res, md_writer, pdf_name, content_list, md_content) writer.write_string(f"{pdf_name}_content_list.json", content_list_writer.get_value())
data = { writer.write_string(f"{pdf_name}.md", md_content)
'layout': copy.deepcopy(infer_result._infer_res), writer.write_string(f"{pdf_name}_middle.json", middle_json_writer.get_value())
'info': pipe_res._pipe_res, writer.write_string(f"{pdf_name}_model.json", json.dumps(model_json, indent=4, ensure_ascii=False))
'content_list': content_list, # Save visualization results
'md_content': md_content, pipe_result.draw_layout(os.path.join(output_path, f'{pdf_name}_layout.pdf'))
} pipe_result.draw_span(os.path.join(output_path, f'{pdf_name}_spans.pdf'))
pipe_result.draw_line_sort(os.path.join(output_path, f'{pdf_name}_line_sort.pdf'))
infer_result.draw_model(os.path.join(output_path, f'{pdf_name}_model.pdf'))
# Build return data
data = {}
if return_layout:
data['layout'] = model_json
if return_info:
data['info'] = middle_json
if return_content_list:
data['content_list'] = content_list
data['md_content'] = md_content # md_content is always returned
# Clean up memory writers
content_list_writer.close()
md_content_writer.close()
middle_json_writer.close()
return JSONResponse(data, status_code=200) return JSONResponse(data, status_code=200)
except Exception as e: except Exception as e:
logger.exception(e) logger.exception(e)
return JSONResponse(content={'error': str(e)}, status_code=500) return JSONResponse(content={'error': str(e)}, status_code=500)
finally:
# Clean up the temporary file
if 'temp_pdf_path' in locals():
os.unlink(temp_pdf_path)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -4,10 +4,20 @@ ...@@ -4,10 +4,20 @@
"bucket-name-2":["ak", "sk", "endpoint"] "bucket-name-2":["ak", "sk", "endpoint"]
}, },
"models-dir":"/opt/models", "models-dir":"/opt/models",
"layoutreader-model-dir":"/opt/layoutreader",
"device-mode":"cuda", "device-mode":"cuda",
"layout-config": {
"model": "layoutlmv3"
},
"formula-config": {
"mfd_model": "yolo_v8_mfd",
"mfr_model": "unimernet_small",
"enable": true
},
"table-config": { "table-config": {
"model": "TableMaster", "model": "rapid_table",
"is_table_recog_enable": false, "enable": false,
"max_time": 400 "max_time": 400
} },
"config_version": "1.0.0"
} }
{
"bucket_info":{
"bucket-name-1":["ak", "sk", "endpoint"],
"bucket-name-2":["ak", "sk", "endpoint"]
},
"models-dir":"/tmp/models",
"device-mode":"cuda",
"table-config": {
"model": "TableMaster",
"is_table_recog_enable": false,
"max_time": 400
}
}
\ No newline at end of file
deb http://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment