app.py 9.08 KB
Newer Older
1
2
import json
import os
3
4
from io import StringIO
from typing import Tuple, Union
5
6

import uvicorn
7
from fastapi import FastAPI, HTTPException, UploadFile
8
9
from fastapi.responses import JSONResponse
from loguru import logger
10
11

import magic_pdf.model as model_config
icecraft's avatar
icecraft committed
12
from magic_pdf.config.enums import SupportedPdfParseMethod
13
14
from magic_pdf.data.data_reader_writer import DataWriter, FileBasedDataWriter
from magic_pdf.data.data_reader_writer.s3 import S3DataReader, S3DataWriter
icecraft's avatar
icecraft committed
15
from magic_pdf.data.dataset import PymuDocDataset
16
from magic_pdf.libs.config_reader import get_bucket_name, get_s3_config
icecraft's avatar
icecraft committed
17
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
18
from magic_pdf.operators.models import InferenceResult
19
from magic_pdf.operators.pipes import PipeResult
20
21
22
23
24

# Set magic_pdf's module-level flag at import time, before any request is served.
# NOTE(review): presumably this tells magic_pdf to use its built-in models —
# confirm against the magic_pdf.model documentation.
model_config.__use_inside_model__ = True

# FastAPI application object; the endpoint in this file registers on it.
app = FastAPI()

25

26
27
28
class MemoryDataWriter(DataWriter):
    """DataWriter that collects everything written to it in an in-memory text buffer.

    The ``path`` argument of the write methods is accepted but not used: every
    payload is appended to the same buffer, in call order.
    """

    def __init__(self):
        # One buffer per writer instance; successive writes are concatenated.
        self.buffer = StringIO()

    def write(self, path: str, data: bytes) -> None:
        """Append ``data`` to the buffer.

        Although annotated as ``bytes``, a ``str`` payload is also accepted and
        appended unchanged; anything else is decoded as UTF-8 first.
        """
        text = data if isinstance(data, str) else data.decode("utf-8")
        self.buffer.write(text)

    def write_string(self, path: str, data: str) -> None:
        """Append the string ``data`` to the buffer."""
        self.buffer.write(data)

    def get_value(self) -> str:
        """Return everything written so far as a single string."""
        return self.buffer.getvalue()

    def close(self):
        """Close the underlying buffer; its content is no longer readable afterwards."""
        self.buffer.close()
44

shniubobo's avatar
shniubobo committed
45

46
47
48
49
50
def init_writers(
    pdf_path: str = None,
    pdf_file: UploadFile = None,
    output_path: str = None,
    output_image_path: str = None,
) -> Tuple[
    Union[S3DataWriter, FileBasedDataWriter],
    Union[S3DataWriter, FileBasedDataWriter],
    bytes,
]:
    """
    Create the result/image writers and load the PDF content.

    The source is chosen as follows: a truthy ``pdf_path`` wins and is read
    from S3 when it starts with ``s3://``, otherwise from the local file
    system; when ``pdf_path`` is falsy the uploaded ``pdf_file`` is read.

    Args:
        pdf_path: PDF file path (local path or S3 path)
        pdf_file: Uploaded PDF file object, used only when ``pdf_path`` is falsy
        output_path: Output directory path
        output_image_path: Image output directory path

    Returns:
        Tuple[writer, image_writer, pdf_bytes]: the writer for result files, the
        writer for extracted images, and the raw PDF content
    """
    if not pdf_path:
        # Uploaded file: read it first, then write results to the local file system.
        pdf_bytes = pdf_file.file.read()
        writer = FileBasedDataWriter(output_path)
        image_writer = FileBasedDataWriter(output_image_path)
        os.makedirs(output_image_path, exist_ok=True)
        return writer, image_writer, pdf_bytes

    if not pdf_path.startswith("s3://"):
        # Local path: results are written to the local file system as well.
        writer = FileBasedDataWriter(output_path)
        image_writer = FileBasedDataWriter(output_image_path)
        os.makedirs(output_image_path, exist_ok=True)
        with open(pdf_path, "rb") as pdf_handle:
            pdf_bytes = pdf_handle.read()
        return writer, image_writer, pdf_bytes

    # S3 path: the writers and the reader all use the credentials of the
    # bucket that holds the source PDF.
    bucket = get_bucket_name(pdf_path)
    ak, sk, endpoint = get_s3_config(bucket)
    s3_options = {"bucket": bucket, "ak": ak, "sk": sk, "endpoint_url": endpoint}

    writer = S3DataWriter(output_path, **s3_options)
    image_writer = S3DataWriter(output_image_path, **s3_options)
    # Temporary reader, created only to fetch the PDF content.
    pdf_bytes = S3DataReader("", **s3_options).read(pdf_path)
    return writer, image_writer, pdf_bytes

shniubobo's avatar
shniubobo committed
101

102
103
104
def process_pdf(
    pdf_bytes: bytes,
    parse_method: str,
    image_writer: Union[S3DataWriter, FileBasedDataWriter],
) -> Tuple[InferenceResult, PipeResult]:
    """
    Run model inference and the matching pipeline on a PDF.

    Args:
        pdf_bytes: Binary content of PDF file
        parse_method: Parse method. 'ocr' and 'txt' force the respective mode;
            any other value (normally 'auto') lets the dataset's own
            classification decide
        image_writer: Writer handed to the pipeline stage

    Returns:
        Tuple[InferenceResult, PipeResult]: Returns inference result and pipeline result
    """
    ds = PymuDocDataset(pdf_bytes)

    # Decide the mode once; the dataset is classified only when the caller did
    # not force a mode.
    if parse_method == "ocr":
        use_ocr = True
    elif parse_method == "txt":
        use_ocr = False
    else:  # auto
        use_ocr = ds.classify() == SupportedPdfParseMethod.OCR

    if use_ocr:
        infer_result = ds.apply(doc_analyze, ocr=True)
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    else:
        infer_result = ds.apply(doc_analyze, ocr=False)
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    return infer_result, pipe_result
137

shniubobo's avatar
shniubobo committed
138
139
140
141
142
143

@app.post(
    "/pdf_parse",
    tags=["projects"],
    summary="Parse PDF files (supports local files and S3)",
)
async def pdf_parse(
    pdf_file: UploadFile = None,
    pdf_path: str = None,
    parse_method: str = "auto",
    is_json_md_dump: bool = True,
    output_dir: str = "output",
    return_layout: bool = False,
    return_info: bool = False,
    return_content_list: bool = False,
):
    """
    Execute the process of converting PDF to JSON and MD, outputting MD and JSON files
    to the specified directory.

    Responds with status 400 when not exactly one of `pdf_file` / `pdf_path` is
    given, with status 500 and the error text when processing raises, and with
    status 200 and the requested data otherwise.

    :param pdf_file: The PDF file to be parsed. Must not be specified together with
        `pdf_path`
    :param pdf_path: The path to the PDF file to be parsed. Must not be specified
        together with `pdf_file`. An empty string is treated as not specified
    :param parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If
        results are not satisfactory, try ocr
    :param is_json_md_dump: Whether to write parsed data to .json and .md files. Default
        is True. Different stages of data will be written to different .json files (3 in
        total), md content will be saved to .md file
    :param output_dir: Output directory for results. A folder named after the PDF file
        (file name without its final extension) is used to store all results
    :param return_layout: Whether to return parsed PDF layout. Default to False
    :param return_info: Whether to return parsed PDF info. Default to False
    :param return_content_list: Whether to return parsed PDF content list. Default to
        False
    """
    try:
        # Exactly one source must be supplied. An empty pdf_path counts as
        # "not supplied"; otherwise it would pass validation and then fail
        # further down with an opaque 500.
        has_file = pdf_file is not None
        has_path = bool(pdf_path)
        if has_file == has_path:
            return JSONResponse(
                content={"error": "Must provide either pdf_file or pdf_path"},
                status_code=400,
            )

        # Name results after the PDF. splitext drops only the final extension,
        # so "report.v2.pdf" becomes "report.v2" instead of being cut at the
        # first dot and colliding with other "report.*" files.
        source_name = pdf_path if has_path else pdf_file.filename
        pdf_name = os.path.splitext(os.path.basename(source_name))[0]

        # NOTE(review): output_dir comes straight from the request and is used
        # unsanitised to build output paths; confirm that only trusted callers
        # can reach this endpoint.
        output_path = f"{output_dir}/{pdf_name}"
        output_image_path = f"{output_path}/images"

        # Initialize readers/writers and get PDF content
        writer, image_writer, pdf_bytes = init_writers(
            pdf_path=pdf_path,
            pdf_file=pdf_file,
            output_path=output_path,
            output_image_path=output_image_path,
        )

        # Process PDF
        infer_result, pipe_result = process_pdf(pdf_bytes, parse_method, image_writer)

        # Dump the pipeline results into in-memory writers so they can be both
        # returned in the response and, optionally, persisted.
        content_list_writer = MemoryDataWriter()
        md_content_writer = MemoryDataWriter()
        middle_json_writer = MemoryDataWriter()
        try:
            pipe_result.dump_content_list(content_list_writer, "", "images")
            pipe_result.dump_md(md_content_writer, "", "images")
            pipe_result.dump_middle_json(middle_json_writer, "")

            # Pull the text out before the buffers are closed.
            content_list_text = content_list_writer.get_value()
            md_content = md_content_writer.get_value()
            middle_json_text = middle_json_writer.get_value()
        finally:
            # Release the buffers even when one of the dumps raises.
            content_list_writer.close()
            md_content_writer.close()
            middle_json_writer.close()

        content_list = json.loads(content_list_text)
        middle_json = json.loads(middle_json_text)
        model_json = infer_result.get_infer_res()

        # If results need to be saved
        if is_json_md_dump:
            writer.write_string(f"{pdf_name}_content_list.json", content_list_text)
            writer.write_string(f"{pdf_name}.md", md_content)
            writer.write_string(f"{pdf_name}_middle.json", middle_json_text)
            writer.write_string(
                f"{pdf_name}_model.json",
                json.dumps(model_json, indent=4, ensure_ascii=False),
            )
            # Save visualization results. These calls receive plain paths under
            # output_path rather than going through `writer`.
            # NOTE(review): for S3 sources that path may not exist locally — confirm.
            pipe_result.draw_layout(os.path.join(output_path, f"{pdf_name}_layout.pdf"))
            pipe_result.draw_span(os.path.join(output_path, f"{pdf_name}_spans.pdf"))
            pipe_result.draw_line_sort(
                os.path.join(output_path, f"{pdf_name}_line_sort.pdf")
            )
            infer_result.draw_model(os.path.join(output_path, f"{pdf_name}_model.pdf"))

        # Build return data
        data = {}
        if return_layout:
            data["layout"] = model_json
        if return_info:
            data["info"] = middle_json
        if return_content_list:
            data["content_list"] = content_list
        data["md_content"] = md_content  # md_content is always returned

        return JSONResponse(data, status_code=200)

    except Exception as e:
        # Top-level boundary of the endpoint: log the traceback and report the
        # failure to the client instead of letting it propagate.
        logger.exception(e)
        return JSONResponse(content={"error": str(e)}, status_code=500)
258

259

shniubobo's avatar
shniubobo committed
260
261
# When this file is executed directly as a script, serve the app with uvicorn
# on all network interfaces (0.0.0.0), port 8888.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8888)