import copy
import json
import os
from tempfile import NamedTemporaryFile

import uvicorn
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.operators.models import InferenceResult

# Module-level flag set on magic_pdf.model. The /pdf_parse handler in this
# file rejects a request that supplies no model list when this flag is falsy.
model_config.__use_inside_model__ = True

# FastAPI application object: routes are registered on it via decorators and
# it is handed to uvicorn in the __main__ guard at the bottom of the file.
app = FastAPI()

def json_md_dump(
    model_json,
    middle_json,
    md_writer,
    pdf_name,
    content_list,
    md_content,
):
    """Write all parsing results for one PDF through ``md_writer``.

    Four files are produced, each named after ``pdf_name``:

    * ``<pdf_name>_model.json``        -- model results
    * ``<pdf_name>_middle.json``       -- intermediate results
    * ``<pdf_name>_content_list.json`` -- text content results
    * ``<pdf_name>.md``                -- markdown content

    The JSON files are serialised with ``ensure_ascii=False`` and a 4-space
    indent, so non-ASCII text is stored as-is rather than escaped.

    :param model_json: JSON-serialisable model results
    :param middle_json: JSON-serialisable intermediate results
    :param md_writer: Object exposing ``write_string(name, text)``
    :param pdf_name: Base name used as the prefix of every output file
    :param content_list: JSON-serialisable content list
    :param md_content: Markdown text, written verbatim
    """

    def _as_json(payload):
        # Shared serialisation settings for all three JSON outputs.
        return json.dumps(payload, ensure_ascii=False, indent=4)

    # json.dumps does not mutate its argument, so the model results are
    # serialised directly instead of being deep-copied first.
    md_writer.write_string(f'{pdf_name}_model.json', _as_json(model_json))

    # Write intermediate results to middle.json
    md_writer.write_string(f'{pdf_name}_middle.json', _as_json(middle_json))

    # Write text content results to content_list.json
    md_writer.write_string(
        f'{pdf_name}_content_list.json', _as_json(content_list)
    )

    # Write results to .md file
    md_writer.write_string(f'{pdf_name}.md', md_content)

@app.post('/pdf_parse', tags=['projects'], summary='Parse PDF file')
async def pdf_parse_main(
    pdf_file: UploadFile = File(...),
    parse_method: str = 'auto',
    model_json_path: str = None,
    is_json_md_dump: bool = True,
    output_dir: str = 'output',
):
    """Execute the process of converting PDF to JSON and MD, outputting MD and
    JSON files to the specified directory.

    :param pdf_file: The PDF file to be parsed
    :param parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If results are not satisfactory, try ocr
    :param model_json_path: Path to existing model data file. If empty, use built-in model. PDF and model_json must correspond
    :param is_json_md_dump: Whether to write parsed data to .json and .md files. Default is True. Different stages of data will be written to different .json files (3 in total), md content will be saved to .md file  # noqa E501
    :param output_dir: Output directory for results. A folder named after the PDF file will be created to store all results
    :return: ``JSONResponse`` with status 200 and the keys ``layout``,
        ``info``, ``content_list`` and ``md_content`` on success; status 400
        with an ``error`` key for an invalid ``parse_method`` or a missing
        model list; status 500 with an ``error`` key for any other exception
    """
    # NOTE(review): model_json_path and output_dir arrive with the request and
    # are used as filesystem paths unchanged -- confirm this endpoint is only
    # reachable by trusted callers, or restrict both to an allowed directory.

    # Bound before the try block so the cleanup in `finally` can always test it.
    temp_pdf_path = None
    try:
        # Reject an unsupported method before touching the filesystem.
        if parse_method not in ('auto', 'ocr', 'txt'):
            logger.error('Unknown parse method, only auto, ocr, txt allowed')
            return JSONResponse(
                content={'error': 'Invalid parse method'}, status_code=400
            )

        # Read the upload once; the same bytes feed the temp file and the dataset.
        pdf_bytes = await pdf_file.read()

        # Create a temporary file to store the uploaded PDF. Its directory is
        # the fallback output location when output_dir is empty. The path is
        # recorded before writing so a failed write still gets cleaned up.
        with NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(pdf_bytes)

        pdf_name = os.path.basename(pdf_file.filename).split('.')[0]

        if output_dir:
            output_path = os.path.join(output_dir, pdf_name)
        else:
            output_path = os.path.join(os.path.dirname(temp_pdf_path), pdf_name)

        output_image_path = os.path.join(output_path, 'images')

        # Get parent path of images for relative path in .md and content_list.json
        image_path_parent = os.path.basename(output_image_path)

        if model_json_path:
            # Read original JSON data of PDF file parsed by model, list type
            with open(model_json_path, 'r', encoding='utf-8') as model_file:
                model_json = json.load(model_file)
        else:
            model_json = []

        # Without a supplied model list the built-in model must be enabled;
        # checked before any inference work is started.
        if not model_json and not model_config.__use_inside_model__:
            logger.error('Need model list input')
            return JSONResponse(
                content={'error': 'Model list input required'}, status_code=400
            )

        # Execute parsing steps
        image_writer = FileBasedDataWriter(output_image_path)
        md_writer = FileBasedDataWriter(output_path)

        ds = PymuDocDataset(pdf_bytes)

        # Choose parsing method
        if parse_method == 'auto':
            if ds.classify() == SupportedPdfParseMethod.OCR:
                parse_method = 'ocr'
            else:
                parse_method = 'txt'
        use_ocr = parse_method == 'ocr'

        if model_json:
            # Reuse the caller-supplied model output instead of running inference.
            infer_result = InferenceResult(model_json, ds)
        else:
            infer_result = ds.apply(doc_analyze, ocr=use_ocr)

        if use_ocr:
            pipe_res = infer_result.pipe_ocr_mode(image_writer)
        else:
            pipe_res = infer_result.pipe_txt_mode(image_writer)

        # Save results in text and md format
        content_list = pipe_res.get_content_list(image_path_parent, drop_mode='none')
        md_content = pipe_res.get_markdown(image_path_parent, drop_mode='none')

        if is_json_md_dump:
            json_md_dump(
                infer_result._infer_res,
                pipe_res._pipe_res,
                md_writer,
                pdf_name,
                content_list,
                md_content,
            )

        data = {
            'layout': copy.deepcopy(infer_result._infer_res),
            'info': pipe_res._pipe_res,
            'content_list': content_list,
            'md_content': md_content,
        }
        return JSONResponse(data, status_code=200)

    except Exception as e:
        # Top-level boundary of the endpoint: log the traceback and report
        # the failure to the client as a 500.
        logger.exception(e)
        return JSONResponse(content={'error': str(e)}, status_code=500)
    finally:
        # Clean up the temporary file
        if temp_pdf_path is not None:
            os.unlink(temp_pdf_path)

# Script entry point: when this file is executed directly, serve `app` with
# uvicorn on host 0.0.0.0 (all network interfaces) and port 8888.
if __name__ == '__main__':
    uvicorn.run(app, host='0.0.0.0', port=8888)