app.py 5.46 KB
Newer Older
1
2
3
4
5
6
import copy
import json
import os
from tempfile import NamedTemporaryFile

import uvicorn
7
from fastapi import FastAPI, File, UploadFile
8
9
from fastapi.responses import JSONResponse
from loguru import logger
10
11
12

import magic_pdf.model as model_config
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
13
14
15
16
17
18
19
20
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.pipe.UNIPipe import UNIPipe

# Flag read by pdf_parse_main: when it is truthy and the request supplies no
# model JSON, the handler runs the pipeline's own pipe_analyze() step;
# when it is falsy the handler answers with HTTP 400 instead.
model_config.__use_inside_model__ = True

# Application object; the /pdf_parse route below is registered on it and the
# __main__ guard hands it to uvicorn.
app = FastAPI()

21

22
def json_md_dump(
    pipe,
    md_writer,
    pdf_name,
    content_list,
    md_content,
):
    """Persist every artefact of a parsing run through ``md_writer``.

    Four files are written, in this order, each named after ``pdf_name``:

    * ``<pdf_name>_model.json`` - a deep copy of ``pipe.model_list``
    * ``<pdf_name>_middle.json`` - ``pipe.pdf_mid_data``
    * ``<pdf_name>_content_list.json`` - ``content_list``
    * ``<pdf_name>.md`` - ``md_content`` written verbatim

    :param pipe: Object exposing ``model_list`` and ``pdf_mid_data``
    :param md_writer: Object exposing ``write_string(name, text)``
    :param pdf_name: Base name used as the prefix of every output file
    :param content_list: JSON-serialisable content list
    :param md_content: Markdown text to store in the .md file
    """

    def _emit_json(suffix, payload):
        # All JSON outputs share the same formatting: non-ASCII characters
        # are kept as-is and the document is indented by four spaces.
        target = f'{pdf_name}{suffix}'
        md_writer.write_string(
            target, json.dumps(payload, ensure_ascii=False, indent=4)
        )

    # Model results
    _emit_json('_model.json', copy.deepcopy(pipe.model_list))

    # Intermediate results
    _emit_json('_middle.json', pipe.pdf_mid_data)

    # Text content results
    _emit_json('_content_list.json', content_list)

    # Markdown output
    md_writer.write_string(f'{pdf_name}.md', md_content)

54
55

@app.post('/pdf_parse', tags=['projects'], summary='Parse PDF file')
async def pdf_parse_main(
    pdf_file: UploadFile = File(...),
    parse_method: str = 'auto',
    model_json_path: str = None,
    is_json_md_dump: bool = True,
    output_dir: str = 'output',
):
    """Execute the process of converting PDF to JSON and MD, outputting MD and
    JSON files to the specified directory.

    :param pdf_file: The PDF file to be parsed
    :param parse_method: Parsing method, can be auto, ocr, or txt. Default is
        auto. If results are not satisfactory, try ocr
    :param model_json_path: Path to existing model data file. If empty, use
        built-in model. PDF and model_json must correspond
    :param is_json_md_dump: Whether to write parsed data to .json and .md
        files. Default is True. Different stages of data will be written to
        different .json files (3 in total), md content will be saved to .md
        file
    :param output_dir: Output directory for results. A folder named after the
        PDF file will be created to store all results
    :return: A JSONResponse. Status 200 carries ``layout``, ``info``,
        ``content_list`` and ``md_content``; status 400 is returned for an
        unknown ``parse_method`` or when no model data is supplied while the
        built-in model is disabled; status 500 carries ``str(e)`` for any
        exception raised while processing
    """
    # NOTE(review): model_json_path and output_dir arrive straight from the
    # request and are used as filesystem paths without any restriction.
    # Confirm this endpoint is only reachable by trusted callers, or confine
    # both to an allowed base directory.

    # Initialised before the try block so the cleanup in `finally` can tell
    # whether a temporary file was actually created.
    temp_pdf_path = None
    try:
        # Create a temporary file to store the uploaded PDF
        with NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
            # Record the path first: the file is created with delete=False,
            # so it must still be removed if reading the upload or writing
            # it out fails.
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(await pdf_file.read())

        # Everything from the first dot onwards is dropped, so an upload
        # named 'a.b.pdf' yields 'a'.
        pdf_name = os.path.basename(pdf_file.filename).split('.')[0]

        if output_dir:
            output_path = os.path.join(output_dir, pdf_name)
        else:
            # Fall back to a folder next to the temporary file
            output_path = os.path.join(os.path.dirname(temp_pdf_path), pdf_name)

        output_image_path = os.path.join(output_path, 'images')

        # Get parent path of images for relative path in .md and content_list.json
        image_path_parent = os.path.basename(output_image_path)

        # Read binary data of PDF file; the context manager closes the handle
        with open(temp_pdf_path, 'rb') as pdf_stream:
            pdf_bytes = pdf_stream.read()

        if model_json_path:
            # Read original JSON data of PDF file parsed by model, list type
            with open(model_json_path, 'r', encoding='utf-8') as model_stream:
                model_json = json.load(model_stream)
        else:
            model_json = []

        # Execute parsing steps
        image_writer = FileBasedDataWriter(output_image_path)
        md_writer = FileBasedDataWriter(output_path)

        # Choose parsing method
        if parse_method == 'auto':
            jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        elif parse_method == 'txt':
            pipe = TXTPipe(pdf_bytes, model_json, image_writer)
        elif parse_method == 'ocr':
            pipe = OCRPipe(pdf_bytes, model_json, image_writer)
        else:
            logger.error('Unknown parse method, only auto, ocr, txt allowed')
            return JSONResponse(
                content={'error': 'Invalid parse method'}, status_code=400
            )

        # Execute classification
        pipe.pipe_classify()

        # If no model data is provided, use built-in model for parsing
        if not model_json:
            if model_config.__use_inside_model__:
                pipe.pipe_analyze()  # Parse
            else:
                logger.error('Need model list input')
                return JSONResponse(
                    content={'error': 'Model list input required'}, status_code=400
                )

        # Execute parsing
        pipe.pipe_parse()

        # Save results in text and md format
        content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode='none')
        md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode='none')

        if is_json_md_dump:
            json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)

        data = {
            'layout': copy.deepcopy(pipe.model_list),
            'info': pipe.pdf_mid_data,
            'content_list': content_list,
            'md_content': md_content,
        }
        return JSONResponse(data, status_code=200)

    except Exception as e:
        # Top-level boundary of the request: log the traceback and report
        # the failure to the client instead of propagating it.
        logger.exception(e)
        return JSONResponse(content={'error': str(e)}, status_code=500)
    finally:
        # Clean up the temporary file
        if temp_pdf_path is not None:
            os.unlink(temp_pdf_path)

155
156
157

# Script entry point: when the module is executed directly, serve `app` with
# uvicorn, binding to every network interface (0.0.0.0) on port 8888.
if __name__ == '__main__':
    uvicorn.run(app, host='0.0.0.0', port=8888)