import os
import sys

import click
import fitz
from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.model.operators import InferenceResult

# from io import BytesIO
# from pypdf import PdfReader, PdfWriter


def prepare_env(output_dir, pdf_file_name, method):
    """Create the output directories for one PDF and return their paths.

    The layout is ``<output_dir>/<pdf_file_name>/<method>/`` for markdown and
    other artifacts, with an ``images`` sub-directory inside it.

    Args:
        output_dir: Root directory under which results are written.
        pdf_file_name: Name used for the per-document sub-directory.
        method: Name used for the per-method sub-directory.

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple of directory paths; both
        directories exist when the function returns.
    """
    local_md_dir = os.path.join(output_dir, pdf_file_name, method)
    local_image_dir = os.path.join(local_md_dir, 'images')
    # The image directory lives inside the markdown directory, so creating it
    # (with intermediate directories) creates both.
    os.makedirs(local_image_dir, exist_ok=True)
    return local_image_dir, local_md_dir


# def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None):
#     # Wrap the byte data in a BytesIO object
#     pdf_file = BytesIO(pdf_bytes)
#     # Read the PDF from the byte data
#     reader = PdfReader(pdf_file)
#     # Create a new PDF writer
#     writer = PdfWriter()
#     # Add the pages in the requested range to the new PDF writer
#     end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1
#     if end_page_id > len(reader.pages) - 1:
#         logger.warning("end_page_id is out of range, use pdf_docs length")
#         end_page_id = len(reader.pages) - 1
#     for i, page in enumerate(reader.pages):
#         if start_page_id <= i <= end_page_id:
#             writer.add_page(page)
#     # Create a byte buffer to hold the output PDF data
#     output_buffer = BytesIO()
#     # Write the PDF into the byte buffer
#     writer.write(output_buffer)
#     # Get the contents of the byte buffer
#     converted_pdf_bytes = output_buffer.getvalue()
#     return converted_pdf_bytes


def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
    """Return a new PDF (as bytes) containing a page range of the input PDF.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        start_page_id: Index of the first page to copy, passed to
            ``insert_pdf`` as ``from_page``.
        end_page_id: Index of the last page to copy, passed to ``insert_pdf``
            as ``to_page``. ``None`` or a negative value selects the last
            page of the document; a value past the last page is clamped to
            the last page and a warning is logged.

    Returns:
        The bytes of a newly built PDF holding only the selected pages.
    """
    # Both documents are closed on exit, including when PyMuPDF raises.
    with fitz.open('pdf', pdf_bytes) as document, fitz.open() as output_document:
        last_page_id = len(document) - 1

        if end_page_id is None or end_page_id < 0:
            end_page_id = last_page_id
        elif end_page_id > last_page_id:
            logger.warning('end_page_id is out of range, use pdf_docs length')
            end_page_id = last_page_id

        output_document.insert_pdf(
            document, from_page=start_page_id, to_page=end_page_id
        )
        # Serialise before the context manager closes the output document.
        return output_document.tobytes()


def do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes,
    model_list,
    parse_method,
    debug_able,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Parse a PDF and write the requested artifacts below ``output_dir``.

    The PDF is first cut down to ``start_page_id``..``end_page_id``; the
    result is analysed (either with the built-in model or from a supplied
    ``model_list``), run through the txt/ocr/auto pipeline, and the outputs
    selected by the ``f_*`` flags are written to
    ``<output_dir>/<pdf_file_name>/<parse_method>/``.

    Args:
        output_dir: Root directory for all outputs.
        pdf_file_name: Base name used for the output sub-directory and as the
            prefix of every output file.
        pdf_bytes: Raw bytes of the source PDF.
        model_list: Pre-computed model output. When empty (or ``None``) the
            built-in model is run, provided
            ``magic_pdf.model.__use_inside_model__`` is truthy.
        parse_method: ``'ocr'``, ``'txt'`` or ``'auto'``.
        debug_able: When truthy, forces ``f_draw_model_bbox`` and
            ``f_draw_line_sort_bbox`` on.
        f_draw_span_bbox: Write ``<name>_spans.pdf``.
        f_draw_layout_bbox: Write ``<name>_layout.pdf``.
        f_dump_md: Write ``<name>.md``.
        f_dump_middle_json: Write ``<name>_middle.json``.
        f_dump_model_json: Write ``<name>_model.json``.
        f_dump_orig_pdf: Write ``<name>_origin.pdf`` (the page-sliced PDF).
        f_dump_content_list: Write ``<name>_content_list.json``.
        f_make_md_mode: Markdown make mode forwarded to ``dump_md``.
        f_draw_model_bbox: Write ``<name>_model.pdf``.
        f_draw_line_sort_bbox: Write ``<name>_line_sort.pdf``.
        start_page_id: First page to process.
        end_page_id: Last page to process; ``None`` means up to the last page.
        lang: Language hint forwarded to the model and pipeline; an empty
            string is treated as ``None``.
        layout_model: Forwarded to ``doc_analyze``.
        formula_enable: Forwarded to ``doc_analyze``.
        table_enable: Forwarded to ``doc_analyze``.

    Raises:
        SystemExit: With code 2 when no ``model_list`` is given and the
            built-in model is disabled; with code 1 when the built-in model
            is used and ``parse_method`` is not one of the supported values.
    """
    if debug_able:
        logger.warning('debug mode is on')
        f_draw_model_bbox = True
        f_draw_line_sort_bbox = True

    if lang == '':
        lang = None

    # From here on ``pdf_bytes`` is the page-sliced document; this is also
    # what gets written as ``<name>_origin.pdf`` below.
    pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
        pdf_bytes, start_page_id, end_page_id
    )

    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
    image_writer = FileBasedDataWriter(local_image_dir)
    md_writer = FileBasedDataWriter(local_md_dir)
    # Only the directory name is handed to the dump calls, not the full path.
    image_dir = str(os.path.basename(local_image_dir))

    ds = PymuDocDataset(pdf_bytes)

    if not model_list:
        if not model_config.__use_inside_model__:
            logger.error('need model list input')
            sys.exit(2)

        # Decide between the text and OCR paths before running the model so
        # the model/pipeline calls are written only once.
        if parse_method == 'auto':
            use_ocr = ds.classify() != SupportedPdfParseMethod.TXT
        elif parse_method == 'txt':
            use_ocr = False
        elif parse_method == 'ocr':
            use_ocr = True
        else:
            logger.error('unknown parse method')
            sys.exit(1)

        infer_result = ds.apply(
            doc_analyze,
            ocr=use_ocr,
            lang=lang,
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
        )
        run_pipe = infer_result.pipe_ocr_mode if use_ocr else infer_result.pipe_txt_mode
    else:
        infer_result = InferenceResult(model_list, ds)
        if parse_method == 'ocr':
            run_pipe = infer_result.pipe_ocr_mode
        elif parse_method == 'txt':
            run_pipe = infer_result.pipe_txt_mode
        else:
            # Any other value falls back to the auto pipeline here.
            run_pipe = infer_result.pipe_auto_mode

    # NOTE(review): debug_mode is always True, independent of debug_able.
    pipe_result = run_pipe(image_writer, debug_mode=True, lang=lang)

    if f_draw_model_bbox:
        infer_result.draw_model(
            os.path.join(local_md_dir, f'{pdf_file_name}_model.pdf')
        )

    if f_draw_layout_bbox:
        pipe_result.draw_layout(
            os.path.join(local_md_dir, f'{pdf_file_name}_layout.pdf')
        )

    if f_draw_span_bbox:
        pipe_result.draw_span(os.path.join(local_md_dir, f'{pdf_file_name}_spans.pdf'))

    if f_draw_line_sort_bbox:
        pipe_result.draw_line_sort(
            os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
        )

    if f_dump_md:
        pipe_result.dump_md(
            md_writer,
            f'{pdf_file_name}.md',
            image_dir,
            drop_mode=DropMode.NONE,
            md_make_mode=f_make_md_mode,
        )

    if f_dump_middle_json:
        pipe_result.dump_middle_json(md_writer, f'{pdf_file_name}_middle.json')

    if f_dump_model_json:
        infer_result.dump_model(md_writer, f'{pdf_file_name}_model.json')

    if f_dump_orig_pdf:
        md_writer.write(
            f'{pdf_file_name}_origin.pdf',
            pdf_bytes,
        )

    if f_dump_content_list:
        pipe_result.dump_content_list(
            md_writer,
            f'{pdf_file_name}_content_list.json',
            image_dir,
        )

    logger.info(f'local output dir is {local_md_dir}')


# click parameter type that restricts a parse-method value to 'ocr', 'txt' or 'auto'.
parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])