common.py 12.3 KB
Newer Older
1
2
import os

icecraft's avatar
icecraft committed
3
import click
4
import fitz
icecraft's avatar
icecraft committed
5
from loguru import logger
6
7

import magic_pdf.model as model_config
8
from magic_pdf.config.enums import SupportedPdfParseMethod
9
from magic_pdf.config.make_content_config import DropMode, MakeMode
10
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
icecraft's avatar
icecraft committed
11
from magic_pdf.data.dataset import Dataset, PymuDocDataset
12
from magic_pdf.libs.draw_bbox import draw_char_bbox
icecraft's avatar
icecraft committed
13
14
from magic_pdf.model.doc_analyze_by_custom_model import (batch_doc_analyze,
                                                         doc_analyze)
15

16
17
# from io import BytesIO
# from pypdf import PdfReader, PdfWriter
icecraft's avatar
icecraft committed
18
19
20
21
22


def prepare_env(output_dir, pdf_file_name, method):
    """Create the output directories for one parsed PDF and return them.

    The layout is ``<output_dir>/<pdf_file_name>/<method>/`` for the
    markdown/JSON/PDF artefacts, with an ``images`` sub-directory beneath it.

    Args:
        output_dir: Root directory under which all results are written.
        pdf_file_name: Name of the PDF, used as the first sub-directory.
        method: Parse method name, used as the second sub-directory.

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple of directory paths. Both
        directories exist when the function returns.
    """
    local_md_dir = os.path.join(output_dir, pdf_file_name, method)
    local_image_dir = os.path.join(local_md_dir, 'images')
    # Creating the nested image directory also creates local_md_dir, its
    # parent, so a single makedirs call is enough.
    os.makedirs(local_image_dir, exist_ok=True)
    return local_image_dir, local_md_dir


30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None):
#     # Wrap the byte data in a BytesIO object
#     pdf_file = BytesIO(pdf_bytes)
#     # Read the PDF from the byte data
#     reader = PdfReader(pdf_file)
#     # Create a new PDF writer
#     writer = PdfWriter()
#     # Add the pages in the requested range to the new PDF writer
#     end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1
#     if end_page_id > len(reader.pages) - 1:
#         logger.warning("end_page_id is out of range, use pdf_docs length")
#         end_page_id = len(reader.pages) - 1
#     for i, page in enumerate(reader.pages):
#         if start_page_id <= i <= end_page_id:
#             writer.add_page(page)
#     # Create a byte buffer to hold the output PDF data
#     output_buffer = BytesIO()
#     # Write the PDF into the byte buffer
#     writer.write(output_buffer)
#     # Get the contents of the byte buffer
#     converted_pdf_bytes = output_buffer.getvalue()
#     return converted_pdf_bytes


def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
    """Return a new PDF, as bytes, holding only a page range of ``pdf_bytes``.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        start_page_id: Zero-based index of the first page to copy.
        end_page_id: Zero-based index of the last page to copy. ``None`` or a
            negative value selects the last page of the document; a value
            past the end is clamped to the last page and a warning is logged.

    Returns:
        The serialized bytes of the trimmed PDF.

    NOTE(review): ``start_page_id`` is passed to ``insert_pdf`` unchecked;
    callers are presumably expected to keep it <= ``end_page_id`` -- confirm.
    """
    document = fitz.open('pdf', pdf_bytes)
    try:
        last_page_id = len(document) - 1
        if end_page_id is None or end_page_id < 0:
            end_page_id = last_page_id
        elif end_page_id > last_page_id:
            logger.warning('end_page_id is out of range, use pdf_docs length')
            end_page_id = last_page_id

        output_document = fitz.open()
        try:
            output_document.insert_pdf(
                document, from_page=start_page_id, to_page=end_page_id
            )
            return output_document.tobytes()
        finally:
            # Release the in-memory output document once it is serialized.
            output_document.close()
    finally:
        # Always release the source document, even if copying failed.
        document.close()


icecraft's avatar
icecraft committed
70
def _should_use_ocr(ds, parse_method):
    """Tell whether the OCR pipeline should be used for ``ds``.

    ``'ocr'`` and ``'txt'`` are honoured as given. Any other value defers to
    ``ds.classify()``: the text pipeline is used only when it reports
    ``SupportedPdfParseMethod.TXT``.
    """
    if parse_method == 'ocr':
        return True
    if parse_method == 'txt':
        return False
    return ds.classify() != SupportedPdfParseMethod.TXT


def _do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes_or_dataset,
    model_list,
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Parse one PDF and write the requested artefacts under ``output_dir``.

    Args:
        output_dir: Root output directory; results go to the directory
            returned by ``prepare_env(output_dir, pdf_file_name, parse_method)``.
        pdf_file_name: Base name used for the output directory and files.
        pdf_bytes_or_dataset: Raw PDF bytes, or an already-built dataset.
            Bytes are trimmed to ``start_page_id``..``end_page_id`` and
            wrapped in a ``PymuDocDataset``; a dataset is used as-is.
        model_list: Precomputed model inference output. When empty (or
            ``None``) inference is run here through ``doc_analyze``.
        parse_method: ``'ocr'``, ``'txt'`` or ``'auto'``.
        debug_able: When true, forces ``f_draw_model_bbox`` and
            ``f_draw_line_sort_bbox`` on.
        f_draw_span_bbox: Write ``<name>_spans.pdf``.
        f_draw_layout_bbox: Write ``<name>_layout.pdf``.
        f_dump_md: Write ``<name>.md``.
        f_dump_middle_json: Write ``<name>_middle.json``.
        f_dump_model_json: Write ``<name>_model.json``.
        f_dump_orig_pdf: Write ``<name>_origin.pdf``.
        f_dump_content_list: Write ``<name>_content_list.json``.
        f_make_md_mode: ``md_make_mode`` passed to ``dump_md``.
        f_draw_model_bbox: Write ``<name>_model.pdf``.
        f_draw_line_sort_bbox: Write ``<name>_line_sort.pdf``.
        f_draw_char_bbox: Write ``<name>_char_bbox.pdf``.
        start_page_id: First page to keep; only used for bytes input.
        end_page_id: Last page to keep; only used for bytes input.
        lang: Language passed to ``PymuDocDataset``; only used for bytes input.
        layout_model: Forwarded to ``doc_analyze``; only used when inference
            is run here.
        formula_enable: Forwarded to ``doc_analyze``; only used when
            inference is run here.
        table_enable: Forwarded to ``doc_analyze``; only used when inference
            is run here.

    Raises:
        SystemExit: With code 2 when ``model_list`` is empty and the built-in
            model is disabled; with code 1 when ``model_list`` is empty and
            ``parse_method`` is not one of the three known values.
    """
    from magic_pdf.operators.models import InferenceResult

    if debug_able:
        logger.warning('debug mode is on')
        f_draw_model_bbox = True
        f_draw_line_sort_bbox = True
        # f_draw_char_bbox is left as passed; debug mode does not enable it.

    if isinstance(pdf_bytes_or_dataset, bytes):
        trimmed_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
            pdf_bytes_or_dataset, start_page_id, end_page_id
        )
        ds = PymuDocDataset(trimmed_bytes, lang=lang)
    else:
        ds = pdf_bytes_or_dataset
    pdf_bytes = ds._raw_data

    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
    image_writer = FileBasedDataWriter(local_image_dir)
    md_writer = FileBasedDataWriter(local_md_dir)
    # Image references in the outputs use the directory name, not a full path.
    image_dir = os.path.basename(local_image_dir)

    if not model_list:
        # No precomputed model output: run inference with the built-in model.
        if not model_config.__use_inside_model__:
            logger.error('need model list input')
            raise SystemExit(2)
        if parse_method not in ('auto', 'txt', 'ocr'):
            logger.error('unknown parse method')
            raise SystemExit(1)
        use_ocr = _should_use_ocr(ds, parse_method)
        infer_result = ds.apply(
            doc_analyze,
            ocr=use_ocr,
            lang=ds._lang,
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
        )
    else:
        # Precomputed model output: any parse_method other than 'ocr'/'txt'
        # falls back to classification, as for 'auto'.
        infer_result = InferenceResult(model_list, ds)
        use_ocr = _should_use_ocr(ds, parse_method)

    run_pipe = infer_result.pipe_ocr_mode if use_ocr else infer_result.pipe_txt_mode
    pipe_result = run_pipe(image_writer, debug_mode=True, lang=ds._lang)

    if f_draw_model_bbox:
        infer_result.draw_model(
            os.path.join(local_md_dir, f'{pdf_file_name}_model.pdf')
        )

    if f_draw_layout_bbox:
        pipe_result.draw_layout(
            os.path.join(local_md_dir, f'{pdf_file_name}_layout.pdf')
        )

    if f_draw_span_bbox:
        pipe_result.draw_span(os.path.join(local_md_dir, f'{pdf_file_name}_spans.pdf'))

    if f_draw_line_sort_bbox:
        pipe_result.draw_line_sort(
            os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
        )

    if f_draw_char_bbox:
        draw_char_bbox(pdf_bytes, local_md_dir, f'{pdf_file_name}_char_bbox.pdf')

    if f_dump_md:
        pipe_result.dump_md(
            md_writer,
            f'{pdf_file_name}.md',
            image_dir,
            drop_mode=DropMode.NONE,
            md_make_mode=f_make_md_mode,
        )

    if f_dump_middle_json:
        pipe_result.dump_middle_json(md_writer, f'{pdf_file_name}_middle.json')

    if f_dump_model_json:
        infer_result.dump_model(md_writer, f'{pdf_file_name}_model.json')

    if f_dump_orig_pdf:
        md_writer.write(
            f'{pdf_file_name}_origin.pdf',
            pdf_bytes,
        )

    if f_dump_content_list:
        pipe_result.dump_content_list(
            md_writer,
            f'{pdf_file_name}_content_list.json',
            image_dir,
        )

    logger.info(f'local output dir is {local_md_dir}')
244

icecraft's avatar
icecraft committed
245
246
247
248
249
250
def do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes_or_dataset,
    model_list,
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Parse one PDF, choosing the batch or the single-document code path.

    The path is selected by the ``MINERU_PARALLEL_INFERENCE_COUNT``
    environment variable: a value greater than 1 routes the document through
    ``batch_do_parse``; otherwise (unset, empty, or <= 1) ``_do_parse`` is
    called directly.

    All parameters have the same meaning as in ``_do_parse``. On the batch
    path ``model_list`` is not used, because ``batch_do_parse`` takes no such
    parameter and runs inference itself.

    Raises:
        ValueError: If ``MINERU_PARALLEL_INFERENCE_COUNT`` is set to a value
            that ``int()`` cannot parse.
    """
    parallel_count = int(os.environ.get('MINERU_PARALLEL_INFERENCE_COUNT') or 1)

    if parallel_count > 1:
        if isinstance(pdf_bytes_or_dataset, bytes):
            # Trim to the requested page range here: batch_do_parse has no
            # start/end page parameters.
            pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
                pdf_bytes_or_dataset, start_page_id, end_page_id
            )
            ds = PymuDocDataset(pdf_bytes, lang=lang)
        else:
            ds = pdf_bytes_or_dataset
        batch_do_parse(
            output_dir,
            [pdf_file_name],
            [ds],
            parse_method,
            debug_able,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            f_make_md_mode=f_make_md_mode,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
            lang=lang,
            # Forward the model options so they apply on the batch path too.
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
        )
    else:
        _do_parse(
            output_dir,
            pdf_file_name,
            pdf_bytes_or_dataset,
            model_list,
            parse_method,
            debug_able,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
            lang=lang,
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            f_make_md_mode=f_make_md_mode,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
        )


def batch_do_parse(
    output_dir,
    pdf_file_names: list[str],
    pdf_bytes_or_datasets: list[bytes | Dataset],
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Run inference on several PDFs at once, then write each one's outputs.

    Args:
        output_dir: Root output directory shared by all documents.
        pdf_file_names: Output base name for each document, matched by
            position with ``pdf_bytes_or_datasets``.
        pdf_bytes_or_datasets: Raw PDF bytes or ready-made datasets. Bytes
            are wrapped in a ``PymuDocDataset`` without any page trimming.
        parse_method: Passed to ``batch_doc_analyze`` and ``_do_parse``.
        lang: Language used for datasets built here and for inference.
        layout_model: Forwarded to ``batch_doc_analyze`` and ``_do_parse``.
        formula_enable: Forwarded to ``batch_doc_analyze`` and ``_do_parse``.
        table_enable: Forwarded to ``batch_doc_analyze`` and ``_do_parse``.

    ``debug_able`` and all ``f_*`` flags are forwarded unchanged to
    ``_do_parse`` for every document.
    """
    dss = [
        PymuDocDataset(item, lang=lang) if isinstance(item, bytes) else item
        for item in pdf_bytes_or_datasets
    ]

    infer_results = batch_doc_analyze(
        dss,
        parse_method,
        lang=lang,
        layout_model=layout_model,
        formula_enable=formula_enable,
        table_enable=table_enable,
    )
    for idx, infer_result in enumerate(infer_results):
        _do_parse(
            output_dir=output_dir,
            pdf_file_name=pdf_file_names[idx],
            pdf_bytes_or_dataset=dss[idx],
            model_list=infer_result.get_infer_res(),
            parse_method=parse_method,
            debug_able=debug_able,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            # Honour the caller's markdown mode instead of a fixed MM_MD.
            f_make_md_mode=f_make_md_mode,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
            lang=lang,
            # Only consulted by _do_parse if it has to run inference itself
            # (i.e. the batch result for this document is empty).
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
        )
icecraft's avatar
icecraft committed
338

icecraft's avatar
icecraft committed
339

340
# click parameter type that restricts a value to 'ocr', 'txt' or 'auto'.
parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])