"megatron/core/inference/sampling_params.py" did not exist on "b01809dd73ffc9a42f92c82088fce5b581d9ca74"
import os

import click
import fitz
from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import Dataset, PymuDocDataset
from magic_pdf.libs.draw_bbox import draw_char_bbox
from magic_pdf.model.doc_analyze_by_custom_model import (batch_doc_analyze,
                                                         doc_analyze)

# from io import BytesIO
# from pypdf import PdfReader, PdfWriter


def prepare_env(output_dir, pdf_file_name, method):
    """Create the output directories for one document and return their paths.

    The layout is ``<output_dir>/<pdf_file_name>/<method>/`` for the
    markdown/JSON/PDF artifacts, with an ``images`` sub-directory inside it.

    Args:
        output_dir: Root directory that receives all outputs.
        pdf_file_name: Name of the document; used as a directory name.
        method: Parse method name; used as a directory name.

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple of directory paths.
        Both directories exist when the function returns.
    """
    local_md_dir = os.path.join(output_dir, pdf_file_name, method)
    local_image_dir = os.path.join(str(local_md_dir), 'images')
    # The image directory lives inside the markdown directory, so creating
    # it (with parents) creates both.
    os.makedirs(local_image_dir, exist_ok=True)
    return local_image_dir, local_md_dir


# def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None):
#     # Wrap the byte data in a BytesIO object
#     pdf_file = BytesIO(pdf_bytes)
#     # Read the PDF from the byte data
#     reader = PdfReader(pdf_file)
#     # Create a new PDF writer
#     writer = PdfWriter()
#     # Add the pages in the requested range to the new PDF writer
#     end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1
#     if end_page_id > len(reader.pages) - 1:
#         logger.warning("end_page_id is out of range, use pdf_docs length")
#         end_page_id = len(reader.pages) - 1
#     for i, page in enumerate(reader.pages):
#         if start_page_id <= i <= end_page_id:
#             writer.add_page(page)
#     # Create a byte buffer to hold the output PDF data
#     output_buffer = BytesIO()
#     # Write the PDF into the byte buffer
#     writer.write(output_buffer)
#     # Get the contents of the byte buffer
#     converted_pdf_bytes = output_buffer.getvalue()
#     return converted_pdf_bytes


def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
    """Return a new PDF, as bytes, holding only a page range of ``pdf_bytes``.

    Args:
        pdf_bytes: Content of the source PDF.
        start_page_id: Index of the first page to keep; passed to
            ``insert_pdf`` as ``from_page``.
        end_page_id: Index of the last page to keep; passed to ``insert_pdf``
            as ``to_page``. ``None`` or a negative value selects the last
            page of the document. A value past the last page is clamped to
            the last page and a warning is logged.

    Returns:
        The serialized PDF containing the selected pages.
    """
    # Both documents are closed on exit so their resources are released
    # even if the page copy fails.
    with fitz.open('pdf', pdf_bytes) as document, fitz.open() as output_document:
        last_page_id = len(document) - 1
        if end_page_id is None or end_page_id < 0:
            end_page_id = last_page_id
        elif end_page_id > last_page_id:
            logger.warning('end_page_id is out of range, use pdf_docs length')
            end_page_id = last_page_id
        output_document.insert_pdf(
            document, from_page=start_page_id, to_page=end_page_id
        )
        return output_document.tobytes()


icecraft's avatar
icecraft committed
70
def _do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes_or_dataset,
    model_list,
    parse_method,
    debug_able,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Parse one PDF and write the requested artifacts to disk.

    Outputs go to ``<output_dir>/<pdf_file_name>/<parse_method>/`` (images in
    an ``images`` sub-directory), with ``pdf_file_name`` as the file stem.

    Args:
        output_dir: Root directory for all outputs.
        pdf_file_name: Document name; used for the output directory and as
            the stem of every output file.
        pdf_bytes_or_dataset: Either the raw PDF ``bytes`` or an already
            built dataset object. ``bytes`` are first trimmed to the
            requested page range and wrapped in a ``PymuDocDataset``; any
            other value is used as the dataset as-is.
        model_list: Pre-computed model output. When it is empty, inference is
            run here through ``doc_analyze``, provided
            ``model_config.__use_inside_model__`` is set.
        parse_method: ``'ocr'``, ``'txt'`` or ``'auto'``. With ``'auto'`` the
            pipeline is chosen from ``ds.classify()``.
        debug_able: When true, forces the model and line-sort drawings on.
        f_draw_span_bbox: Write ``<name>_spans.pdf``.
        f_draw_layout_bbox: Write ``<name>_layout.pdf``.
        f_dump_md: Write ``<name>.md``.
        f_dump_middle_json: Write ``<name>_middle.json``.
        f_dump_model_json: Write ``<name>_model.json``.
        f_dump_orig_pdf: Write ``<name>_origin.pdf``.
        f_dump_content_list: Write ``<name>_content_list.json``.
        f_make_md_mode: Passed to ``dump_md`` as ``md_make_mode``.
        f_draw_model_bbox: Write ``<name>_model.pdf``.
        f_draw_line_sort_bbox: Write ``<name>_line_sort.pdf``.
        f_draw_char_bbox: Write ``<name>_char_bbox.pdf``.
        start_page_id: First page to keep; only applied to ``bytes`` input.
        end_page_id: Last page to keep; only applied to ``bytes`` input.
        lang: Language given to ``PymuDocDataset``; only applied to
            ``bytes`` input.
        layout_model: Forwarded to ``doc_analyze`` when inference runs here.
        formula_enable: Forwarded to ``doc_analyze`` when inference runs here.
        table_enable: Forwarded to ``doc_analyze`` when inference runs here.

    Raises:
        SystemExit: With code 2 when ``model_list`` is empty and the built-in
            model is disabled; with code 1 when ``model_list`` is empty and
            ``parse_method`` is not one of the known values.
    """
    from magic_pdf.operators.models import InferenceResult

    if debug_able:
        logger.warning('debug mode is on')
        f_draw_model_bbox = True
        f_draw_line_sort_bbox = True
        # Char bbox drawing is not switched on by debug mode; it stays under
        # the control of f_draw_char_bbox.

    if isinstance(pdf_bytes_or_dataset, bytes):
        trimmed_pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
            pdf_bytes_or_dataset, start_page_id, end_page_id
        )
        ds = PymuDocDataset(trimmed_pdf_bytes, lang=lang)
    else:
        ds = pdf_bytes_or_dataset
    pdf_bytes = ds._raw_data

    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
    image_writer = FileBasedDataWriter(local_image_dir)
    md_writer = FileBasedDataWriter(local_md_dir)
    image_dir = str(os.path.basename(local_image_dir))

    if len(model_list) == 0:
        # No pre-computed model output: inference has to run here.
        if not model_config.__use_inside_model__:
            logger.error('need model list input')
            # SystemExit is raised directly; the `exit` builtin is an
            # interactive helper that is not always installed.
            raise SystemExit(2)
        if parse_method not in ('auto', 'txt', 'ocr'):
            logger.error('unknown parse method')
            raise SystemExit(1)
        use_ocr = _should_use_ocr(ds, parse_method)
        infer_result = ds.apply(
            doc_analyze,
            ocr=use_ocr,
            lang=ds._lang,
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
        )
    else:
        # With a supplied model list, any value other than 'ocr'/'txt' is
        # treated like 'auto'.
        infer_result = InferenceResult(model_list, ds)
        use_ocr = _should_use_ocr(ds, parse_method)

    if use_ocr:
        pipe_result = infer_result.pipe_ocr_mode(
            image_writer, debug_mode=True, lang=ds._lang
        )
    else:
        pipe_result = infer_result.pipe_txt_mode(
            image_writer, debug_mode=True, lang=ds._lang
        )

    if f_draw_model_bbox:
        infer_result.draw_model(
            os.path.join(local_md_dir, f'{pdf_file_name}_model.pdf')
        )

    if f_draw_layout_bbox:
        pipe_result.draw_layout(
            os.path.join(local_md_dir, f'{pdf_file_name}_layout.pdf')
        )

    if f_draw_span_bbox:
        pipe_result.draw_span(
            os.path.join(local_md_dir, f'{pdf_file_name}_spans.pdf')
        )

    if f_draw_line_sort_bbox:
        pipe_result.draw_line_sort(
            os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
        )

    if f_draw_char_bbox:
        draw_char_bbox(pdf_bytes, local_md_dir, f'{pdf_file_name}_char_bbox.pdf')

    if f_dump_md:
        pipe_result.dump_md(
            md_writer,
            f'{pdf_file_name}.md',
            image_dir,
            drop_mode=DropMode.NONE,
            md_make_mode=f_make_md_mode,
        )

    if f_dump_middle_json:
        pipe_result.dump_middle_json(md_writer, f'{pdf_file_name}_middle.json')

    if f_dump_model_json:
        infer_result.dump_model(md_writer, f'{pdf_file_name}_model.json')

    if f_dump_orig_pdf:
        md_writer.write(f'{pdf_file_name}_origin.pdf', pdf_bytes)

    if f_dump_content_list:
        pipe_result.dump_content_list(
            md_writer,
            f'{pdf_file_name}_content_list.json',
            image_dir,
        )

    logger.info(f'local output dir is {local_md_dir}')


def _should_use_ocr(ds, parse_method):
    """Return True when the OCR pipeline should process ``ds``.

    ``'ocr'`` and ``'txt'`` decide directly; every other value defers to
    ``ds.classify()`` and picks OCR unless the dataset is classified as TXT.
    """
    if parse_method == 'ocr':
        return True
    if parse_method == 'txt':
        return False
    return ds.classify() != SupportedPdfParseMethod.TXT
246

icecraft's avatar
icecraft committed
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
def do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes_or_dataset,
    model_list,
    parse_method,
    debug_able,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Parse one PDF, using the batch path when parallel inference is on.

    The environment variable ``MINERU_PARALLEL_INFERENCE_COUNT`` selects the
    path: a value greater than 1 routes the document through
    ``batch_do_parse``; otherwise ``_do_parse`` is called directly.

    Args:
        output_dir: Root directory for all outputs.
        pdf_file_name: Document name used for output paths.
        pdf_bytes_or_dataset: Raw PDF ``bytes`` or an already built dataset.
        model_list: Pre-computed model output. Only used on the
            non-parallel path; the batch path runs its own inference.
        parse_method: ``'ocr'``, ``'txt'`` or ``'auto'``.
        debug_able: Debug switch forwarded to the parse function.
        f_draw_span_bbox, f_draw_layout_bbox, f_dump_md, f_dump_middle_json,
        f_dump_model_json, f_dump_orig_pdf, f_dump_content_list,
        f_make_md_mode, f_draw_model_bbox, f_draw_line_sort_bbox,
        f_draw_char_bbox: Output switches, forwarded unchanged.
        start_page_id: First page to keep; only applied to ``bytes`` input.
        end_page_id: Last page to keep; only applied to ``bytes`` input.
        lang: Language option, forwarded on both paths.
        layout_model: Forwarded on both paths.
        formula_enable: Forwarded on both paths.
        table_enable: Forwarded on both paths.

    Raises:
        ValueError: If ``MINERU_PARALLEL_INFERENCE_COUNT`` is set to a
            non-empty value that is not an integer.
    """
    output_flags = dict(
        f_draw_span_bbox=f_draw_span_bbox,
        f_draw_layout_bbox=f_draw_layout_bbox,
        f_dump_md=f_dump_md,
        f_dump_middle_json=f_dump_middle_json,
        f_dump_model_json=f_dump_model_json,
        f_dump_orig_pdf=f_dump_orig_pdf,
        f_dump_content_list=f_dump_content_list,
        f_make_md_mode=f_make_md_mode,
        f_draw_model_bbox=f_draw_model_bbox,
        f_draw_line_sort_bbox=f_draw_line_sort_bbox,
        f_draw_char_bbox=f_draw_char_bbox,
    )
    inference_options = dict(
        lang=lang,
        layout_model=layout_model,
        formula_enable=formula_enable,
        table_enable=table_enable,
    )

    # An unset or empty variable means "no parallel inference".
    parallel_count = int(os.environ.get('MINERU_PARALLEL_INFERENCE_COUNT') or 1)

    if parallel_count > 1:
        if isinstance(pdf_bytes_or_dataset, bytes):
            # The batch path has no page-range parameters, so the range is
            # applied here before the dataset is built.
            pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
                pdf_bytes_or_dataset, start_page_id, end_page_id
            )
            ds = PymuDocDataset(pdf_bytes, lang=lang)
        else:
            ds = pdf_bytes_or_dataset
        # The inference options must be forwarded explicitly; otherwise the
        # batch path would silently fall back to its defaults.
        batch_do_parse(
            output_dir,
            [pdf_file_name],
            [ds],
            parse_method,
            debug_able,
            **output_flags,
            **inference_options,
        )
    else:
        _do_parse(
            output_dir,
            pdf_file_name,
            pdf_bytes_or_dataset,
            model_list,
            parse_method,
            debug_able,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
            **inference_options,
            **output_flags,
        )


def batch_do_parse(
    output_dir,
    pdf_file_names: list[str],
    pdf_bytes_or_datasets: list[bytes | Dataset],
    parse_method,
    debug_able,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Run inference on several PDFs in one batch, then write each result.

    Inference for all documents is done by a single ``batch_doc_analyze``
    call; each result is then handed to ``_do_parse`` together with the
    matching name from ``pdf_file_names``.

    Args:
        output_dir: Root directory for all outputs.
        pdf_file_names: One document name per input, in the same order as
            ``pdf_bytes_or_datasets``.
        pdf_bytes_or_datasets: Raw PDF ``bytes`` or already built datasets.
            ``bytes`` entries are wrapped in a ``PymuDocDataset``.
        parse_method: ``'ocr'``, ``'txt'`` or ``'auto'``; forwarded to
            ``_do_parse``.
        debug_able: Debug switch forwarded to ``_do_parse``.
        f_draw_span_bbox, f_draw_layout_bbox, f_dump_md, f_dump_middle_json,
        f_dump_model_json, f_dump_orig_pdf, f_dump_content_list,
        f_make_md_mode, f_draw_model_bbox, f_draw_line_sort_bbox,
        f_draw_char_bbox: Output switches, forwarded unchanged.
        lang: Language option for dataset construction and inference.
        layout_model: Forwarded to inference.
        formula_enable: Forwarded to inference.
        table_enable: Forwarded to inference.
    """
    inference_options = dict(
        lang=lang,
        layout_model=layout_model,
        formula_enable=formula_enable,
        table_enable=table_enable,
    )

    dss = [
        PymuDocDataset(item, lang=lang) if isinstance(item, bytes) else item
        for item in pdf_bytes_or_datasets
    ]

    infer_results = batch_doc_analyze(dss, **inference_options)

    for idx, infer_result in enumerate(infer_results):
        _do_parse(
            output_dir,
            pdf_file_names[idx],
            dss[idx],
            infer_result.get_infer_res(),
            parse_method,
            debug_able,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            f_make_md_mode=f_make_md_mode,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
            # _do_parse re-runs inference itself when the model list it
            # receives is empty; passing the options keeps that fallback
            # consistent with the batch call above.
            **inference_options,
        )

icecraft's avatar
icecraft committed
321

322
# click parameter type restricting a parse-method option to the three
# values handled by the parse functions in this module.
parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])