common.py 7.19 KB
Newer Older
icecraft's avatar
icecraft committed
1
import copy
2
3
4
import json as json_parse
import os

icecraft's avatar
icecraft committed
5
6
import click
from loguru import logger
7
8

import magic_pdf.model as model_config
9
10
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
                                      draw_model_bbox, draw_span_bbox)
icecraft's avatar
icecraft committed
11
12
13
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
14
from magic_pdf.pipe.UNIPipe import UNIPipe
icecraft's avatar
icecraft committed
15
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
16
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
17
18
19
import fitz
# from io import BytesIO
# from pypdf import PdfReader, PdfWriter
icecraft's avatar
icecraft committed
20
21
22
23
24


def prepare_env(output_dir, pdf_file_name, method):
    """Create the output directories for one parse run and return them.

    The markdown directory is ``<output_dir>/<pdf_file_name>/<method>`` and
    the image directory is its ``images`` sub-directory. Existing
    directories are reused.

    Args:
        output_dir: Root directory that holds all outputs.
        pdf_file_name: Name used for the per-document sub-directory.
        method: Name used for the per-method sub-directory.

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple of paths.
    """
    local_md_dir = os.path.join(output_dir, pdf_file_name, method)
    local_image_dir = os.path.join(local_md_dir, 'images')
    # The image directory lives inside the markdown directory, so creating it
    # (with parents) creates both.
    os.makedirs(local_image_dir, exist_ok=True)
    return local_image_dir, local_md_dir


32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None):
#     # Wrap the byte data in a BytesIO object
#     pdf_file = BytesIO(pdf_bytes)
#     # Read the PDF from the byte data
#     reader = PdfReader(pdf_file)
#     # Create a new PDF writer
#     writer = PdfWriter()
#     # Add the pages in [start_page_id, end_page_id] to the new PDF writer
#     end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1
#     if end_page_id > len(reader.pages) - 1:
#         logger.warning("end_page_id is out of range, use pdf_docs length")
#         end_page_id = len(reader.pages) - 1
#     for i, page in enumerate(reader.pages):
#         if start_page_id <= i <= end_page_id:
#             writer.add_page(page)
#     # Create a byte buffer to hold the output PDF data
#     output_buffer = BytesIO()
#     # Write the PDF into the byte buffer
#     writer.write(output_buffer)
#     # Get the contents of the byte buffer
#     converted_pdf_bytes = output_buffer.getvalue()
#     return converted_pdf_bytes


def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
    """Return a new PDF, as bytes, holding only the requested page range.

    Args:
        pdf_bytes: Content of the source PDF.
        start_page_id: Index of the first page to copy (passed to
            ``insert_pdf`` as ``from_page``).
        end_page_id: Index of the last page to copy (passed to ``insert_pdf``
            as ``to_page``). ``None`` or a negative value selects the last
            page of the source; a value past the last page is clamped to it
            and a warning is logged.

    Returns:
        The bytes produced by ``tobytes()`` on the newly built document.
    """
    # Both documents are opened as context managers so they are closed even
    # if the copy or the serialization raises.
    with fitz.open("pdf", pdf_bytes) as document, fitz.open() as output_document:
        last_page_id = len(document) - 1
        if end_page_id is None or end_page_id < 0:
            end_page_id = last_page_id
        if end_page_id > last_page_id:
            logger.warning("end_page_id is out of range, use pdf_docs length")
            end_page_id = last_page_id
        output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id)
        return output_document.tobytes()


icecraft's avatar
icecraft committed
68
69
70
71
72
73
def do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes,
    model_list,
    parse_method,
    debug_able,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Run one parsing pipeline over a PDF and write the selected outputs.

    The PDF is first reduced to the requested page range, then processed by
    the pipe matching ``parse_method``. Outputs are written below
    ``<output_dir>/<pdf_file_name>/<parse_method>``.

    Args:
        output_dir: Root directory for all outputs.
        pdf_file_name: Used for the output sub-directory and as the prefix of
            every dumped file name.
        pdf_bytes: Content of the source PDF.
        model_list: Model results to feed the pipe. When empty, the pipe's own
            analysis is run if ``model_config.__use_inside_model__`` is set.
        parse_method: One of ``'auto'``, ``'txt'`` or ``'ocr'``.
        debug_able: When true, forces ``f_draw_model_bbox`` and
            ``f_draw_line_sort_bbox`` on.
        f_draw_span_bbox: Call ``draw_span_bbox`` on the parse result.
        f_draw_layout_bbox: Call ``draw_layout_bbox`` on the parse result.
        f_dump_md: Write ``<pdf_file_name>.md``.
        f_dump_middle_json: Write ``<pdf_file_name>_middle.json``.
        f_dump_model_json: Write ``<pdf_file_name>_model.json``.
        f_dump_orig_pdf: Write the page-trimmed PDF as
            ``<pdf_file_name>_origin.pdf``.
        f_dump_content_list: Write ``<pdf_file_name>_content_list.json``.
        f_make_md_mode: Passed to ``pipe_mk_markdown`` as ``md_make_mode``.
        f_draw_model_bbox: Call ``draw_model_bbox`` on the model results.
        f_draw_line_sort_bbox: Call ``draw_line_sort_bbox`` on the parse
            result.
        start_page_id: First page index of the range to process.
        end_page_id: Last page index of the range to process; ``None`` means
            up to the last page.
        lang: Forwarded unchanged to the pipe constructor.
        layout_model: Forwarded unchanged to the pipe constructor.
        formula_enable: Forwarded unchanged to the pipe constructor.
        table_enable: Forwarded unchanged to the pipe constructor.

    Raises:
        SystemExit: With code 1 when ``parse_method`` is unknown, or code 2
            when ``model_list`` is empty and the inside model is disabled.
    """
    if debug_able:
        logger.warning('debug mode is on')
        f_draw_model_bbox = True
        f_draw_line_sort_bbox = True

    pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id, end_page_id)

    # Keep an untouched copy for drawing/dumping; the list handed to the pipe
    # is a separate object.
    orig_model_list = copy.deepcopy(model_list)
    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)

    image_writer = DiskReaderWriter(local_image_dir)
    md_writer = DiskReaderWriter(local_md_dir)
    image_dir = str(os.path.basename(local_image_dir))

    # start_page_id / end_page_id are deliberately not forwarded to the pipe:
    # pdf_bytes has already been trimmed to that range above.
    pipe_options = {
        'is_debug': True,
        'lang': lang,
        'layout_model': layout_model,
        'formula_enable': formula_enable,
        'table_enable': table_enable,
    }
    if parse_method == 'auto':
        jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, **pipe_options)
    elif parse_method == 'txt':
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, **pipe_options)
    elif parse_method == 'ocr':
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, **pipe_options)
    else:
        logger.error('unknown parse method')
        # SystemExit is raised directly: the bare exit() helper is installed
        # by the site module and is not guaranteed to exist.
        raise SystemExit(1)

    pipe.pipe_classify()

    if len(model_list) == 0:
        if not model_config.__use_inside_model__:
            logger.error('need model list input')
            raise SystemExit(2)
        pipe.pipe_analyze()
        orig_model_list = copy.deepcopy(pipe.model_list)

    pipe.pipe_parse()
    pdf_info = pipe.pdf_mid_data['pdf_info']

    if f_draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
    if f_draw_span_bbox:
        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
    if f_draw_model_bbox:
        # A copy is passed so orig_model_list is dumped below exactly as
        # captured, whatever the drawing helper does with its argument.
        draw_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
    if f_draw_line_sort_bbox:
        draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)

    md_content = pipe.pipe_mk_markdown(image_dir,
                                       drop_mode=DropMode.NONE,
                                       md_make_mode=f_make_md_mode)
    if f_dump_md:
        md_writer.write(
            content=md_content,
            path=f'{pdf_file_name}.md',
            mode=AbsReaderWriter.MODE_TXT,
        )

    if f_dump_middle_json:
        _dump_json(md_writer, pipe.pdf_mid_data, f'{pdf_file_name}_middle.json')

    if f_dump_model_json:
        _dump_json(md_writer, orig_model_list, f'{pdf_file_name}_model.json')

    if f_dump_orig_pdf:
        md_writer.write(
            content=pdf_bytes,
            path=f'{pdf_file_name}_origin.pdf',
            mode=AbsReaderWriter.MODE_BIN,
        )

    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
    if f_dump_content_list:
        _dump_json(md_writer, content_list, f'{pdf_file_name}_content_list.json')

    logger.info(f'local output dir is {local_md_dir}')


def _dump_json(writer, obj, path):
    """Serialize ``obj`` as indented, non-ASCII-escaped JSON and write it as text."""
    writer.write(
        content=json_parse.dumps(obj, ensure_ascii=False, indent=4),
        path=path,
        mode=AbsReaderWriter.MODE_TXT,
    )
194

icecraft's avatar
icecraft committed
195

196
# click parameter type restricting the parse method to the values that
# do_parse dispatches on.
parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])