common.py 7.23 KB
Newer Older
icecraft's avatar
icecraft committed
1
import copy
2
3
4
import json as json_parse
import os

icecraft's avatar
icecraft committed
5
6
import click
from loguru import logger
7
8

import magic_pdf.model as model_config
9
10
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
                                      draw_model_bbox, draw_span_bbox)
icecraft's avatar
icecraft committed
11
12
13
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
14
from magic_pdf.pipe.UNIPipe import UNIPipe
icecraft's avatar
icecraft committed
15
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
16
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
17
18
19
import fitz
# from io import BytesIO
# from pypdf import PdfReader, PdfWriter
icecraft's avatar
icecraft committed
20
21
22
23
24


def prepare_env(output_dir, pdf_file_name, method):
    """Create the output directories for one PDF and return their paths.

    The resulting layout is ``<output_dir>/<pdf_file_name>/<method>/`` with an
    ``images`` subdirectory inside it. Existing directories are reused.

    Args:
        output_dir: Root directory under which results are stored.
        pdf_file_name: Name of the PDF; used as a path component.
        method: Parse method name; used as the innermost path component.

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple, where ``local_md_dir`` is
        the per-method directory and ``local_image_dir`` is its ``images``
        subdirectory.
    """
    local_md_dir = os.path.join(output_dir, pdf_file_name, method)
    local_image_dir = os.path.join(str(local_md_dir), 'images')
    # makedirs creates every missing parent, so this single call also
    # guarantees that local_md_dir exists.
    os.makedirs(local_image_dir, exist_ok=True)
    return local_image_dir, local_md_dir


32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None):
#     # Wrap the byte data in a BytesIO object
#     pdf_file = BytesIO(pdf_bytes)
#     # Read the PDF from the byte data
#     reader = PdfReader(pdf_file)
#     # Create a new PDF writer
#     writer = PdfWriter()
#     # Add the pages within the requested range to the new PDF writer
#     end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1
#     if end_page_id > len(reader.pages) - 1:
#         logger.warning("end_page_id is out of range, use pdf_docs length")
#         end_page_id = len(reader.pages) - 1
#     for i, page in enumerate(reader.pages):
#         if start_page_id <= i <= end_page_id:
#             writer.add_page(page)
#     # Create a byte buffer to hold the output PDF data
#     output_buffer = BytesIO()
#     # Write the PDF into the byte buffer
#     writer.write(output_buffer)
#     # Get the contents of the byte buffer
#     converted_pdf_bytes = output_buffer.getvalue()
#     return converted_pdf_bytes


def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
    """Return a new PDF, as bytes, containing only the requested page range.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        start_page_id: Zero-based index of the first page to keep.
        end_page_id: Zero-based index of the last page to keep. ``None`` or a
            negative value selects the last page of the document; a value
            past the end is clamped to the last page and a warning is logged.

    Returns:
        The serialized bytes of the newly built PDF.
    """
    # Both documents are closed on exit, including when insert_pdf raises;
    # the original left them open until garbage collection.
    with fitz.open("pdf", pdf_bytes) as document, fitz.open() as output_document:
        last_page_id = len(document) - 1
        if end_page_id is None or end_page_id < 0:
            end_page_id = last_page_id
        elif end_page_id > last_page_id:
            logger.warning("end_page_id is out of range, use pdf_docs length")
            end_page_id = last_page_id
        output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id)
        # Serialize before the context manager closes the output document.
        return output_document.tobytes()


icecraft's avatar
icecraft committed
68
69
70
71
72
73
def do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes,
    model_list,
    parse_method,
    debug_able,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Parse one PDF and write the selected artifacts to disk.

    The PDF is first trimmed to ``start_page_id..end_page_id``, then run
    through the pipe chosen by ``parse_method`` (classify, optional analyze,
    parse). Outputs are written under the directory returned by
    ``prepare_env(output_dir, pdf_file_name, parse_method)``.

    Args:
        output_dir: Root directory for all outputs.
        pdf_file_name: Base name used for the output directory and files.
        pdf_bytes: Raw bytes of the input PDF.
        model_list: Precomputed model output. When empty, the pipe's own
            analysis is run if ``model_config.__use_inside_model__`` is set.
        parse_method: One of ``'auto'``, ``'txt'`` or ``'ocr'``.
        debug_able: When truthy, forces ``f_draw_model_bbox`` and
            ``f_draw_line_sort_bbox`` on.
        f_draw_span_bbox: Call ``draw_span_bbox`` on the parse result.
        f_draw_layout_bbox: Call ``draw_layout_bbox`` on the parse result.
        f_dump_md: Write ``<pdf_file_name>.md``.
        f_dump_middle_json: Write ``<pdf_file_name>_middle.json``.
        f_dump_model_json: Write ``<pdf_file_name>_model.json``.
        f_dump_orig_pdf: Write ``<pdf_file_name>_origin.pdf``. Note that this
            is the page-trimmed PDF, not the bytes originally passed in.
        f_dump_content_list: Write ``<pdf_file_name>_content_list.json``.
        f_make_md_mode: Markdown mode forwarded to ``pipe_mk_markdown``.
        f_draw_model_bbox: Call ``draw_model_bbox`` on the model output.
        f_draw_line_sort_bbox: Call ``draw_line_sort_bbox`` on the result.
        start_page_id: First page to keep (zero-based).
        end_page_id: Last page to keep; ``None`` means the last page.
        lang: Language forwarded to the pipe; ``""`` is treated as ``None``.
        layout_model: Forwarded to the pipe unchanged.
        formula_enable: Forwarded to the pipe unchanged.
        table_enable: Forwarded to the pipe unchanged.

    Raises:
        SystemExit: With code 1 for an unknown ``parse_method``; with code 2
            when ``model_list`` is empty and the inside model is disabled.
    """
    # Imported locally so this function does not rely on a new file-level
    # import. sys.exit replaces the site-provided exit(), which is intended
    # for interactive use and is not guaranteed to exist.
    import sys

    if debug_able:
        logger.warning('debug mode is on')
        f_draw_model_bbox = True
        f_draw_line_sort_bbox = True

    if lang == "":
        lang = None

    # Trim to the requested page range up front; every later step, including
    # the "_origin.pdf" dump, works on the trimmed bytes.
    pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id, end_page_id)

    # Keep an untouched copy of the model output for drawing and dumping.
    orig_model_list = copy.deepcopy(model_list)
    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
                                                parse_method)

    image_writer = DiskReaderWriter(local_image_dir)
    md_writer = DiskReaderWriter(local_md_dir)
    image_dir = str(os.path.basename(local_image_dir))

    # The page range is deliberately not passed to the pipes: pdf_bytes has
    # already been trimmed above.
    pipe_options = dict(
        is_debug=True,
        lang=lang,
        layout_model=layout_model,
        formula_enable=formula_enable,
        table_enable=table_enable,
    )
    if parse_method == 'auto':
        jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, **pipe_options)
    elif parse_method == 'txt':
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, **pipe_options)
    elif parse_method == 'ocr':
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, **pipe_options)
    else:
        logger.error('unknown parse method')
        sys.exit(1)

    pipe.pipe_classify()

    if len(model_list) == 0:
        if not model_config.__use_inside_model__:
            logger.error('need model list input')
            sys.exit(2)
        pipe.pipe_analyze()
        # The pipe produced the model output itself; snapshot that instead.
        orig_model_list = copy.deepcopy(pipe.model_list)

    pipe.pipe_parse()
    pdf_info = pipe.pdf_mid_data['pdf_info']

    if f_draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
    if f_draw_span_bbox:
        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
    if f_draw_model_bbox:
        draw_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
    if f_draw_line_sort_bbox:
        draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)

    def dump_json(obj, path):
        # Serialize obj as indented, non-ASCII-escaped JSON into the md dir.
        md_writer.write(
            content=json_parse.dumps(obj, ensure_ascii=False, indent=4),
            path=path,
            mode=AbsReaderWriter.MODE_TXT,
        )

    md_content = pipe.pipe_mk_markdown(image_dir,
                                       drop_mode=DropMode.NONE,
                                       md_make_mode=f_make_md_mode)
    if f_dump_md:
        md_writer.write(
            content=md_content,
            path=f'{pdf_file_name}.md',
            mode=AbsReaderWriter.MODE_TXT,
        )

    if f_dump_middle_json:
        dump_json(pipe.pdf_mid_data, f'{pdf_file_name}_middle.json')

    if f_dump_model_json:
        dump_json(orig_model_list, f'{pdf_file_name}_model.json')

    if f_dump_orig_pdf:
        md_writer.write(
            content=pdf_bytes,
            path=f'{pdf_file_name}_origin.pdf',
            mode=AbsReaderWriter.MODE_BIN,
        )

    # Built unconditionally, as in the original, even when it is not dumped.
    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
    if f_dump_content_list:
        dump_json(content_list, f'{pdf_file_name}_content_list.json')

    logger.info(f'local output dir is {local_md_dir}')
197

icecraft's avatar
icecraft committed
198

199
# click parameter type limiting a parse-method option to the three values
# that do_parse dispatches on.
parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])