magicpdf.py 11.8 KB
Newer Older
许瑞's avatar
许瑞 committed
1
2
import os
import json as json_parse
许瑞's avatar
许瑞 committed
3
import click
4
from loguru import logger
kernel.h@qq.com's avatar
kernel.h@qq.com committed
5
from pathlib import Path
赵小蒙's avatar
赵小蒙 committed
6
from magic_pdf.libs.version import __version__
7

8
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
赵小蒙's avatar
赵小蒙 committed
9
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
许瑞's avatar
许瑞 committed
10
from magic_pdf.pipe.UNIPipe import UNIPipe
许瑞's avatar
许瑞 committed
11
12
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
许瑞's avatar
许瑞 committed
13
14
15
16
17
from magic_pdf.libs.path_utils import (
    parse_s3path,
    parse_s3_range_params,
    remove_non_official_s3_args,
)
18
19
20
21
from magic_pdf.libs.config_reader import (
    get_local_dir,
    get_s3_config,
)
许瑞's avatar
许瑞 committed
22
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
kernel.h@qq.com's avatar
kernel.h@qq.com committed
23
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
许瑞's avatar
许瑞 committed
24
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
25
import csv
26
import copy
27
import magic_pdf.model as model_config
kernel.h@qq.com's avatar
kernel.h@qq.com committed
28

29
30
31
# Parse strategies accepted by the --method option of the commands in this file.
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])


赵小蒙's avatar
赵小蒙 committed
32
def prepare_env(pdf_file_name, method):
    """Create and return the local output directories for one document.

    The output root is ``<get_local_dir()>/magic-pdf/<pdf_file_name>/<method>``
    and images are stored in an ``images`` sub-directory of that root. Both
    directories are created when they do not exist yet.

    Args:
        pdf_file_name: Name used for the per-document directory level.
        method: Parse method name, used as the innermost directory level.

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple of directory paths.
    """
    output_root = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
    images_root = os.path.join(str(output_root), "images")

    # Image directory first, then the root, mirroring the original call order.
    for directory in (images_root, output_root):
        os.makedirs(directory, exist_ok=True)

    return images_root, output_root
kernel.h@qq.com's avatar
kernel.h@qq.com committed
40
41


42
def write_to_csv(csv_file_path, csv_data):
    """Append ``csv_data`` as a single row to the CSV file at ``csv_file_path``.

    The file is opened in append mode with UTF-8 encoding and ``newline=""``
    so the ``csv`` module controls line endings itself. An info message is
    logged once the row has been written.

    Args:
        csv_file_path: Path of the CSV file to append to.
        csv_data: Iterable of cell values forming one row.
    """
    with open(csv_file_path, mode="a", newline="", encoding="utf-8") as out_file:
        # Build the writer and emit the row in one step.
        csv.writer(out_file).writerow(csv_data)
    logger.info(f"数据已成功追加到 '{csv_file_path}'")
49
50


blue's avatar
blue committed
51
def do_parse(
        pdf_file_name,
        pdf_bytes,
        model_list,
        parse_method,
        f_draw_span_bbox=True,
        f_draw_layout_bbox=True,
        f_dump_md=True,
        f_dump_middle_json=True,
        f_dump_model_json=True,
        f_dump_orig_pdf=True,
        f_dump_content_list=True,
        f_make_md_mode=MakeMode.MM_MD,
):
    """Parse one PDF and write the requested artifacts to the local output dir.

    The output directory comes from ``prepare_env(pdf_file_name, parse_method)``.
    Depending on the ``f_*`` flags the following files are written there:
    ``<name>.md``, ``<name>_middle.json``, ``<name>_model.json``,
    ``<name>_origin.pdf`` and ``<name>_content_list.json``; the bbox drawing
    helpers are called with the same directory.

    Args:
        pdf_file_name: Base name used for the output directory and file names.
        pdf_bytes: Raw bytes of the PDF.
        model_list: Model output for the document. When it is empty the
            built-in model is used, provided
            ``model_config.__use_inside_model__`` is true.
        parse_method: One of ``"auto"``, ``"txt"`` or ``"ocr"``.
        f_draw_span_bbox: Call ``draw_span_bbox`` on the parse result.
        f_draw_layout_bbox: Call ``draw_layout_bbox`` on the parse result.
        f_dump_md: Write the generated markdown.
        f_dump_middle_json: Write ``pipe.pdf_mid_data`` as JSON.
        f_dump_model_json: Write the model output as JSON.
        f_dump_orig_pdf: Write a copy of the input PDF.
        f_dump_content_list: Write the unified-format content list as JSON.
        f_make_md_mode: Forwarded to ``pipe_mk_markdown`` as ``md_make_mode``.

    Raises:
        SystemExit: With status 1 if ``parse_method`` is unknown, or if
            ``model_list`` is empty while the built-in model is disabled.
    """
    # Snapshot of the model output taken before the pipeline receives it; this
    # copy is what ends up in *_model.json (presumably because the pipeline may
    # modify its input in place - TODO confirm).
    orig_model_list = copy.deepcopy(model_list)

    local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
    image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
    # Only the directory's base name is handed to the markdown/content builders.
    image_dir = str(os.path.basename(local_image_dir))

    if parse_method == "auto":
        jso_useful_key = {"_pdf_type": "", "model_list": model_list}
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
    elif parse_method == "txt":
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    elif parse_method == "ocr":
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    else:
        logger.error("unknown parse method")
        # Raise SystemExit directly: the bare exit() helper is installed by the
        # site module for interactive use and is not guaranteed to exist.
        raise SystemExit(1)

    pipe.pipe_classify()

    # Without usable model data, fall back to the built-in model.
    if len(model_list) == 0:
        if model_config.__use_inside_model__:
            pipe.pipe_analyze()
            orig_model_list = copy.deepcopy(pipe.model_list)
        else:
            logger.error("need model list input")
            raise SystemExit(1)

    pipe.pipe_parse()
    pdf_info = pipe.pdf_mid_data["pdf_info"]
    if f_draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
    if f_draw_span_bbox:
        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)

    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
    if f_dump_md:
        # Write the markdown.
        md_writer.write(
            content=md_content,
            path=f"{pdf_file_name}.md",
            mode=AbsReaderWriter.MODE_TXT,
        )

    if f_dump_middle_json:
        # Write the intermediate parse data.
        md_writer.write(
            content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
            path=f"{pdf_file_name}_middle.json",
            mode=AbsReaderWriter.MODE_TXT,
        )

    if f_dump_model_json:
        # Write the model output.
        md_writer.write(
            content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
            path=f"{pdf_file_name}_model.json",
            mode=AbsReaderWriter.MODE_TXT,
        )

    if f_dump_orig_pdf:
        # Write a copy of the source PDF.
        md_writer.write(
            content=pdf_bytes,
            path=f"{pdf_file_name}_origin.pdf",
            mode=AbsReaderWriter.MODE_BIN,
        )

    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
    if f_dump_content_list:
        # Write the unified-format content list.
        md_writer.write(
            content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
            path=f"{pdf_file_name}_content_list.json",
            mode=AbsReaderWriter.MODE_TXT,
        )
    logger.info(f"local output dir is '{local_md_dir}', you can found the result in it.")
许瑞's avatar
许瑞 committed
143
144


kernel.h@qq.com's avatar
kernel.h@qq.com committed
145
@click.group()
@click.version_option(
    __version__,
    "--version",
    "-v",
    help="显示版本信息",
)
@click.help_option(
    "--help",
    "-h",
    help="显示帮助信息",
)
def cli():
    # Root command group; sub-commands attach themselves via @cli.command().
    # Documented with a comment on purpose: click would display a docstring
    # as the group's help text.
    pass

许瑞's avatar
许瑞 committed
151

kernel.h@qq.com's avatar
kernel.h@qq.com committed
152
@cli.command()
@click.option("--json", type=str, help="输入一个S3路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
@click.option("--model_mode", type=click.STRING, default="full",
              help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
def json_command(json, method, inside_model, model_mode):
    # Parse the PDF described by a JSON record stored on S3.
    #
    # The record is read from the --json S3 path, the PDF it points to (key
    # "file_location", falling back to "path") is downloaded, and both are
    # handed to do_parse together with the record's "doc_layout_result".
    #
    # NOTE: documented with comments rather than a docstring on purpose;
    # click would display a docstring as the command's help text.
    model_config.__use_inside_model__ = inside_model
    model_config.__model_mode__ = model_mode

    # `json` is None when --json is omitted; report that as a usage error
    # instead of failing with AttributeError on None.startswith.
    if not json or not json.startswith("s3://"):
        logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path")
        raise SystemExit(1)

    def read_s3_path(s3path):
        # Read the object at `s3path`, honouring optional range parameters
        # carried in the path itself.
        bucket, _ = parse_s3path(s3path)

        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        # Path with the non-official arguments removed; needed both by the
        # reader/writer constructor and by the read call.
        clean_path = remove_non_official_s3_args(s3path)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", clean_path)
        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or 2 != len(may_range_params):
            byte_start, byte_end = 0, None
        else:
            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
            # The second value is treated as a length: end = start + length - 1.
            byte_end += byte_start - 1
        return s3_rw.read_jsonl(
            clean_path,
            byte_start,
            byte_end,
            AbsReaderWriter.MODE_BIN,
        )

    jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
    s3_file_path = jso.get("file_location")
    if s3_file_path is None:
        s3_file_path = jso.get("path")
    if s3_file_path is None:
        # Fail with a readable message instead of a TypeError from Path(None).
        logger.error("json record has neither 'file_location' nor 'path'")
        raise SystemExit(1)

    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)

    do_parse(
        pdf_file_name,
        pdf_data,
        jso["doc_layout_result"],
        method,
    )
许瑞's avatar
许瑞 committed
204

kernel.h@qq.com's avatar
kernel.h@qq.com committed
205

赵小蒙's avatar
赵小蒙 committed
206
207
208
209
210
211
212
213
@cli.command()
@click.option("--local_json", type=str, help="输入一个本地jsonl路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
@click.option("--model_mode", type=click.STRING, default="full",
              help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
def local_json_command(local_json, method, inside_model, model_mode):
    # Parse every PDF referenced by a local jsonl file.
    #
    # Each non-blank line is a JSON record; the PDF it points to (key
    # "file_location", falling back to "path") is read from S3 and handed to
    # do_parse together with the record's "doc_layout_result".
    #
    # NOTE: documented with comments rather than a docstring on purpose;
    # click would display a docstring as the command's help text.
    model_config.__use_inside_model__ = inside_model
    model_config.__model_mode__ = model_mode

    # `local_json` is None when the option is omitted; report a usage error
    # instead of failing with TypeError inside open().
    if not local_json:
        logger.error("usage: magic-pdf local-json-command --local_json /some_dir/some_file.jsonl")
        raise SystemExit(1)

    def read_s3_path(s3path):
        # Read the object at `s3path`, honouring optional range parameters
        # carried in the path itself.
        bucket, _ = parse_s3path(s3path)

        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        # Path with the non-official arguments removed; needed both by the
        # reader/writer constructor and by the read call.
        clean_path = remove_non_official_s3_args(s3path)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", clean_path)
        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or 2 != len(may_range_params):
            byte_start, byte_end = 0, None
        else:
            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
            # The second value is treated as a length: end = start + length - 1.
            byte_end += byte_start - 1
        return s3_rw.read_jsonl(
            clean_path,
            byte_start,
            byte_end,
            AbsReaderWriter.MODE_BIN,
        )

    with open(local_json, "r", encoding="utf-8") as f:
        for json_line in f:
            # Blank lines (e.g. a trailing newline) are not JSON records and
            # would make json.loads raise.
            if not json_line.strip():
                continue
            jso = json_parse.loads(json_line)

            s3_file_path = jso.get("file_location")
            if s3_file_path is None:
                s3_file_path = jso.get("path")
            if s3_file_path is None:
                # Fail with a readable message instead of a TypeError from
                # Path(None).
                logger.error("json record has neither 'file_location' nor 'path'")
                raise SystemExit(1)

            pdf_file_name = Path(s3_file_path).stem
            pdf_data = read_s3_path(s3_file_path)
            do_parse(
                pdf_file_name,
                pdf_data,
                jso["doc_layout_result"],
                method,
            )
赵小蒙's avatar
赵小蒙 committed
256
257


kernel.h@qq.com's avatar
kernel.h@qq.com committed
258
@cli.command()
@click.option(
    "--pdf", type=click.Path(exists=True), required=True,
    help='pdf 文件路径, 支持单个文件或文件列表, 文件列表需要以".list"结尾, 一行一个pdf文件路径')
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
@click.option("--model_mode", type=click.STRING, default="full",
              help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
def pdf_command(pdf, model, method, inside_model, model_mode):
    # Parse a local PDF, or every PDF listed in a ".list" file (one path per
    # line), and hand each one to do_parse.
    #
    # NOTE: documented with comments rather than a docstring on purpose;
    # click would display a docstring as the command's help text.
    model_config.__use_inside_model__ = inside_model
    model_config.__model_mode__ = model_mode

    def read_fn(path):
        # Read a local file as bytes through DiskReaderWriter.
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    def get_model_json(model_path, doc_path):
        # Return the model JSON text for `doc_path`.
        #
        # With no explicit model path, a ".json" file next to the PDF is
        # used; if that file does not exist, "[]" is returned so do_parse
        # falls back to the built-in model.
        if model_path is None:
            file_name_without_extension, extension = os.path.splitext(doc_path)
            if extension != ".pdf":
                raise Exception("pdf_path input error")
            model_path = file_name_without_extension + ".json"
            if not os.path.exists(model_path):
                logger.warning(
                    f"not found json {model_path} existed"
                )
                # No local model data: pass an empty list so the built-in
                # analysis runs downstream.
                return "[]"
        return read_fn(model_path).decode("utf-8")

    def parse_doc(doc_path):
        # Parse one document; failures are logged so that the remaining
        # entries of a list file are still processed.
        try:
            file_name = str(Path(doc_path).stem)
            pdf_data = read_fn(doc_path)
            jso = json_parse.loads(get_model_json(model, doc_path))

            do_parse(
                file_name,
                pdf_data,
                jso,
                method,
            )

        except Exception as e:
            logger.exception(e)

    if not pdf:
        # Defensive only: click already enforces --pdf (required=True).
        logger.error("Error: Missing argument '--pdf'.")
        raise SystemExit("Error: Missing argument '--pdf'.")

    if pdf.endswith(".list"):
        # List file: one document path per line. Read as UTF-8 explicitly so
        # non-ASCII paths do not depend on the platform's locale encoding.
        with open(pdf, "r", encoding="utf-8") as f:
            for line in f:
                doc_path = line.strip()
                # Skip blank lines instead of logging a failed parse for "".
                if doc_path:
                    parse_doc(doc_path)
    else:
        # Single document.
        parse_doc(pdf)
许瑞's avatar
许瑞 committed
330

kernel.h@qq.com's avatar
kernel.h@qq.com committed
331

许瑞's avatar
许瑞 committed
332
333
if __name__ == "__main__":
    # Example invocation:
    #   python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
    cli()