magicpdf.py 13.2 KB
Newer Older
kernel.h@qq.com's avatar
kernel.h@qq.com committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
"""
这里实现2个click命令:
第一个:
 接收一个完整的s3路径,例如:s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
    1)根据~/magic-pdf.json里的ak,sk等,构造s3cliReader读取到这个jsonl的对应行,返回json对象。
    2)根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint,构造出s3cliReader用来读取pdf
    3)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter,用来保存截图
    4)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
    
    最后把以上步骤准备好的对象传入真正的解析API
    
第二个:
  接收1)pdf的本地路径。2)模型json文件(可选)。然后:
    1)根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置,构造出LocalImageWriter,用来保存截图
    2)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
    3)根据约定,根据pdf本地路径,推导出pdf模型的json,并读入
    

效果:
20
21
python magicpdf.py json-command --json  s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py pdf-command --pdf  /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json  或者 python magicpdf.py --pdf  /home/llm/Downloads/xxxx.pdf
kernel.h@qq.com's avatar
kernel.h@qq.com committed
22
23
"""

许瑞's avatar
许瑞 committed
24
25
import os
import json as json_parse
许瑞's avatar
许瑞 committed
26
import click
27
from loguru import logger
kernel.h@qq.com's avatar
kernel.h@qq.com committed
28
from pathlib import Path
赵小蒙's avatar
赵小蒙 committed
29
from magic_pdf.libs.version import __version__
30

31
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
赵小蒙's avatar
赵小蒙 committed
32
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
许瑞's avatar
许瑞 committed
33
from magic_pdf.pipe.UNIPipe import UNIPipe
许瑞's avatar
许瑞 committed
34
35
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
许瑞's avatar
许瑞 committed
36
37
38
39
40
from magic_pdf.libs.path_utils import (
    parse_s3path,
    parse_s3_range_params,
    remove_non_official_s3_args,
)
41
42
43
44
from magic_pdf.libs.config_reader import (
    get_local_dir,
    get_s3_config,
)
许瑞's avatar
许瑞 committed
45
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
kernel.h@qq.com's avatar
kernel.h@qq.com committed
46
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
许瑞's avatar
许瑞 committed
47
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
48
import csv
49
import copy
50
import magic_pdf.model as model_config
kernel.h@qq.com's avatar
kernel.h@qq.com committed
51

52
53
54
# Allowed values for the ``--method`` option of the sub-commands in this module.
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])


赵小蒙's avatar
赵小蒙 committed
55
def prepare_env(pdf_file_name, method):
    """Create the local output directories for one document and return them.

    Output goes to ``<get_local_dir()>/magic-pdf/<pdf_file_name>/<method>``;
    images go to an ``images`` sub-directory of that directory.

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple of paths.
    """
    output_root = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
    images_root = os.path.join(str(output_root), "images")
    # Same creation order as before: the image directory first, then its parent.
    for directory in (images_root, output_root):
        os.makedirs(directory, exist_ok=True)
    return images_root, output_root
kernel.h@qq.com's avatar
kernel.h@qq.com committed
63
64


65
def write_to_csv(csv_file_path, csv_data):
    """Append ``csv_data`` as a single row to the CSV file at ``csv_file_path``.

    The file is opened in append mode with UTF-8 encoding, then an info
    message is logged once the row has been written.
    """
    with open(csv_file_path, mode="a", newline="", encoding="utf-8") as out_file:
        csv.writer(out_file).writerow(csv_data)
    logger.info(f"数据已成功追加到 '{csv_file_path}'")
72
73


blue's avatar
blue committed
74
def do_parse(
        pdf_file_name,
        pdf_bytes,
        model_list,
        parse_method,
        f_draw_span_bbox=True,
        f_draw_layout_bbox=True,
        f_dump_md=True,
        f_dump_middle_json=True,
        f_dump_model_json=True,
        f_dump_orig_pdf=True,
        f_dump_content_list=True,
        f_make_md_mode=MakeMode.MM_MD,
):
    """Parse one PDF and write the requested artifacts to the local output dir.

    Args:
        pdf_file_name: Name used for the output directory (via ``prepare_env``)
            and as the prefix of every output file.
        pdf_bytes: Content of the PDF; handed to the pipe and the bbox drawers
            and written back out as ``<name>_origin.pdf``.
        model_list: Model result for the PDF. When it is empty, the built-in
            model is run if ``model_config.__use_inside_model__`` is truthy.
        parse_method: One of ``"auto"``, ``"txt"`` or ``"ocr"``.
        f_draw_span_bbox: Call ``draw_span_bbox`` on the parsed result.
        f_draw_layout_bbox: Call ``draw_layout_bbox`` on the parsed result.
        f_dump_md: Write ``<name>.md``.
        f_dump_middle_json: Write ``<name>_middle.json``.
        f_dump_model_json: Write ``<name>_model.json``.
        f_dump_orig_pdf: Write ``<name>_origin.pdf``.
        f_dump_content_list: Write ``<name>_content_list.json``.
        f_make_md_mode: Passed to ``pipe_mk_markdown`` as ``md_make_mode``.

    Raises:
        SystemExit: With status 1 when ``parse_method`` is unknown, or when
            ``model_list`` is empty and the built-in model is disabled.
    """
    # Snapshot taken before any pipe sees ``model_list``; this copy is what is
    # dumped as ``<name>_model.json`` (presumably to guard against the pipe
    # mutating its input -- confirm against the pipe implementations).
    orig_model_list = copy.deepcopy(model_list)

    local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
    image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
    # Only the directory name (not the full path) is handed to the pipe when
    # building markdown / content-list output.
    image_dir = str(os.path.basename(local_image_dir))

    if parse_method == "auto":
        jso_useful_key = {"_pdf_type": "", "model_list": model_list}
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
    elif parse_method == "txt":
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    elif parse_method == "ocr":
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    else:
        logger.error("unknown parse method")
        # ``exit()`` is injected by the ``site`` module and is not guaranteed
        # to exist; raising SystemExit has the same effect without it.
        raise SystemExit(1)

    pipe.pipe_classify()

    # Without valid model data, fall back to the built-in model.
    if len(model_list) == 0:
        if model_config.__use_inside_model__:
            pipe.pipe_analyze()
            orig_model_list = copy.deepcopy(pipe.model_list)
        else:
            logger.error("need model list input")
            raise SystemExit(1)

    pipe.pipe_parse()

    pdf_info = pipe.pdf_mid_data["pdf_info"]
    if f_draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
    if f_draw_span_bbox:
        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)

    def _dump_json(obj, suffix):
        # Serialize ``obj`` as indented, non-ASCII-escaped JSON next to the markdown.
        md_writer.write(
            content=json_parse.dumps(obj, ensure_ascii=False, indent=4),
            path=f"{pdf_file_name}{suffix}",
            mode=AbsReaderWriter.MODE_TXT,
        )

    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)

    if f_dump_md:
        # Write the markdown.
        md_writer.write(
            content=md_content,
            path=f"{pdf_file_name}.md",
            mode=AbsReaderWriter.MODE_TXT,
        )

    if f_dump_middle_json:
        # Write the intermediate parse result.
        _dump_json(pipe.pdf_mid_data, "_middle.json")

    if f_dump_model_json:
        # Write the model result that was used for parsing.
        _dump_json(orig_model_list, "_model.json")

    if f_dump_orig_pdf:
        # Write a copy of the source PDF.
        md_writer.write(
            content=pdf_bytes,
            path=f"{pdf_file_name}_origin.pdf",
            mode=AbsReaderWriter.MODE_BIN,
        )

    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
    if f_dump_content_list:
        # Write the content list.
        _dump_json(content_list, "_content_list.json")

    logger.info(f"local output dir is '{local_md_dir}', you can found the result in it.")
许瑞's avatar
许瑞 committed
166
167


kernel.h@qq.com's avatar
kernel.h@qq.com committed
168
# Root command group; the sub-commands in this module attach themselves to it
# with ``@cli.command()``. It is documented with comments only: click would
# display a docstring as the group's help text.
@click.group()
@click.version_option(__version__, "--version", "-v", help="显示版本信息")
@click.help_option("--help", "-h", help="显示帮助信息")
def cli():
    # Nothing to do at group level; all work happens in the sub-commands.
    return None

许瑞's avatar
许瑞 committed
174

kernel.h@qq.com's avatar
kernel.h@qq.com committed
175
# Sub-command: read one jsonl record from S3 (``--json``, optionally with a
# ``?bytes=<start>,<length>`` range), fetch the PDF the record points to and
# run ``do_parse`` on it with the record's ``doc_layout_result``.
# Documented with comments rather than a docstring because click would print
# a docstring as this command's help text.
@cli.command()
@click.option("--json", type=str, help="输入一个S3路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
@click.option("--model_mode", type=click.STRING, default="full",
              help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
def json_command(json, method, inside_model, model_mode):
    model_config.__use_inside_model__ = inside_model
    model_config.__model_mode__ = model_mode

    # ``--json`` is not a required option, so ``json`` may be None here;
    # treat that the same as a non-S3 path instead of crashing on None.
    if not json or not json.startswith("s3://"):
        logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path")
        # Raise SystemExit directly; ``exit()`` only exists when ``site`` is loaded.
        raise SystemExit(1)

    def read_s3_path(s3path):
        # Read the object at ``s3path`` (optionally a byte range of it) as bytes.
        bucket, _ = parse_s3path(s3path)

        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        official_path = remove_non_official_s3_args(s3path)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", official_path)
        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or 2 != len(may_range_params):
            byte_start, byte_end = 0, None
        else:
            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
            # The second range value is added to the start (i.e. handled as a
            # length), giving an inclusive end offset.
            byte_end += byte_start - 1
        return s3_rw.read_jsonl(
            official_path,
            byte_start,
            byte_end,
            AbsReaderWriter.MODE_BIN,
        )

    jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
    # The PDF location is stored under "file_location", with "path" as fallback.
    s3_file_path = jso.get("file_location")
    if s3_file_path is None:
        s3_file_path = jso.get("path")
    if s3_file_path is None:
        # Fail with a clear message instead of a TypeError from Path(None).
        logger.error("json record has neither 'file_location' nor 'path'")
        raise SystemExit(1)
    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)

    do_parse(
        pdf_file_name,
        pdf_data,
        jso["doc_layout_result"],
        method,
    )
许瑞's avatar
许瑞 committed
227

kernel.h@qq.com's avatar
kernel.h@qq.com committed
228

赵小蒙's avatar
赵小蒙 committed
229
230
231
232
233
234
235
236
# Sub-command: iterate over a local jsonl file (``--local_json``); for each
# record, fetch the PDF it points to from S3 and run ``do_parse`` on it with
# the record's ``doc_layout_result``.
# Documented with comments rather than a docstring because click would print
# a docstring as this command's help text.
@cli.command()
@click.option("--local_json", type=str, help="输入一个本地jsonl路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
@click.option("--model_mode", type=click.STRING, default="full",
              help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
def local_json_command(local_json, method, inside_model, model_mode):
    model_config.__use_inside_model__ = inside_model
    model_config.__model_mode__ = model_mode

    # ``--local_json`` is not a required option; without this guard a missing
    # value would surface as a TypeError from ``open(None)``.
    if not local_json:
        logger.error("usage: magic-pdf local-json-command --local_json /path/to/some.jsonl")
        raise SystemExit(1)

    def read_s3_path(s3path):
        # Read the object at ``s3path`` (optionally a byte range of it) as bytes.
        bucket, _ = parse_s3path(s3path)

        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        official_path = remove_non_official_s3_args(s3path)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", official_path)
        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or 2 != len(may_range_params):
            byte_start, byte_end = 0, None
        else:
            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
            # The second range value is added to the start (i.e. handled as a
            # length), giving an inclusive end offset.
            byte_end += byte_start - 1
        return s3_rw.read_jsonl(
            official_path,
            byte_start,
            byte_end,
            AbsReaderWriter.MODE_BIN,
        )

    with open(local_json, "r", encoding="utf-8") as f:
        for line_no, json_line in enumerate(f, start=1):
            # Blank lines (e.g. a trailing newline) are not JSON records.
            if not json_line.strip():
                continue
            jso = json_parse.loads(json_line)

            # The PDF location is stored under "file_location", with "path"
            # as fallback.
            s3_file_path = jso.get("file_location")
            if s3_file_path is None:
                s3_file_path = jso.get("path")
            if s3_file_path is None:
                # Fail with a clear message instead of a TypeError from Path(None).
                logger.error(
                    f"{local_json}:{line_no}: record has neither 'file_location' nor 'path'"
                )
                raise SystemExit(1)
            pdf_file_name = Path(s3_file_path).stem
            pdf_data = read_s3_path(s3_file_path)
            do_parse(
                pdf_file_name,
                pdf_data,
                jso["doc_layout_result"],
                method,
            )
赵小蒙's avatar
赵小蒙 committed
279
280


kernel.h@qq.com's avatar
kernel.h@qq.com committed
281
# Sub-command: parse a local PDF (``--pdf``), or every PDF listed one per line
# in a file whose name ends with ".list". The model JSON comes from
# ``--model`` when given, otherwise from ``<pdf path without extension>.json``
# if that file exists, otherwise an empty model list is passed on.
# Documented with comments rather than a docstring because click would print
# a docstring as this command's help text.
@cli.command()
@click.option(
    "--pdf", type=click.Path(exists=True), required=True,
    help='pdf 文件路径, 支持单个文件或文件列表, 文件列表需要以".list"结尾, 一行一个pdf文件路径')
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
@click.option("--model_mode", type=click.STRING, default="full",
              help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
def pdf_command(pdf, model, method, inside_model, model_mode):
    model_config.__use_inside_model__ = inside_model
    model_config.__model_mode__ = model_mode

    def read_fn(path):
        # Read the file at ``path`` in binary mode through DiskReaderWriter.
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    def get_model_json(model_path, doc_path):
        # Return the model JSON text for ``doc_path``.
        if model_path is None:
            file_name_without_extension, extension = os.path.splitext(doc_path)
            # Compare case-insensitively so that e.g. "X.PDF" is accepted too.
            if extension.lower() != ".pdf":
                raise Exception("pdf_path input error")
            model_path = file_name_without_extension + ".json"
            if not os.path.exists(model_path):
                logger.warning(
                    f"not found json {model_path} existed"
                )
                # No local model data: pass an empty list so that do_parse
                # falls back to the built-in model.
                return "[]"
        return read_fn(model_path).decode("utf-8")

    def parse_doc(doc_path):
        # Parse a single document; failures are logged so that the remaining
        # documents of a ".list" input are still processed.
        try:
            file_name = str(Path(doc_path).stem)
            pdf_data = read_fn(doc_path)
            jso = json_parse.loads(get_model_json(model, doc_path))

            do_parse(
                file_name,
                pdf_data,
                jso,
                method,
            )

        except Exception as e:
            logger.exception(e)

    if not pdf:
        logger.error("Error: Missing argument '--pdf'.")
        # Same effect as ``exit(msg)`` without relying on the ``site`` module.
        raise SystemExit("Error: Missing argument '--pdf'.")

    if pdf.endswith(".list"):
        # A ".list" file holds one PDF path per line.
        with open(pdf, "r") as f:
            for line in f:
                doc_path = line.strip()
                # Skip blank lines instead of trying to parse an empty path.
                if doc_path:
                    parse_doc(doc_path)
    else:
        # Single document input.
        parse_doc(pdf)
许瑞's avatar
许瑞 committed
353

kernel.h@qq.com's avatar
kernel.h@qq.com committed
354

许瑞's avatar
许瑞 committed
355
356
if __name__ == "__main__":
    # Example invocation:
    #   python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
    cli()