magicpdf.py 11.8 KB
Newer Older
kernel.h@qq.com's avatar
kernel.h@qq.com committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
"""
这里实现2个click命令:
第一个:
 接收一个完整的s3路径,例如:s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
    1)根据~/magic-pdf.json里的ak,sk等,构造s3cliReader读取到这个jsonl的对应行,返回json对象。
    2)根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint,构造出s3cliReader用来读取pdf
    3)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter,用来保存截图
    4)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
    
    最后把以上步骤准备好的对象传入真正的解析API
    
第二个:
  接收1)pdf的本地路径。2)模型json文件(可选)。然后:
    1)根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置,构造出LocalImageWriter,用来保存截图
    2)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
    3)根据约定,根据pdf本地路径,推导出pdf模型的json,并读入
    

效果:
python magicpdf.py json-command --json  s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350 
python magicpdf.py pdf-command --pdf  /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json  或者 python magicpdf.py pdf-command --pdf  /home/llm/Downloads/xxxx.pdf
"""

许瑞's avatar
许瑞 committed
24
25
import os
import json as json_parse
26
import sys
许瑞's avatar
许瑞 committed
27
import click
28
from loguru import logger
kernel.h@qq.com's avatar
kernel.h@qq.com committed
29
from pathlib import Path
赵小蒙's avatar
赵小蒙 committed
30
from magic_pdf.libs.version import __version__
31

32
from magic_pdf.libs.MakeContentConfig import DropMode
赵小蒙's avatar
赵小蒙 committed
33
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
许瑞's avatar
许瑞 committed
34
from magic_pdf.pipe.UNIPipe import UNIPipe
许瑞's avatar
许瑞 committed
35
36
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
许瑞's avatar
许瑞 committed
37
38
39
40
41
42
43
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (
    parse_s3path,
    parse_s3_range_params,
    remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import get_local_dir
许瑞's avatar
许瑞 committed
44
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
kernel.h@qq.com's avatar
kernel.h@qq.com committed
45
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
许瑞's avatar
许瑞 committed
46
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
47
import csv
kernel.h@qq.com's avatar
kernel.h@qq.com committed
48

49
50
51
# Allowed values for the ``--method`` option; shared by every sub-command
# in this file that declares that option.
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])


赵小蒙's avatar
赵小蒙 committed
52
def prepare_env(pdf_file_name, method):
    """Create the local output directories for one PDF / parse-method pair.

    The output root is ``<get_local_dir()>/magic-pdf/<pdf_file_name>/<method>``
    and images go into an ``images`` sub-directory of that root.

    Args:
        pdf_file_name: Name of the PDF (used as a directory name).
        method: Parse method name (used as a directory name).

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple of directory paths; both
        directories exist when this function returns.
    """
    local_md_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
    local_image_dir = os.path.join(local_md_dir, "images")

    # The images directory lives inside local_md_dir, so creating it also
    # creates the parent; no second makedirs call is needed.
    os.makedirs(local_image_dir, exist_ok=True)
    return local_image_dir, local_md_dir
kernel.h@qq.com's avatar
kernel.h@qq.com committed
60
61


62
def write_to_csv(csv_file_path, csv_data):
    """Append a single row to a CSV file and report it on stdout.

    Args:
        csv_file_path: Path of the CSV file; opened in append mode, so it is
            created if missing and existing rows are kept.
        csv_data: Iterable of cell values forming one row.
    """
    # newline="" lets the csv module control line endings itself.
    with open(csv_file_path, mode="a", newline="", encoding="utf-8") as csvfile:
        csv.writer(csvfile).writerow(csv_data)
    print(f"数据已成功追加到 '{csv_file_path}'")


blue's avatar
blue committed
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def do_parse(
    pdf_file_name,
    pdf_bytes,
    model_list,
    parse_method,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
):
    """Run the parsing pipeline on one PDF and dump the requested artifacts.

    Output directories are obtained from ``prepare_env(pdf_file_name,
    parse_method)``; all dumped files are written there.

    Args:
        pdf_file_name: Base name used for the output directory and files.
        pdf_bytes: Raw content of the PDF.
        model_list: Model output for the PDF; when empty, ``pipe_analyze()``
            is called to produce it.
        parse_method: One of ``"auto"``, ``"txt"`` or ``"ocr"``; any other
            value prints an error and exits the process with status 1.
        f_draw_span_bbox: Call ``draw_span_bbox`` on the parsed result.
        f_draw_layout_bbox: Call ``draw_layout_bbox`` on the parsed result.
        f_dump_md: Write ``<pdf_file_name>.md``.
        f_dump_middle_json: Write ``<pdf_file_name>_middle.json``.
        f_dump_model_json: Write ``<pdf_file_name>_model.json``.
        f_dump_orig_pdf: Write ``<pdf_file_name>_origin.pdf``.
        f_dump_content_list: Write ``<pdf_file_name>_content_list.json``.
    """
    local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
    image_writer = DiskReaderWriter(local_image_dir)
    md_writer = DiskReaderWriter(local_md_dir)
    # Must be a plain string: a stray trailing comma previously turned this
    # into a 1-tuple, which was then handed to the markdown / content-list
    # builders as the image directory.
    image_dir = os.path.basename(local_image_dir)

    if parse_method == "auto":
        jso_useful_key = {"_pdf_type": "", "model_list": model_list}
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
    elif parse_method == "txt":
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    elif parse_method == "ocr":
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    else:
        print("unknow parse method")
        sys.exit(1)

    pipe.pipe_classify()

    # If no valid model data was passed in, fall back to the built-in
    # (paddle) analysis.
    if not model_list:
        pipe.pipe_analyze()

    pipe.pipe_parse()

    pdf_info = pipe.pdf_mid_data["pdf_info"]
    if f_draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
    if f_draw_span_bbox:
        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)

    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)

    if f_dump_md:
        # Write the markdown.
        md_writer.write(
            content=md_content,
            path=f"{pdf_file_name}.md",
            mode=AbsReaderWriter.MODE_TXT,
        )

    if f_dump_middle_json:
        # Write the intermediate (middle) JSON.
        md_writer.write(
            content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
            path=f"{pdf_file_name}_middle.json",
            mode=AbsReaderWriter.MODE_TXT,
        )

    if f_dump_model_json:
        # Write the model JSON actually used by the pipe.
        md_writer.write(
            content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4),
            path=f"{pdf_file_name}_model.json",
            mode=AbsReaderWriter.MODE_TXT,
        )

    if f_dump_orig_pdf:
        # Write a copy of the source PDF.
        md_writer.write(
            content=pdf_bytes,
            path=f"{pdf_file_name}_origin.pdf",
            mode=AbsReaderWriter.MODE_BIN,
        )

    # Built unconditionally, as before, in case the call has side effects
    # on the pipe; only the dump itself is optional.
    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)

    if f_dump_content_list:
        # Write the content list.
        md_writer.write(
            content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
            path=f"{pdf_file_name}_content_list.json",
            mode=AbsReaderWriter.MODE_TXT,
        )
许瑞's avatar
许瑞 committed
160
161


kernel.h@qq.com's avatar
kernel.h@qq.com committed
162
@click.group()
@click.version_option(__version__, "--version", "-v", help="显示版本信息")
@click.help_option("--help", "-h", help="显示帮助信息")
def cli():
    # Root command group; the sub-commands below register themselves on it
    # via @cli.command(). Deliberately a comment rather than a docstring:
    # click would display a docstring as the group's help text.
    pass

许瑞's avatar
许瑞 committed
168

kernel.h@qq.com's avatar
kernel.h@qq.com committed
169
@cli.command()
@click.option("--json", type=str, required=True, help="输入一个S3路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def json_command(json, method):
    # Parse the PDF described by one JSON record stored on S3.
    #
    # `json` is an s3:// path to the record (optionally carrying a byte-range
    # query); the record supplies the PDF's S3 path ("file_location" or
    # "path") and its model output ("doc_layout_result").
    #
    # Documented with comments rather than a docstring so that click's
    # generated help text is unchanged.
    if not json.startswith("s3://"):
        print("usage: python magicpdf.py json-command --json s3://some_bucket/some_path")
        sys.exit(1)

    def read_s3_path(s3path):
        """Read the bytes addressed by ``s3path``, honouring its byte range."""
        bucket, _key = parse_s3path(s3path)
        clean_path = remove_non_official_s3_args(s3path)

        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", clean_path)

        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or len(may_range_params) != 2:
            byte_start, byte_end = 0, None
        else:
            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
            # The second range value is treated as a byte count; convert it
            # to an inclusive end offset.
            byte_end += byte_start - 1
        return s3_rw.read_jsonl(
            clean_path,
            byte_start,
            byte_end,
            AbsReaderWriter.MODE_BIN,
        )

    jso = json_parse.loads(read_s3_path(json).decode("utf-8"))

    s3_file_path = jso.get("file_location")
    if s3_file_path is None:
        s3_file_path = jso.get("path")
    if s3_file_path is None:
        # Fail with a clear message instead of a TypeError from Path(None).
        logger.error("json record has neither 'file_location' nor 'path'")
        sys.exit(1)

    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)

    # do_parse prepares the output directories and writers itself. Extra
    # positional arguments must not be passed: they would bind to its
    # boolean f_draw_* / f_dump_* flags.
    do_parse(
        pdf_file_name,
        pdf_data,
        jso["doc_layout_result"],
        method,
    )
许瑞's avatar
许瑞 committed
225

kernel.h@qq.com's avatar
kernel.h@qq.com committed
226

赵小蒙's avatar
赵小蒙 committed
227
228
229
230
231
232
233
234
235
236
237
@cli.command()
@click.option("--local_json", type=str, required=True, help="输入一个本地jsonl路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def local_json_command(local_json, method):
    # Parse every PDF listed in a local jsonl file.
    #
    # Each non-blank line is one JSON record supplying the PDF's S3 path
    # ("file_location" or "path") and its model output ("doc_layout_result").
    #
    # Documented with comments rather than a docstring so that click's
    # generated help text is unchanged.
    def read_s3_path(s3path):
        """Read the bytes addressed by ``s3path``, honouring its byte range."""
        bucket, _key = parse_s3path(s3path)
        clean_path = remove_non_official_s3_args(s3path)

        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", clean_path)

        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or len(may_range_params) != 2:
            byte_start, byte_end = 0, None
        else:
            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
            # The second range value is treated as a byte count; convert it
            # to an inclusive end offset.
            byte_end += byte_start - 1
        return s3_rw.read_jsonl(
            clean_path,
            byte_start,
            byte_end,
            AbsReaderWriter.MODE_BIN,
        )

    with open(local_json, "r", encoding="utf-8") as f:
        for json_line in f:
            # Blank lines (e.g. a trailing newline) are not records.
            if not json_line.strip():
                continue
            jso = json_parse.loads(json_line)

            s3_file_path = jso.get("file_location")
            if s3_file_path is None:
                s3_file_path = jso.get("path")
            if s3_file_path is None:
                # Skip instead of crashing in Path(None) so the remaining
                # records are still processed.
                logger.warning(
                    "skip record without 'file_location' or 'path'"
                )
                continue

            pdf_file_name = Path(s3_file_path).stem
            pdf_data = read_s3_path(s3_file_path)

            # do_parse prepares the output directories and writers itself.
            # Extra positional arguments must not be passed: they would bind
            # to its boolean f_draw_* / f_dump_* flags.
            do_parse(
                pdf_file_name,
                pdf_data,
                jso["doc_layout_result"],
                method,
            )
赵小蒙's avatar
赵小蒙 committed
281
282


kernel.h@qq.com's avatar
kernel.h@qq.com committed
283
@cli.command()
@click.option(
    "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
)
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def pdf_command(pdf, model, method):
    # Parse a local PDF, optionally with a local model JSON file.
    #
    # When --model is omitted, the model file is looked up next to the PDF
    # under the same name with a ".json" suffix; if that file does not exist
    # an empty model list is passed on, which makes do_parse run the built-in
    # analysis.
    #
    # Documented with comments rather than a docstring so that click's
    # generated help text is unchanged.
    def read_fn(path):
        """Read a local file as bytes through DiskReaderWriter."""
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    def get_model_json(model_path):
        """Return the model JSON text, or ``"[]"`` when no model file exists."""
        if model_path is None:
            # Swap only the final suffix. str.replace(".pdf", ".json") also
            # rewrote ".pdf" inside directory names and, for a path without
            # that suffix, returned the PDF's own path.
            model_path = str(Path(pdf).with_suffix(".json"))
            if not os.path.exists(model_path):
                logger.warning(
                    f"not found json {model_path} existed, use paddle analyze"
                )
                # No local model data: pass an empty list so that the
                # built-in paddle analysis is triggered downstream.
                return "[]"
        return read_fn(model_path).decode("utf-8")

    pdf_data = read_fn(pdf)
    jso = json_parse.loads(get_model_json(model))
    pdf_file_name = Path(pdf).stem

    # do_parse prepares the output directories and writers itself. Extra
    # positional arguments must not be passed: they would bind to its
    # boolean f_draw_* / f_dump_* flags.
    do_parse(
        pdf_file_name,
        pdf_data,
        jso,
        method,
    )
许瑞's avatar
许瑞 committed
335

kernel.h@qq.com's avatar
kernel.h@qq.com committed
336

许瑞's avatar
许瑞 committed
337
338
if __name__ == "__main__":
    # Example invocation:
    #   python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
    cli()