magicpdf.py 5.3 KB
Newer Older
kernel.h@qq.com's avatar
kernel.h@qq.com committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
"""
这里实现2个click命令:
第一个:
 接收一个完整的s3路径,例如:s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
    1)根据~/magic-pdf.json里的ak,sk等,构造s3cliReader读取到这个jsonl的对应行,返回json对象。
    2)根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint,构造出s3cliReader用来读取pdf
    3)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter,用来保存截图
    4)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
    
    最后把以上步骤准备好的对象传入真正的解析API
    
第二个:
  接收1)pdf的本地路径。2)模型json文件(可选)。然后:
    1)根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置,构造出LocalImageWriter,用来保存截图
    2)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
    3)根据约定,根据pdf本地路径,推导出pdf模型的json,并读入
    

效果:
python magicpdf.py json-command --json  s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350 
python magicpdf.py pdf-command --pdf  /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json  或者 python magicpdf.py pdf-command --pdf  /home/llm/Downloads/xxxx.pdf
"""

许瑞's avatar
许瑞 committed
24
25
26
27
28
29
30
31
32
33
import click
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (
    parse_s3path,
    parse_s3_range_params,
    remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import get_local_dir
from magic_pdf.io.S3ReaderWriter import S3ReaderWriter, MODE_BIN
from magic_pdf.io.DiskReaderWriter import DiskReaderWriter
34
from magic_pdf.spark.spark_api import parse_union_pdf, parse_txt_pdf, parse_ocr_pdf
许瑞's avatar
许瑞 committed
35
36
37
import os
import json as json_parse
from datetime import datetime
kernel.h@qq.com's avatar
kernel.h@qq.com committed
38
39


40
41
42
43
44
45
46
47
48
49
50
# Allowed values for the --method option; used as the option type by the
# commands in this module.
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])


def get_pdf_parse_method(method):
    """Return the parser callable that corresponds to *method*.

    ``"ocr"`` maps to ``parse_ocr_pdf`` and ``"txt"`` maps to
    ``parse_txt_pdf``. Any other value, including ``"auto"``, falls back to
    ``parse_union_pdf``.
    """
    named_parsers = (
        ("ocr", parse_ocr_pdf),
        ("txt", parse_txt_pdf),
    )
    for name, parser in named_parsers:
        if method == name:
            return parser
    return parse_union_pdf


许瑞's avatar
许瑞 committed
51
52
def prepare_env():
    """Create today's local output directories and return their paths.

    The directories are ``images`` and ``md`` under
    ``<get_local_dir()>/magic-pdf/<YYYY-MM-DD>``, where the date is the
    current local date. Directories that already exist are left untouched.

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple of path strings.
    """
    root_dir = get_local_dir()
    today = datetime.now().strftime("%Y-%m-%d")
    dated_dir = os.path.join(root_dir, "magic-pdf", today)

    output_dirs = tuple(
        os.path.join(dated_dir, leaf) for leaf in ("images", "md")
    )
    for directory in output_dirs:
        os.makedirs(directory, exist_ok=True)
    return output_dirs
kernel.h@qq.com's avatar
kernel.h@qq.com committed
61
62
63
64
65
66


@click.group()
def cli():
    # Root click group; subcommands register on it through @cli.command().
    # Documented with a comment because click would show a docstring as the
    # group's --help text.
    pass

许瑞's avatar
许瑞 committed
67

kernel.h@qq.com's avatar
kernel.h@qq.com committed
68
@cli.command()
@click.option("--json", type=str, help="输入一个S3路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def json_command(json, method):
    # Parse the PDF described by one record of an S3-hosted jsonl file.
    #
    # json:   S3 path of the jsonl file; it may carry a range suffix that
    #         parse_s3_range_params understands (e.g. "?bytes=0,81350").
    # method: "ocr", "txt" or "auto"; selects the parser through
    #         get_pdf_parse_method.
    #
    # Exits with status 1 when --json is missing or is not an s3:// path.
    # (Kept as comments: click would turn a docstring into the --help text.)
    if not json or not json.startswith("s3://"):
        print(
            "usage: python magicpdf.py json-command --json s3://some_bucket/some_path"
        )
        # The os module has no exit(); raising SystemExit is what sys.exit
        # does and needs no extra import.
        raise SystemExit(1)

    def read_s3_path(s3path):
        # Read the bytes addressed by s3path, honouring its own range suffix.
        bucket, _ = parse_s3path(s3path)
        clean_path = remove_non_official_s3_args(s3path)

        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", clean_path)

        # Take the range from the path being read, not from the outer --json
        # argument; otherwise the jsonl record's range would also be applied
        # when this helper is used to fetch the PDF.
        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or len(may_range_params) != 2:
            byte_start, byte_end = 0, None
        else:
            # NOTE(review): both values are forwarded unchanged; confirm
            # whether read_jsonl expects the second one as an end offset or
            # as a length.
            byte_start = int(may_range_params[0])
            byte_end = int(may_range_params[1])
        return s3_rw.read_jsonl(clean_path, byte_start, byte_end, MODE_BIN)

    # The jsonl record carries both the PDF location and its layout result.
    jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
    pdf_data = read_s3_path(jso["file_location"])
    local_image_dir, _ = prepare_env()

    local_image_rw = DiskReaderWriter(local_image_dir)
    parse = get_pdf_parse_method(method)
    parse(pdf_data, jso["doc_layout_result"], local_image_rw, is_debug=True)
许瑞's avatar
许瑞 committed
104

kernel.h@qq.com's avatar
kernel.h@qq.com committed
105
106

@cli.command()
@click.option(
    "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
)
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def pdf_command(pdf, model, method):
    # Parse a local PDF together with its model (layout) json file.
    #
    # pdf:    path of an existing PDF file.
    # model:  path of the model json; when omitted it is derived from the
    #         PDF path by swapping the extension for ".json".
    # method: "ocr", "txt" or "auto"; selects the parser through
    #         get_pdf_parse_method.
    #
    # Exits with status 1 when the derived model json does not exist.
    # (Kept as comments: click would turn a docstring into the --help text.)
    if model is None:
        # Swap only the final extension; str.replace would also rewrite any
        # ".pdf" that happens to occur earlier in the path.
        model = os.path.splitext(pdf)[0] + ".json"
        if not os.path.exists(model):
            print(
                f"make sure json file existed and place under {os.path.dirname(pdf)}"
            )
            # The os module has no exit(); raising SystemExit is what
            # sys.exit does and needs no extra import.
            raise SystemExit(1)

    def read_fn(path):
        # Read a local file as bytes through DiskReaderWriter, which is
        # rooted at the file's parent directory.
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), MODE_BIN)

    pdf_data = read_fn(pdf)
    jso = json_parse.loads(read_fn(model).decode("utf-8"))

    local_image_dir, _ = prepare_env()
    local_image_rw = DiskReaderWriter(local_image_dir)
    parse = get_pdf_parse_method(method)
    parse(pdf_data, jso, local_image_rw, is_debug=True)
许瑞's avatar
许瑞 committed
136

kernel.h@qq.com's avatar
kernel.h@qq.com committed
137

许瑞's avatar
许瑞 committed
138
139
140
141
if __name__ == "__main__":
    # Example invocation:
    #   python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/format/v070/part-66028dd46437-000076.jsonl?bytes=0,308393
    cli()