magicpdf.py 6.15 KB
Newer Older
kernel.h@qq.com's avatar
kernel.h@qq.com committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
"""
这里实现2个click命令:
第一个:
 接收一个完整的s3路径,例如:s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
    1)根据~/magic-pdf.json里的ak,sk等,构造s3cliReader读取到这个jsonl的对应行,返回json对象。
    2)根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint,构造出s3cliReader用来读取pdf
    3)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter,用来保存截图
    4)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
    
    最后把以上步骤准备好的对象传入真正的解析API
    
第二个:
  接收1)pdf的本地路径。2)模型json文件(可选)。然后:
    1)根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置,构造出LocalImageWriter,用来保存截图
    2)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
    3)根据约定,根据pdf本地路径,推导出pdf模型的json,并读入
    

效果:
python magicpdf.py json-command --json  s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py pdf-command --pdf  /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json
python magicpdf.py pdf-command --pdf  /home/llm/Downloads/xxxx.pdf
"""

许瑞's avatar
许瑞 committed
24
25
26
import os
import json as json_parse
from datetime import datetime
许瑞's avatar
许瑞 committed
27
import click
许瑞's avatar
许瑞 committed
28
from magic_pdf.pipe.UNIPipe import UNIPipe
许瑞's avatar
许瑞 committed
29
30
31
32
33
34
35
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (
    parse_s3path,
    parse_s3_range_params,
    remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import get_local_dir
许瑞's avatar
许瑞 committed
36
from magic_pdf.io.S3ReaderWriter import S3ReaderWriter, MODE_BIN, MODE_TXT
许瑞's avatar
许瑞 committed
37
from magic_pdf.io.DiskReaderWriter import DiskReaderWriter
许瑞's avatar
许瑞 committed
38
from magic_pdf.libs.json_compressor import JsonCompressor
kernel.h@qq.com's avatar
kernel.h@qq.com committed
39
40


41
42
43
# Allowed values for the ``--method`` option used by the CLI commands in this file.
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])


许瑞's avatar
许瑞 committed
44
45
def prepare_env():
    """Create today's output directories and return their paths.

    The directories are ``<local dir>/magic-pdf/<YYYY-MM-DD>/images`` and
    ``<local dir>/magic-pdf/<YYYY-MM-DD>/md``, where ``<local dir>`` is the
    value returned by ``get_local_dir()``. Directories that already exist
    are reused.

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple of directory paths.
    """
    base_dir = get_local_dir()
    today = datetime.now().strftime("%Y-%m-%d")
    day_root = os.path.join(base_dir, "magic-pdf", today)

    image_dir = os.path.join(day_root, "images")
    md_dir = os.path.join(day_root, "md")
    for directory in (image_dir, md_dir):
        os.makedirs(directory, exist_ok=True)
    return image_dir, md_dir
kernel.h@qq.com's avatar
kernel.h@qq.com committed
54
55


许瑞's avatar
许瑞 committed
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir):
    """Run ``UNIPipe`` on a PDF and write the markdown and JSON results.

    Args:
        pdf_bytes: PDF content handed to ``UNIPipe.parse``.
        model_list: value forwarded to the pipe under the ``"model_list"`` key.
        parse_method: ``"ocr"`` selects the ``"ocr"`` pdf type; every other
            value (including ``"auto"``) falls back to ``"txt"``.
        image_writer: writer object handed to ``UNIPipe.parse``.
        md_writer: writer that receives ``<HH-MM-SS>.md`` and
            ``<HH-MM-SS>.json``.
        image_dir: directory path handed to ``UNIPipe.mk_markdown``.
    """
    pipe = UNIPipe()
    pdf_type = "ocr" if parse_method == "ocr" else "txt"
    pipe_input = {"_pdf_type": pdf_type, "model_list": model_list}

    mid_data = pipe.parse(pdf_bytes, image_writer, pipe_input)
    markdown = UNIPipe.mk_markdown(mid_data, image_dir)

    # Output files are named after the current wall-clock time.
    stem = datetime.now().strftime("%H-%M-%S")
    md_writer.write(content=markdown, path=f"{stem}.md", mode=MODE_TXT)

    # The intermediate data is decompressed before being dumped as JSON.
    mid_json = json_parse.dumps(
        JsonCompressor.decompress_json(mid_data), ensure_ascii=False, indent=4
    )
    md_writer.write(content=mid_json, path=f"{stem}.json", mode=MODE_TXT)


kernel.h@qq.com's avatar
kernel.h@qq.com committed
78
79
80
81
# Root click command group; the commands in this file attach to it through
# ``@cli.command()``. Documented with a comment rather than a docstring
# because click would display a docstring as the group's help text.
@click.group()
def cli():
    pass

许瑞's avatar
许瑞 committed
82

kernel.h@qq.com's avatar
kernel.h@qq.com committed
83
# json_command: parse the PDF referenced by one JSONL record stored on S3.
#
# ``json`` is an S3 path, optionally carrying a ``?bytes=<start>,<length>``
# suffix that selects one record of the JSONL file. The record's
# ``file_location`` field is the S3 path of the PDF to parse. Results are
# written to the local directories returned by ``prepare_env()``.
#
# Documented with comments rather than a docstring on purpose: click would
# display a docstring as the command's ``--help`` text.
@cli.command()
@click.option("--json", type=str, help="输入一个S3路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def json_command(json, method):
    # ``--json`` is not a required option, so ``json`` is None when omitted.
    if json is None or not json.startswith("s3://"):
        print(
            "usage: python magicpdf.py json-command --json s3://some_bucket/some_path"
        )
        # ``os.exit`` does not exist; SystemExit is what ``sys.exit`` raises.
        raise SystemExit(1)

    def read_s3_path(s3path):
        """Read ``s3path`` (honouring its own optional byte range) as bytes."""
        bucket, _ = parse_s3path(s3path)

        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        clean_path = remove_non_official_s3_args(s3path)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", clean_path)

        # The range must come from the path being read, not from the outer
        # ``json`` argument; otherwise the PDF would be fetched with the
        # JSONL record's byte range.
        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or len(may_range_params) != 2:
            byte_start, byte_end = 0, None
        else:
            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
            # The second value is treated as a length: turn it into an
            # inclusive end offset.
            byte_end += byte_start - 1
        return s3_rw.read_jsonl(clean_path, byte_start, byte_end, MODE_BIN)

    jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
    pdf_data = read_s3_path(jso["file_location"])

    local_image_dir, local_md_dir = prepare_env()
    local_image_rw = DiskReaderWriter(local_image_dir)
    local_md_rw = DiskReaderWriter(local_md_dir)

    # NOTE(review): the whole JSONL record is passed as ``model_list``;
    # confirm whether only a sub-field of the record should be passed.
    _do_parse(
        pdf_data,
        jso,
        method,
        local_image_rw,
        local_md_rw,
        local_image_dir,
    )
许瑞's avatar
许瑞 committed
129

kernel.h@qq.com's avatar
kernel.h@qq.com committed
130
131

# pdf_command: parse a local PDF using a model-output JSON file.
#
# When ``--model`` is not given, the model file is expected next to the PDF
# with the same base name and a ``.json`` extension. Results are written to
# the local directories returned by ``prepare_env()``.
#
# Documented with comments rather than a docstring on purpose: click would
# display a docstring as the command's ``--help`` text.
@cli.command()
@click.option(
    "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
)
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def pdf_command(pdf, model, method):
    # Handle the PDF and model related logic here.
    if model is None:
        # Swap only the file extension. ``str.replace`` would also rewrite
        # ".pdf" inside directory names and would leave a path such as
        # "x.PDF" unchanged, making the "model" the PDF itself.
        model = os.path.splitext(pdf)[0] + ".json"
        if not os.path.exists(model):
            print(
                f"make sure json file existed and place under {os.path.dirname(pdf)}"
            )
            # ``os.eixt`` was a typo; SystemExit is what ``sys.exit`` raises.
            raise SystemExit(1)

    def read_fn(path):
        """Read the file at ``path`` as bytes through ``DiskReaderWriter``."""
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), MODE_BIN)

    pdf_data = read_fn(pdf)
    jso = json_parse.loads(read_fn(model).decode("utf-8"))

    local_image_dir, local_md_dir = prepare_env()
    local_image_rw = DiskReaderWriter(local_image_dir)
    local_md_rw = DiskReaderWriter(local_md_dir)
    _do_parse(
        pdf_data,
        jso,
        method,
        local_image_rw,
        local_md_rw,
        local_image_dir,
    )
许瑞's avatar
许瑞 committed
168

kernel.h@qq.com's avatar
kernel.h@qq.com committed
169

许瑞's avatar
许瑞 committed
170
171
if __name__ == "__main__":
    # Example invocation:
    #   python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
    cli()