magicpdf.py 6.93 KB
Newer Older
kernel.h@qq.com's avatar
kernel.h@qq.com committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
"""
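This module implements two click commands.

First command (json-command):
  Takes a complete S3 path, e.g. s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
    1) Using the ak, sk, etc. from ~/magic-pdf.json, build an s3cliReader, read the
       matching line of that jsonl file and return it as a JSON object.
    2) From the PDF's S3 path inside that JSON object, look up its ak, sk and endpoint
       and build an s3cliReader for reading the PDF.
    3) Read from magic-pdf.json the local temporary directory for images, markdown,
       etc., and build a LocalImageWriter for saving screenshots.
    4) Read from magic-pdf.json the local temporary directory for images, markdown,
       etc., and build a LocalIRdWriter for reading and writing local files.

    Finally, pass the objects prepared in the steps above to the actual parsing API.

Second command (pdf-command):
  Takes 1) the local path of a PDF and 2) a model JSON file (optional). Then:
    1) Read from ~/magic-pdf.json the local temporary directory for images, markdown,
       etc., and build a LocalImageWriter for saving screenshots.
    2) Read from magic-pdf.json the local temporary directory for images, markdown,
       etc., and build a LocalIRdWriter for reading and writing local files.
    3) By convention, derive the path of the PDF's model JSON from the PDF's local
       path and read it.


Usage:
python magicpdf.py json-command --json  s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py pdf-command --pdf  /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json  or  python magicpdf.py pdf-command --pdf  /home/llm/Downloads/xxxx.pdf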
"""

许瑞's avatar
许瑞 committed
24
25
import os
import json as json_parse
许瑞's avatar
许瑞 committed
26
import click
27
from loguru import logger
kernel.h@qq.com's avatar
kernel.h@qq.com committed
28
from pathlib import Path
29

许瑞's avatar
许瑞 committed
30
from magic_pdf.pipe.UNIPipe import UNIPipe
许瑞's avatar
许瑞 committed
31
32
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
许瑞's avatar
许瑞 committed
33
34
35
36
37
38
39
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (
    parse_s3path,
    parse_s3_range_params,
    remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import get_local_dir
许瑞's avatar
许瑞 committed
40
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
kernel.h@qq.com's avatar
kernel.h@qq.com committed
41
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
许瑞's avatar
许瑞 committed
42
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
kernel.h@qq.com's avatar
kernel.h@qq.com committed
43

44
45
46
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])


kernel.h@qq.com's avatar
kernel.h@qq.com committed
47
def prepare_env(pdf_file_name):
    """Create the local output directories for one PDF and return them.

    The output root is ``<get_local_dir()>/magic-pdf/<pdf_file_name>``. Images
    get an ``images`` subdirectory below it; the markdown directory is the
    root itself.

    Args:
        pdf_file_name: Name used for the per-PDF output directory.

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple of directory paths, both
        of which exist when this function returns.
    """
    output_root = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name)
    image_dir = os.path.join(output_root, "images")

    # Same creation order as before: the images directory first (which also
    # creates the root), then the root itself as a harmless no-op.
    for directory in (image_dir, output_root):
        os.makedirs(directory, exist_ok=True)

    return image_dir, output_root
kernel.h@qq.com's avatar
kernel.h@qq.com committed
57
58


kernel.h@qq.com's avatar
kernel.h@qq.com committed
59
def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir):
许瑞's avatar
许瑞 committed
60
61
62
63
64
65
66
67
68
69
70
71
72
    if parse_method == "auto":
        pipe = UNIPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
    elif parse_method == "txt":
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
    elif parse_method == "ocr":
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
    else:
        print("unknow parse method")
        os.exit(1)

    pipe.pipe_classify()
    pipe.pipe_parse()
    md_content = pipe.pipe_mk_markdown()
kernel.h@qq.com's avatar
kernel.h@qq.com committed
73
    #part_file_name = datetime.now().strftime("%H-%M-%S")
许瑞's avatar
许瑞 committed
74
    md_writer.write(
kernel.h@qq.com's avatar
kernel.h@qq.com committed
75
        content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
许瑞's avatar
许瑞 committed
76
77
78
    )
    md_writer.write(
        content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
kernel.h@qq.com's avatar
kernel.h@qq.com committed
79
        path=f"{pdf_file_name}.json",
许瑞's avatar
许瑞 committed
80
81
        mode=AbsReaderWriter.MODE_TXT,
    )
82
83
84
85
86
87
88
    # try:
    #     content_list = pipe.pipe_mk_uni_format()
    # except Exception as e:
    #     logger.exception(e)
    # md_writer.write(
    #     str(content_list), f"{part_file_name}.txt", AbsReaderWriter.MODE_TXT
    # )
许瑞's avatar
许瑞 committed
89
90


kernel.h@qq.com's avatar
kernel.h@qq.com committed
91
92
93
94
# Root click command group; the sub-commands below attach themselves to it
# with @cli.command(). It does no work of its own.
# Documented with comments rather than a docstring because click would show
# a docstring as the group's --help text.
@click.group()
def cli():
    pass

许瑞's avatar
许瑞 committed
95

kernel.h@qq.com's avatar
kernel.h@qq.com committed
96
@cli.command()
@click.option("--json", type=str, help="输入一个S3路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def json_command(json, method):
    # Parse the PDF described by one JSONL record stored on S3.
    #
    # json:   S3 path of the record, optionally carrying a ?bytes=start,length
    #         suffix that selects a slice of the object.
    # method: parsing method ("ocr", "txt" or "auto").
    #
    # Exits with status 1 when --json is missing or is not an s3:// path.
    # (Comments instead of a docstring so that click's --help output is unchanged.)

    # --json is not a required option, so it can arrive as None.
    if not json or not json.startswith("s3://"):
        print("usage: python magipdf.py --json s3://some_bucket/some_path")
        # ``os.exit`` does not exist; SystemExit ends the process with status 1.
        raise SystemExit(1)

    def read_s3_path(s3path):
        # Read an S3 object, or only the slice named by its ?bytes= suffix,
        # in binary mode.
        bucket, _ = parse_s3path(s3path)
        clean_path = remove_non_official_s3_args(s3path)

        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", clean_path)

        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or len(may_range_params) != 2:
            byte_start, byte_end = 0, None
        else:
            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
            # The second value is used as a length: turn it into the offset
            # of the last byte to read.
            byte_end += byte_start - 1

        return s3_rw.read_jsonl(
            clean_path,
            byte_start,
            byte_end,
            AbsReaderWriter.MODE_BIN,
        )

    jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
    s3_file_path = jso["file_location"]
    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)
    local_image_dir, local_md_dir = prepare_env(pdf_file_name)

    local_image_rw = DiskReaderWriter(local_image_dir)
    local_md_rw = DiskReaderWriter(local_md_dir)

    _do_parse(
        pdf_file_name,
        pdf_data,
        jso["doc_layout_result"],
        method,
        local_image_rw,
        local_md_rw,
        os.path.basename(local_image_dir),
    )
许瑞's avatar
许瑞 committed
148

kernel.h@qq.com's avatar
kernel.h@qq.com committed
149
150

@cli.command()
@click.option(
    "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
)
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def pdf_command(pdf, model, method):
    # Parse a local PDF together with its model JSON file.
    #
    # pdf:    path of an existing PDF file.
    # model:  path of the model JSON; when omitted, the file with the PDF's
    #         name and a .json extension in the same directory is used.
    # method: parsing method ("ocr", "txt" or "auto").
    #
    # Exits with status 1 when no model file can be found.
    # (Comments instead of a docstring so that click's --help output is unchanged.)
    if model is None:
        # Swap only the final extension. str.replace(".pdf", ".json") also
        # rewrote ".pdf" elsewhere in the path and ignored other casings.
        model = os.path.splitext(pdf)[0] + ".json"
        if not os.path.exists(model):
            # ``os.dirname`` and ``os.exit`` do not exist; use os.path.dirname
            # and SystemExit so the message prints and the process exits with 1.
            print(f"make sure json file existed and place under {os.path.dirname(pdf)}")
            raise SystemExit(1)

    def read_fn(path):
        # Read a local file in binary mode through a DiskReaderWriter
        # created for the file's directory.
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    pdf_data = read_fn(pdf)
    jso = json_parse.loads(read_fn(model).decode("utf-8"))
    pdf_file_name = Path(pdf).stem
    local_image_dir, local_md_dir = prepare_env(pdf_file_name)

    local_image_rw = DiskReaderWriter(local_image_dir)
    local_md_rw = DiskReaderWriter(local_md_dir)

    _do_parse(
        pdf_file_name,
        pdf_data,
        jso,
        method,
        local_image_rw,
        local_md_rw,
        os.path.basename(local_image_dir),
    )
许瑞's avatar
许瑞 committed
189

kernel.h@qq.com's avatar
kernel.h@qq.com committed
190

许瑞's avatar
许瑞 committed
191
192
if __name__ == "__main__":
    # Example invocation:
    #   python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
    cli()