magicpdf.py 9.52 KB
Newer Older
kernel.h@qq.com's avatar
kernel.h@qq.com committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
"""
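Command-line entry points for magic-pdf, implemented as click commands. The two main ones are:

First (json-command):
  Takes a full S3 path to a jsonl record, for example:
  s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
    1) Using the ak, sk, etc. configured in ~/magic-pdf.json, build an S3 reader, read the matching line of the jsonl and load it as a JSON object.
    2) From the PDF's S3 path stored in that JSON object, look up its ak, sk and endpoint and build an S3 reader for fetching the PDF.
    3) Read from magic-pdf.json the local temporary directory used for images, markdown, etc., and build a local image writer (DiskReaderWriter) for saving cropped images.
    4) From the same directory setting in magic-pdf.json, build a local reader/writer (DiskReaderWriter) for reading and writing local files.

    Finally, pass the objects prepared in the steps above to the actual parsing API.

Second (pdf-command):
  Takes 1) the local path of a PDF and 2) a model JSON file (optional). Then:
    1) Read from ~/magic-pdf.json the local temporary directory used for images, markdown, etc., and build a local image writer (DiskReaderWriter) for saving cropped images.
    2) From the same directory setting in magic-pdf.json, build a local reader/writer (DiskReaderWriter) for reading and writing local files.
    3) When no model file is given, derive the model JSON path from the PDF's local path by convention and load it.


Usage:
python magicpdf.py json-command --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json
  or: python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf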
"""

许瑞's avatar
许瑞 committed
24
25
import os
import json as json_parse
26
import sys
许瑞's avatar
许瑞 committed
27
import click
28
from loguru import logger
kernel.h@qq.com's avatar
kernel.h@qq.com committed
29
from pathlib import Path
30

31
from magic_pdf.libs.MakeContentConfig import DropMode
赵小蒙's avatar
赵小蒙 committed
32
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
许瑞's avatar
许瑞 committed
33
from magic_pdf.pipe.UNIPipe import UNIPipe
许瑞's avatar
许瑞 committed
34
35
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
许瑞's avatar
许瑞 committed
36
37
38
39
40
41
42
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (
    parse_s3path,
    parse_s3_range_params,
    remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import get_local_dir
许瑞's avatar
许瑞 committed
43
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
kernel.h@qq.com's avatar
kernel.h@qq.com committed
44
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
许瑞's avatar
许瑞 committed
45
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
kernel.h@qq.com's avatar
kernel.h@qq.com committed
46

47
48
49
# Allowed values for the --method option of the commands in this module.
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])


赵小蒙's avatar
赵小蒙 committed
50
def prepare_env(pdf_file_name, method):
    """Create and return the output directories for one PDF / method pair.

    Outputs live under ``<get_local_dir()>/magic-pdf/<pdf_file_name>/<method>``;
    images go into an ``images`` sub-directory of that folder.

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple of directory paths, both
        of which exist when the function returns.
    """
    output_root = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
    image_root = os.path.join(output_root, "images")
    for directory in (image_root, output_root):
        os.makedirs(directory, exist_ok=True)
    return image_root, output_root
kernel.h@qq.com's avatar
kernel.h@qq.com committed
60
61


赵小蒙's avatar
赵小蒙 committed
62
def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir, local_md_dir):
    """Run the selected parsing pipeline on one PDF and write its outputs.

    Args:
        pdf_file_name: Base name used for the ``.md``, ``.json`` and ``.txt``
            output files.
        pdf_bytes: Raw content of the PDF.
        model_list: Model output handed to the pipeline.
        parse_method: One of ``"auto"``, ``"txt"`` or ``"ocr"``.
        image_writer: Writer object passed to the pipeline constructor.
        md_writer: Writer used for the three output files.
        image_dir: Image directory passed to the markdown and uni-format
            generators.
        local_md_dir: Directory handed to the bbox drawing helpers
            (presumably where they place their output -- confirm).

    Raises:
        SystemExit: With exit code 1 when ``parse_method`` is not recognised.
    """
    if parse_method == "auto":
        # UNIPipe receives the model output wrapped in a dict alongside an
        # empty "_pdf_type" entry.
        jso_useful_key = {
            "_pdf_type": "",
            "model_list": model_list,
        }
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
    elif parse_method == "txt":
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    elif parse_method == "ocr":
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    else:
        # Report on stderr (like pdf_command does) and name the bad value.
        print(f"unknown parse method: {parse_method!r}", file=sys.stderr)
        sys.exit(1)

    pipe.pipe_classify()
    pipe.pipe_parse()

    # Debug drawings of the detected layout and spans.
    pdf_info = pipe.pdf_mid_data['pdf_info']
    draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
    draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)

    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
    md_writer.write(
        content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
    )
    md_writer.write(
        content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
        path=f"{pdf_file_name}.json",
        mode=AbsReaderWriter.MODE_TXT,
    )

    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
    # NOTE(review): this is written via str(), so the .txt file holds the
    # Python string form of content_list rather than JSON.
    md_writer.write(
        str(content_list), f"{pdf_file_name}.txt", AbsReaderWriter.MODE_TXT
    )
许瑞's avatar
许瑞 committed
97
98


kernel.h@qq.com's avatar
kernel.h@qq.com committed
99
100
101
102
# Root click command group; sub-commands attach to it via @cli.command().
# A `#` comment is used instead of a docstring because click would show a
# docstring as the group's --help text.
@click.group()
def cli():
    pass

许瑞's avatar
许瑞 committed
103

kernel.h@qq.com's avatar
kernel.h@qq.com committed
104
@cli.command()
@click.option("--json", type=str, help="输入一个S3路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def json_command(json, method):
    # Parse the PDF described by a single jsonl record stored on S3.
    #
    # json:   S3 path of the record, optionally with a "?bytes=<a>,<b>" suffix.
    # method: parsing method, one of the values in parse_pdf_methods.
    #
    # Exits with code 1 when --json is missing or not an s3:// path, or when
    # the record does not say where the PDF lives.
    # (Comments rather than a docstring, so the click --help text is unchanged.)

    # --json is not declared as required, so it may arrive as None.
    if json is None or not json.startswith("s3://"):
        print(
            "usage: python magicpdf.py json-command --json s3://some_bucket/some_path",
            file=sys.stderr,
        )
        sys.exit(1)

    def read_s3_path(s3path):
        # Read an S3 object using the credentials configured for its bucket.
        bucket, _ = parse_s3path(s3path)

        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        s3_rw = S3ReaderWriter(
            s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
        )
        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or 2 != len(may_range_params):
            # No usable range: read from the start with no end bound.
            byte_start, byte_end = 0, None
        else:
            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
            # The second range value is treated as a length; turn it into an
            # inclusive end offset.
            byte_end += byte_start - 1
        return s3_rw.read_jsonl(
            remove_non_official_s3_args(s3path),
            byte_start,
            byte_end,
            AbsReaderWriter.MODE_BIN,
        )

    jso = json_parse.loads(read_s3_path(json).decode("utf-8"))

    # The PDF location is stored under "file_location" or, failing that, "path".
    s3_file_path = jso.get("file_location")
    if s3_file_path is None:
        s3_file_path = jso.get("path")
    if s3_file_path is None:
        print(
            'record has neither "file_location" nor "path"; cannot locate the pdf',
            file=sys.stderr,
        )
        sys.exit(1)

    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)
    local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)

    local_image_rw = DiskReaderWriter(local_image_dir)
    local_md_rw = DiskReaderWriter(local_md_dir)

    _do_parse(
        pdf_file_name,
        pdf_data,
        jso["doc_layout_result"],
        method,
        local_image_rw,
        local_md_rw,
        # Only the directory's base name is handed over as the image dir.
        os.path.basename(local_image_dir),
        local_md_dir
    )
许瑞's avatar
许瑞 committed
159

kernel.h@qq.com's avatar
kernel.h@qq.com committed
160

赵小蒙's avatar
赵小蒙 committed
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
    # NOTE(review): this command is defined one indentation level deep here;
    # confirm it is actually registered on `cli` at import time.
    @cli.command()
    @click.option("--local_json", type=str, help="输入一个本地jsonl路径")
    @click.option(
        "--method",
        type=parse_pdf_methods,
        help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
        default="auto",
    )
    def local_json_command(local_json, method):
        # Parse every PDF referenced by the records of a local jsonl file.
        #
        # local_json: path of the local jsonl file, one JSON record per line.
        # method:     parsing method, one of the values in parse_pdf_methods.
        #
        # Exits with code 1 when --local_json is missing or when a record does
        # not say where its PDF lives.
        # (Comments rather than a docstring, so the click --help text is unchanged.)

        # --local_json is not declared as required, so it may arrive as None.
        if local_json is None:
            print("missing required option --local_json", file=sys.stderr)
            sys.exit(1)

        def read_s3_path(s3path):
            # Read an S3 object using the credentials configured for its bucket.
            bucket, _ = parse_s3path(s3path)

            s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
            s3_rw = S3ReaderWriter(
                s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
            )
            may_range_params = parse_s3_range_params(s3path)
            if may_range_params is None or 2 != len(may_range_params):
                # No usable range: read from the start with no end bound.
                byte_start, byte_end = 0, None
            else:
                byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
                # The second range value is treated as a length; turn it into
                # an inclusive end offset.
                byte_end += byte_start - 1
            return s3_rw.read_jsonl(
                remove_non_official_s3_args(s3path),
                byte_start,
                byte_end,
                AbsReaderWriter.MODE_BIN,
            )

        with open(local_json, "r", encoding="utf-8") as f:
            for line_no, json_line in enumerate(f, start=1):
                # Blank lines (e.g. a trailing newline) are not records.
                if not json_line.strip():
                    continue
                jso = json_parse.loads(json_line)

                # The PDF location is stored under "file_location" or,
                # failing that, "path".
                s3_file_path = jso.get("file_location")
                if s3_file_path is None:
                    s3_file_path = jso.get("path")
                if s3_file_path is None:
                    print(
                        f'{local_json}:{line_no}: record has neither "file_location" nor "path"',
                        file=sys.stderr,
                    )
                    sys.exit(1)

                pdf_file_name = Path(s3_file_path).stem
                pdf_data = read_s3_path(s3_file_path)
                local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)

                local_image_rw = DiskReaderWriter(local_image_dir)
                local_md_rw = DiskReaderWriter(local_md_dir)

                _do_parse(
                    pdf_file_name,
                    pdf_data,
                    jso["doc_layout_result"],
                    method,
                    local_image_rw,
                    local_md_rw,
                    os.path.basename(local_image_dir),
                    local_md_dir
                )


kernel.h@qq.com's avatar
kernel.h@qq.com committed
217
@cli.command()
@click.option(
    "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
)
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def pdf_command(pdf, model, method):
    # Parse a local PDF using a local model JSON file.
    #
    # pdf:    path of the PDF file.
    # model:  path of the model JSON; when omitted, the file next to the PDF
    #         with the same name and a ".json" extension is used.
    # method: parsing method, one of the values in parse_pdf_methods.
    #
    # Exits with code 1 when no model path is given and the derived one does
    # not exist.
    # (Comments rather than a docstring, so the click --help text is unchanged.)
    if model is None:
        # Swap only the final extension. str.replace(".pdf", ".json") would
        # rewrite every ".pdf" in the path and leave names such as "x.PDF"
        # unchanged, making the PDF itself get loaded as the model JSON.
        model = os.path.splitext(pdf)[0] + ".json"
        if not os.path.exists(model):
            print(f"make sure json {model} existed and place under {os.path.dirname(pdf)}", file=sys.stderr)
            # sys.exit rather than the bare exit() builtin, which is only
            # installed by the site module.
            sys.exit(1)

    def read_fn(path):
        # Read a local file as bytes through DiskReaderWriter.
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    pdf_data = read_fn(pdf)
    jso = json_parse.loads(read_fn(model).decode("utf-8"))
    pdf_file_name = Path(pdf).stem
    local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)

    local_image_rw = DiskReaderWriter(local_image_dir)
    local_md_rw = DiskReaderWriter(local_md_dir)

    _do_parse(
        pdf_file_name,
        pdf_data,
        # The whole parsed model file is passed on as the model list.
        jso,
        method,
        local_image_rw,
        local_md_rw,
        os.path.basename(local_image_dir),
        local_md_dir
    )
许瑞's avatar
许瑞 committed
257

kernel.h@qq.com's avatar
kernel.h@qq.com committed
258

许瑞's avatar
许瑞 committed
259
260
if __name__ == "__main__":
    # Example invocation:
    #   python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
    cli()