magicpdf.py 11.7 KB
Newer Older
kernel.h@qq.com's avatar
kernel.h@qq.com committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
"""
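This module implements two click commands.

First command:
  Takes a full S3 path, e.g. s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
    1) Using the ak, sk, etc. from ~/magic-pdf.json, build an s3cliReader, read the corresponding line of that jsonl and return it as a JSON object.
    2) From the PDF's S3 path in the JSON object, obtain its ak, sk and endpoint and build an s3cliReader for reading the PDF.
    3) Read from magic-pdf.json the local temporary directory used for images, Markdown, etc., and build a LocalImageWriter for saving screenshots.
    4) Read from magic-pdf.json the local temporary directory used for images, Markdown, etc., and build a LocalIRdWriter for reading and writing local files.

    Finally, pass the objects prepared in the steps above to the actual parsing API.

Second command:
  Takes 1) the local path of a PDF and 2) a model JSON file (optional). Then:
    1) Read from ~/magic-pdf.json the local temporary directory used for images, Markdown, etc., and build a LocalImageWriter for saving screenshots.
    2) Read from magic-pdf.json the local temporary directory used for images, Markdown, etc., and build a LocalIRdWriter for reading and writing local files.
    3) By convention, derive the path of the PDF's model JSON from the PDF's local path and load it.


Usage:
python magicpdf.py json-command --json  s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py pdf-command --pdf  /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json  or  python magicpdf.py pdf-command --pdf  /home/llm/Downloads/xxxx.pdf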
"""

许瑞's avatar
许瑞 committed
24
25
import copy
import csv
import json as json_parse
import os
import sys
from pathlib import Path

import click
from loguru import logger

from magic_pdf.libs.version import __version__
from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (
    parse_s3path,
    parse_s3_range_params,
    remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import get_local_dir
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
import magic_pdf.model as model_config
kernel.h@qq.com's avatar
kernel.h@qq.com committed
49

50
51
52
# Allowed values for the ``--method`` option shared by the sub-commands below.
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])


赵小蒙's avatar
赵小蒙 committed
53
def prepare_env(pdf_file_name, method):
    """Create and return the local output directories for one parse run.

    The Markdown/JSON output directory is
    ``<get_local_dir()>/magic-pdf/<pdf_file_name>/<method>`` and the image
    directory is its ``images`` sub-directory. Both are created when missing.

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple.
    """
    local_md_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
    local_image_dir = os.path.join(str(local_md_dir), "images")

    for directory in (local_image_dir, local_md_dir):
        os.makedirs(directory, exist_ok=True)

    return local_image_dir, local_md_dir
kernel.h@qq.com's avatar
kernel.h@qq.com committed
61
62


63
def write_to_csv(csv_file_path, csv_data):
    """Append ``csv_data`` as a single row to the CSV file at ``csv_file_path``.

    The file is opened in append mode with UTF-8 encoding; a message is
    logged once the row has been written.
    """
    with open(csv_file_path, mode="a", newline="", encoding="utf-8") as out_file:
        # One writer per call is enough: exactly one row is appended.
        csv.writer(out_file).writerow(csv_data)
    logger.info(f"数据已成功追加到 '{csv_file_path}'")
70
71


blue's avatar
blue committed
72
def do_parse(
    pdf_file_name,
    pdf_bytes,
    model_list,
    parse_method,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
):
    """Parse one PDF and write the requested artifacts to the local output dir.

    Args:
        pdf_file_name: Name used for the output directory and as the prefix
            of every output file.
        pdf_bytes: Raw content of the PDF.
        model_list: Model output for the PDF. If it is empty, the pipe's own
            analysis is run when ``model_config.__use_inside_model__`` is
            true; otherwise the process exits.
        parse_method: One of ``"auto"``, ``"txt"`` or ``"ocr"``.
        f_draw_span_bbox: Draw span bounding boxes into the output dir.
        f_draw_layout_bbox: Draw layout bounding boxes into the output dir.
        f_dump_md: Write ``<name>.md``.
        f_dump_middle_json: Write ``<name>_middle.json``.
        f_dump_model_json: Write ``<name>_model.json``.
        f_dump_orig_pdf: Write ``<name>_origin.pdf``.
        f_dump_content_list: Write ``<name>_content_list.json``.

    Raises:
        SystemExit: With status 1 when ``parse_method`` is unknown, or when no
            model data was supplied and the built-in model is disabled.
    """
    # Snapshot of the model data as supplied; this copy is what gets dumped
    # as *_model.json (presumably because the pipes may modify the list they
    # are given -- confirm against the pipe implementations).
    orig_model_list = copy.deepcopy(model_list)

    local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
    logger.info(f"local output dir is {local_md_dir}")
    image_writer = DiskReaderWriter(local_image_dir)
    md_writer = DiskReaderWriter(local_md_dir)
    # Only the directory name is handed to the markdown/content builders.
    image_dir = str(os.path.basename(local_image_dir))

    def dump_text(content, suffix):
        """Write ``content`` as text to ``<pdf_file_name><suffix>``."""
        md_writer.write(
            content=content,
            path=f"{pdf_file_name}{suffix}",
            mode=AbsReaderWriter.MODE_TXT,
        )

    def to_json(obj):
        """Serialize ``obj`` the same way for every JSON artifact."""
        return json_parse.dumps(obj, ensure_ascii=False, indent=4)

    if parse_method == "auto":
        jso_useful_key = {"_pdf_type": "", "model_list": model_list}
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
    elif parse_method == "txt":
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    elif parse_method == "ocr":
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    else:
        logger.error("unknown parse method")
        # sys.exit, not the `exit` helper that the site module injects for
        # interactive sessions and that is not guaranteed to exist.
        sys.exit(1)

    pipe.pipe_classify()

    # Without usable model data, fall back to the built-in model. The falsy
    # check also covers ``None`` instead of raising TypeError from len().
    if not model_list:
        if model_config.__use_inside_model__:
            pipe.pipe_analyze()
            orig_model_list = copy.deepcopy(pipe.model_list)
        else:
            logger.error("need model list input")
            sys.exit(1)

    pipe.pipe_parse()

    pdf_info = pipe.pdf_mid_data["pdf_info"]
    if f_draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
    if f_draw_span_bbox:
        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)

    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)

    if f_dump_md:
        # Markdown
        dump_text(md_content, ".md")

    if f_dump_middle_json:
        # Intermediate parse result
        dump_text(to_json(pipe.pdf_mid_data), "_middle.json")

    if f_dump_model_json:
        # Model data (as supplied, or as produced by the built-in model)
        dump_text(to_json(orig_model_list), "_model.json")

    if f_dump_orig_pdf:
        # Original PDF, written in binary mode
        md_writer.write(
            content=pdf_bytes,
            path=f"{pdf_file_name}_origin.pdf",
            mode=AbsReaderWriter.MODE_BIN,
        )

    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
    if f_dump_content_list:
        # Unified-format content list
        dump_text(to_json(content_list), "_content_list.json")
许瑞's avatar
许瑞 committed
162
163


kernel.h@qq.com's avatar
kernel.h@qq.com committed
164
# Root command group; the sub-commands below attach themselves to it through
# ``@cli.command()``. It deliberately has no docstring, because click would
# display one as the group's help text.
@click.group()
@click.version_option(__version__, "--version", "-v", help="显示版本信息")
@click.help_option("--help", "-h", help="显示帮助信息")
def cli():
    pass

许瑞's avatar
许瑞 committed
170

kernel.h@qq.com's avatar
kernel.h@qq.com committed
171
def _read_s3_path(s3path):
    """Read the S3 object addressed by ``s3path`` and return its raw content.

    ``s3path`` may carry range parameters (e.g. ``?bytes=0,81350``); when
    exactly two are present they are used as the start offset and the second
    value is added to it to obtain the inclusive end offset. Otherwise the
    object is read from offset 0 with no end offset.
    """
    bucket, _ = parse_s3path(s3path)
    s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
    official_path = remove_non_official_s3_args(s3path)
    s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", official_path)

    may_range_params = parse_s3_range_params(s3path)
    if may_range_params is None or len(may_range_params) != 2:
        byte_start, byte_end = 0, None
    else:
        byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
        # The second range parameter is treated as a length: turn it into
        # the inclusive end offset.
        byte_end += byte_start - 1

    return s3_rw.read_jsonl(
        official_path,
        byte_start,
        byte_end,
        AbsReaderWriter.MODE_BIN,
    )


def _parse_jsonl_record(jso, method):
    """Fetch the PDF referenced by one JSONL record and run ``do_parse`` on it.

    The PDF location is taken from ``file_location`` and, failing that, from
    ``path``; the model data is taken from ``doc_layout_result``.
    """
    s3_file_path = jso.get("file_location")
    if s3_file_path is None:
        s3_file_path = jso.get("path")
    if s3_file_path is None:
        # Previously this surfaced as an opaque TypeError from Path(None).
        logger.error('json record has neither "file_location" nor "path"')
        sys.exit(1)

    pdf_file_name = Path(s3_file_path).stem
    pdf_data = _read_s3_path(s3_file_path)

    do_parse(
        pdf_file_name,
        pdf_data,
        jso["doc_layout_result"],
        method,
    )


# Parse the PDF described by a single JSONL record stored on S3.
# (Kept as a comment: click would show a docstring as the command's help.)
@cli.command()
@click.option("--json", type=str, help="输入一个S3路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
def json_command(json, method, inside_model):
    model_config.__use_inside_model__ = inside_model

    # ``json`` is None when the option is omitted; treat that as a usage
    # error instead of failing on None.startswith.
    if not json or not json.startswith("s3://"):
        logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path")
        sys.exit(1)

    jso = json_parse.loads(_read_s3_path(json).decode("utf-8"))
    _parse_jsonl_record(jso, method)


# Parse every PDF described by the records of a local JSONL file; the PDFs
# themselves are still fetched from S3.
# (Kept as a comment: click would show a docstring as the command's help.)
@cli.command()
@click.option("--local_json", type=str, help="输入一个本地jsonl路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
def local_json_command(local_json, method, inside_model):
    model_config.__use_inside_model__ = inside_model

    # ``local_json`` is None when the option is omitted; open(None) would
    # otherwise fail with an unhelpful TypeError.
    if not local_json:
        logger.error(
            "usage: magic-pdf local-json-command --local_json /some_dir/some_file.jsonl"
        )
        sys.exit(1)

    with open(local_json, "r", encoding="utf-8") as f:
        for json_line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not json_line.strip():
                continue
            _parse_jsonl_record(json_parse.loads(json_line), method)
赵小蒙's avatar
赵小蒙 committed
269
270


kernel.h@qq.com's avatar
kernel.h@qq.com committed
271
# Parse a local PDF, optionally with an explicit model JSON file.
# (Kept as a comment: click would show a docstring as the command's help.)
@cli.command()
@click.option(
    "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
)
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
def pdf_command(pdf, model, method, inside_model):
    model_config.__use_inside_model__ = inside_model

    def read_fn(path):
        """Return the binary content of the local file at ``path``."""
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    def get_model_json(model_path):
        """Return the model JSON text for the PDF.

        With an explicit ``model_path`` that file is read. Otherwise the
        model file is looked up next to the PDF (same name, ``.json``
        extension); if it does not exist, ``"[]"`` is returned.
        """
        if model_path is None:
            file_name_without_extension, extension = os.path.splitext(pdf)
            # Case-insensitive so that e.g. "X.PDF" is accepted as well.
            if extension.lower() != ".pdf":
                raise Exception("pdf_path input error")
            model_path = file_name_without_extension + ".json"
            if not os.path.exists(model_path):
                logger.warning(
                    f"not found json {model_path} existed"
                )
                # No local model data: hand over an empty list; do_parse
                # detects it and falls back to the built-in model analysis.
                return "[]"
        return read_fn(model_path).decode("utf-8")

    pdf_data = read_fn(pdf)
    jso = json_parse.loads(get_model_json(model))
    pdf_file_name = Path(pdf).stem

    do_parse(
        pdf_file_name,
        pdf_data,
        jso,
        method,
    )
许瑞's avatar
许瑞 committed
322

kernel.h@qq.com's avatar
kernel.h@qq.com committed
323

许瑞's avatar
许瑞 committed
324
325
if __name__ == "__main__":
    # Example invocation:
    #   python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
    cli()