magicpdf.py 4.58 KB
Newer Older
kernel.h@qq.com's avatar
kernel.h@qq.com committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
"""
这里实现2个click命令:
第一个:
 接收一个完整的s3路径,例如:s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
    1)根据~/magic-pdf.json里的ak,sk等,构造s3cliReader读取到这个jsonl的对应行,返回json对象。
    2)根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint,构造出s3cliReader用来读取pdf
    3)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter,用来保存截图
    4)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
    
    最后把以上步骤准备好的对象传入真正的解析API
    
第二个:
  接收1)pdf的本地路径。2)模型json文件(可选)。然后:
    1)根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置,构造出LocalImageWriter,用来保存截图
    2)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
    3)根据约定,根据pdf本地路径,推导出pdf模型的json,并读入
    

效果:
python magicpdf.py json-command --json  s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py pdf-command --pdf  /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json  或者 python magicpdf.py pdf-command --pdf  /home/llm/Downloads/xxxx.pdf
"""

许瑞's avatar
许瑞 committed
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import click
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (
    parse_s3path,
    parse_s3_range_params,
    remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import get_local_dir
from magic_pdf.io.S3ReaderWriter import S3ReaderWriter, MODE_BIN
from magic_pdf.io.DiskReaderWriter import DiskReaderWriter
from magic_pdf.spark.spark_api import parse_union_pdf
import os
import json as json_parse
from datetime import datetime
kernel.h@qq.com's avatar
kernel.h@qq.com committed
38
39


许瑞's avatar
许瑞 committed
40
41
42
43
44
45
46
47
48
49
def prepare_env():
    """Create a timestamped working directory tree for a single run.

    The tree is rooted at ``<get_local_dir()>/magic-pdf/<YYYY-mm-dd-HH-MM-SS>``
    and contains an ``images`` and an ``md`` sub-directory. Both are created
    if missing; already-existing directories are not an error.

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple with the two paths.
    """
    base_dir = get_local_dir()
    run_stamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    run_root = os.path.join(base_dir, "magic-pdf", run_stamp)

    image_dir, md_dir = (
        os.path.join(run_root, leaf) for leaf in ("images", "md")
    )
    for directory in (image_dir, md_dir):
        os.makedirs(directory, exist_ok=True)
    return image_dir, md_dir
kernel.h@qq.com's avatar
kernel.h@qq.com committed
50
51
52
53
54
55


@click.group()
def cli():
    # Root click command group; it performs no work itself and only serves as
    # the attachment point for commands registered with ``@cli.command()``.
    # Documented with a comment rather than a docstring because click would
    # display a docstring as the group's help text.
    pass

许瑞's avatar
许瑞 committed
56

kernel.h@qq.com's avatar
kernel.h@qq.com committed
57
@cli.command()
@click.option("--json", type=str, help="输入一个S3路径")
def json_command(json):
    # Parse the PDF referenced by a JSONL record stored on S3.
    #
    # `json` is an ``s3://`` path, optionally carrying a ``?bytes=a,b`` range
    # that selects the record inside the JSONL object. The record is expected
    # to provide "file_location" (S3 path of the PDF) and "doc_layout_result"
    # (the model output handed to the parser).
    #
    # Exits with status 1 when the option is missing or is not an s3:// path.
    #
    # Documented with comments instead of a docstring so that the command's
    # click-generated --help output stays unchanged.
    if json is None or not json.startswith("s3://"):
        print(
            "usage: python magicpdf.py json-command --json s3://some_bucket/some_path"
        )
        # os has no exit(); raising SystemExit terminates with status 1.
        raise SystemExit(1)

    def read_s3_path(s3path):
        # Read the object at `s3path` as bytes, honoring the byte range that
        # `s3path` itself carries (if any).
        bucket, _key = parse_s3path(s3path)

        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        official_path = remove_non_official_s3_args(s3path)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", official_path)

        # The range must come from the path being read, not from the outer
        # --json argument; otherwise the PDF read would be truncated to the
        # JSONL record's byte range.
        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or len(may_range_params) != 2:
            byte_start, byte_end = 0, None
        else:
            # NOTE(review): the second value is passed through unchanged as
            # byte_end; confirm whether the URL convention is (start, end)
            # or (start, length).
            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
        return s3_rw.read_jsonl(official_path, byte_start, byte_end, MODE_BIN)

    jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
    pdf_data = read_s3_path(jso["file_location"])
    local_image_dir, _ = prepare_env()

    local_image_rw = DiskReaderWriter(local_image_dir)
    parse_union_pdf(pdf_data, jso["doc_layout_result"], local_image_rw, is_debug=True)

kernel.h@qq.com's avatar
kernel.h@qq.com committed
87
88

@cli.command()
@click.option(
    "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
)
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
def pdf_command(pdf, model):
    # Parse a local PDF using a local model-output JSON file.
    #
    # `pdf` is the path of an existing PDF file. `model` is the path of the
    # model JSON; when omitted it defaults to the PDF path with its extension
    # replaced by ".json". The JSON is expected to provide
    # "doc_layout_result", which is handed to the parser.
    #
    # Exits with status 1 when no --model is given and the derived JSON file
    # does not exist.
    #
    # Documented with comments instead of a docstring so that the command's
    # click-generated --help output stays unchanged.
    if model is None:
        # Swap only the final extension; str.replace would also rewrite any
        # ".pdf" occurring earlier in the path.
        model = os.path.splitext(pdf)[0] + ".json"
        if not os.path.exists(model):
            pdf_dir = os.path.dirname(os.path.abspath(pdf))
            print(f"make sure json file existed and place under {pdf_dir}")
            raise SystemExit(1)

    def read_fn(path):
        # Read a local file as bytes through the project's disk reader.
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), MODE_BIN)

    pdf_data = read_fn(pdf)
    jso = json_parse.loads(read_fn(model).decode("utf-8"))

    local_image_dir, _ = prepare_env()
    local_image_rw = DiskReaderWriter(local_image_dir)
    parse_union_pdf(pdf_data, jso["doc_layout_result"], local_image_rw, is_debug=True)

kernel.h@qq.com's avatar
kernel.h@qq.com committed
112

许瑞's avatar
许瑞 committed
113
114
115
116
if __name__ == "__main__":
    # Example invocation:
    #   python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/format/v070/part-66028dd46437-000076.jsonl?bytes=0,308393
    cli()