cli_dev.py 4.03 KB
Newer Older
icecraft's avatar
icecraft committed
1
import json as json_parse
2
import os
icecraft's avatar
icecraft committed
3
from pathlib import Path
4
5
6

import click

icecraft's avatar
icecraft committed
7
import magic_pdf.model as model_config
8
9
10
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
                                       remove_non_official_s3_args)
11
from magic_pdf.libs.version import __version__
12
13
14
15
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
from magic_pdf.tools.common import do_parse, parse_pdf_methods
icecraft's avatar
icecraft committed
16
17
18
19
20
21


def read_s3_path(s3path):
    """Read the object referenced by ``s3path`` through ``S3ReaderWriter``.

    The bucket parsed from ``s3path`` is used to look up the access key,
    secret key and endpoint via ``get_s3_config``. If
    ``parse_s3_range_params`` yields exactly two values they are converted
    to ``int`` and passed to ``read_offset`` as the start and end offsets;
    in every other case the offsets passed are ``0`` and ``None``.

    Args:
        s3path: S3 path string, optionally carrying range parameters.

    Returns:
        Whatever ``S3ReaderWriter.read_offset`` returns for the path with
        its non-official arguments removed.
    """
    bucket, _ = parse_s3path(s3path)
    access_key, secret_key, endpoint = get_s3_config(bucket)
    reader = S3ReaderWriter(
        access_key,
        secret_key,
        endpoint,
        'auto',
        remove_non_official_s3_args(s3path),
    )

    range_params = parse_s3_range_params(s3path)
    if range_params is not None and len(range_params) == 2:
        start, end = int(range_params[0]), int(range_params[1])
    else:
        # No usable range: start at offset 0 and leave the end unset.
        start, end = 0, None

    return reader.read_offset(remove_non_official_s3_args(s3path), start, end)


# Root click group; the ``jsonl`` and ``pdf`` sub-commands attach to it.
# A ``#`` comment is used rather than a docstring because click would show a
# docstring as the group's help text, which would change the CLI output.
@click.group()
@click.version_option(
    __version__,
    '--version',
    '-v',
    help='显示版本信息',
)
def cli():
    pass


@cli.command()
@click.option(
    '-j',
    '--jsonl',
    'jsonl',
    type=str,
    help='输入 jsonl 路径,本地或者 s3 上的文件',
    required=True,
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
    default='auto',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='输出到本地目录',
)
def jsonl(jsonl, method, output_dir):
    # Parse the PDF described by one JSON record and write results locally.
    #
    # jsonl:      path of the record source; an ``s3://`` path is fetched via
    #             ``read_s3_path`` and parsed whole, a local file contributes
    #             only its first line.
    # method:     parse method value produced by ``parse_pdf_methods``.
    # output_dir: local directory for the results; created if missing.
    #
    # The record must carry the PDF location under ``file_location`` (or, as
    # a fallback, ``path``) and the model output under ``doc_layout_result``.
    #
    # NOTE: comments are used instead of a docstring on purpose -- click
    # would expose a docstring as the command's help text.
    model_config.__use_inside_model__ = False

    if jsonl.startswith('s3://'):
        jso = json_parse.loads(read_s3_path(jsonl).decode('utf-8'))
    else:
        # Decode explicitly as UTF-8 (matching the S3 branch) instead of
        # relying on the platform's locale-dependent default encoding.
        with open(jsonl, encoding='utf-8') as f:
            jso = json_parse.loads(f.readline())

    os.makedirs(output_dir, exist_ok=True)

    s3_file_path = jso.get('file_location')
    if s3_file_path is None:
        s3_file_path = jso.get('path')
    if s3_file_path is None:
        # Without this check Path(None) below fails with an opaque TypeError.
        raise click.ClickException(
            "input record has neither a 'file_location' nor a 'path' field")

    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)

    print(pdf_file_name, jso, method)
    do_parse(
        output_dir,
        pdf_file_name,
        pdf_data,
        jso['doc_layout_result'],
        method,
        False,
        f_dump_content_list=True,
        f_draw_model_bbox=True,
    )


@cli.command()
@click.option(
    '-p',
    '--pdf',
    'pdf',
    type=click.Path(exists=True),
    required=True,
    help='本地 PDF 文件',
)
@click.option(
    '-j',
    '--json',
    'json_data',
    type=click.Path(exists=True),
    required=True,
    help='本地模型推理出的 json 数据',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='本地输出目录',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
    default='auto',
)
def pdf(pdf, json_data, output_dir, method):
    # Parse a local PDF using a local JSON file of model output.
    #
    # pdf:        existing local PDF path (resolved with os.path.realpath).
    # json_data:  existing local JSON file; read as bytes, decoded as UTF-8
    #             and handed to ``do_parse`` as the model list.
    # output_dir: local directory for the results; created if missing.
    # method:     parse method value produced by ``parse_pdf_methods``.
    #
    # NOTE: comments are used instead of a docstring on purpose -- click
    # would expose a docstring as the command's help text.
    model_config.__use_inside_model__ = False
    resolved_pdf = os.path.realpath(pdf)
    os.makedirs(output_dir, exist_ok=True)

    def load_bytes(file_path):
        """Return the binary content of ``file_path`` via DiskReaderWriter."""
        parent_dir, base_name = os.path.split(file_path)
        reader = DiskReaderWriter(parent_dir)
        return reader.read(base_name, AbsReaderWriter.MODE_BIN)

    raw_model_json = load_bytes(json_data)
    model_output = json_parse.loads(raw_model_json.decode('utf-8'))

    # The output name is the PDF file name without its extension.
    stem = Path(resolved_pdf).stem
    pdf_bytes = load_bytes(resolved_pdf)

    do_parse(
        output_dir,
        stem,
        pdf_bytes,
        model_output,
        method,
        False,
        f_dump_content_list=True,
        f_draw_model_bbox=True,
    )


151
# Script entry point: dispatch to the click group when executed directly.
if __name__ == '__main__':
    cli()