cli_dev.py 3.86 KB
Newer Older
icecraft's avatar
icecraft committed
1
import json as json_parse
2
import os
icecraft's avatar
icecraft committed
3
from pathlib import Path
4
5
6

import click

icecraft's avatar
icecraft committed
7
import magic_pdf.model as model_config
8
from magic_pdf.data.data_reader_writer import FileBasedDataReader, S3DataReader
9
10
11
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
                                       remove_non_official_s3_args)
12
from magic_pdf.libs.version import __version__
13
from magic_pdf.tools.common import do_parse, parse_pdf_methods
icecraft's avatar
icecraft committed
14
15
16
17
18
19


def read_s3_path(s3path):
    """Read the bytes referenced by an S3 path.

    The bucket is parsed out of ``s3path`` and used to look up the S3
    configuration via ``get_s3_config``. When ``parse_s3_range_params``
    yields exactly two values they are converted to ``int`` and used as
    the byte range; otherwise ``0`` and ``-1`` are passed to ``read_at``
    (presumably meaning "the whole object" -- confirm against
    ``S3DataReader.read_at``).

    Args:
        s3path: S3 path string, optionally carrying range parameters.

    Returns:
        Whatever ``S3DataReader.read_at`` returns for the cleaned path.
    """
    bucket, _key = parse_s3path(s3path)
    access_key, secret_key, endpoint = get_s3_config(bucket)
    reader = S3DataReader('', bucket, access_key, secret_key, endpoint, 'auto')

    range_params = parse_s3_range_params(s3path)
    has_range = range_params is not None and len(range_params) == 2
    if has_range:
        range_start = int(range_params[0])
        range_end = int(range_params[1])
    else:
        range_start, range_end = 0, -1

    # The range arguments are stripped from the path before it is read.
    return reader.read_at(
        remove_non_official_s3_args(s3path),
        range_start,
        range_end,
    )


# Root click command group; the sub-commands in this module attach to it
# through ``@cli.command()``.
# NOTE: documented with comments rather than a docstring on purpose, because
# click would display a docstring as the group's help text.
@click.group()
@click.version_option(
    __version__,
    '--version',
    '-v',
    help='显示版本信息',
)
def cli():
    pass


# ``jsonl`` sub-command: parse the PDF described by a JSON record.
#
# The record is taken either from an ``s3://`` location (whole payload read
# via ``read_s3_path`` and decoded as UTF-8) or from the first line of a local
# file. The PDF location comes from the record's ``file_location`` key, with
# ``path`` as a fallback, and is always fetched through ``read_s3_path``. The
# record's ``doc_layout_result`` is handed to ``do_parse``.
#
# NOTE: documented with comments rather than a docstring on purpose, because
# click would display a docstring as the command's help text.
@cli.command()
@click.option(
    '-j',
    '--jsonl',
    'jsonl',
    type=str,
    help='输入 jsonl 路径,本地或者 s3 上的文件',
    required=True,
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
    default='auto',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='输出到本地目录',
)
def jsonl(jsonl, method, output_dir):
    # The layout result is supplied by the input record (presumably this flag
    # turns off the package's built-in model -- confirm in magic_pdf.model).
    model_config.__use_inside_model__ = False

    if jsonl.startswith('s3://'):
        jso = json_parse.loads(read_s3_path(jsonl).decode('utf-8'))
    else:
        # Decode explicitly as UTF-8 so the local branch matches the S3 branch
        # instead of depending on the platform's default encoding.
        with open(jsonl, encoding='utf-8') as f:
            jso = json_parse.loads(f.readline())

    os.makedirs(output_dir, exist_ok=True)

    s3_file_path = jso.get('file_location')
    if s3_file_path is None:
        s3_file_path = jso.get('path')
    if s3_file_path is None:
        # Without this guard the failure surfaces as an opaque TypeError
        # raised from Path(None) below.
        raise click.ClickException(
            "input record has neither 'file_location' nor 'path'; "
            'cannot locate the PDF')

    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)

    print(pdf_file_name, jso, method)
    do_parse(
        output_dir,
        pdf_file_name,
        pdf_data,
        jso['doc_layout_result'],
        method,
        False,
        f_dump_content_list=True,
        f_draw_model_bbox=True,
    )


# ``pdf`` sub-command: parse a local PDF using a local JSON file of model
# output. Both files are read through ``FileBasedDataReader`` and the results
# are written under ``output_dir`` by ``do_parse``.
#
# NOTE: documented with comments rather than a docstring on purpose, because
# click would display a docstring as the command's help text.
@cli.command()
@click.option(
    '-p',
    '--pdf',
    'pdf',
    type=click.Path(exists=True),
    required=True,
    help='本地 PDF 文件',
)
@click.option(
    '-j',
    '--json',
    'json_data',
    type=click.Path(exists=True),
    required=True,
    help='本地模型推理出的 json 数据',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='本地输出目录',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
    default='auto',
)
def pdf(pdf, json_data, output_dir, method):
    # The model output comes from ``json_data`` (presumably this flag turns
    # off the package's built-in model -- confirm in magic_pdf.model).
    model_config.__use_inside_model__ = False
    resolved_pdf = os.path.realpath(pdf)
    os.makedirs(output_dir, exist_ok=True)

    def load_bytes(file_path):
        """Read ``file_path`` via a reader rooted at its parent directory."""
        parent_dir, base_name = os.path.split(file_path)
        return FileBasedDataReader(parent_dir).read(base_name)

    raw_model_json = load_bytes(json_data)
    model_json_list = json_parse.loads(raw_model_json.decode('utf-8'))

    # The PDF's base name without extension names the parse job.
    file_name = Path(resolved_pdf).stem
    pdf_data = load_bytes(resolved_pdf)

    do_parse(
        output_dir,
        file_name,
        pdf_data,
        model_json_list,
        method,
        False,
        f_dump_content_list=True,
        f_draw_model_bbox=True,
    )


148
# Script entry point: run the click command group when executed directly.
if __name__ == '__main__':
    cli()