magicpdf.py 10.6 KB
Newer Older
kernel.h@qq.com's avatar
kernel.h@qq.com committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
"""
这里实现2个click命令:
第一个:
 接收一个完整的s3路径,例如:s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
    1)根据~/magic-pdf.json里的ak,sk等,构造s3cliReader读取到这个jsonl的对应行,返回json对象。
    2)根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint,构造出s3cliReader用来读取pdf
    3)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter,用来保存截图
    4)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
    
    最后把以上步骤准备好的对象传入真正的解析API
    
第二个:
  接收1)pdf的本地路径。2)模型json文件(可选)。然后:
    1)根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置,构造出LocalImageWriter,用来保存截图
    2)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
    3)根据约定,根据pdf本地路径,推导出pdf模型的json,并读入
    

效果:
python magicpdf.py json-command --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json  或者 python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf
"""

许瑞's avatar
许瑞 committed
24
25
import os
import json as json_parse
26
import sys
许瑞's avatar
许瑞 committed
27
import click
28
from loguru import logger
kernel.h@qq.com's avatar
kernel.h@qq.com committed
29
from pathlib import Path
赵小蒙's avatar
赵小蒙 committed
30
from magic_pdf.libs.version import __version__
31

32
from magic_pdf.libs.MakeContentConfig import DropMode
赵小蒙's avatar
赵小蒙 committed
33
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
34
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
许瑞's avatar
许瑞 committed
35
from magic_pdf.pipe.UNIPipe import UNIPipe
许瑞's avatar
许瑞 committed
36
37
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
许瑞's avatar
许瑞 committed
38
39
40
41
42
43
44
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (
    parse_s3path,
    parse_s3_range_params,
    remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import get_local_dir
许瑞's avatar
许瑞 committed
45
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
kernel.h@qq.com's avatar
kernel.h@qq.com committed
46
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
许瑞's avatar
许瑞 committed
47
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
48
import csv
kernel.h@qq.com's avatar
kernel.h@qq.com committed
49

50
51
52
# Allowed values for the ``--method`` option; shared by every sub-command
# in this file that declares that option.
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])


赵小蒙's avatar
赵小蒙 committed
53
def prepare_env(pdf_file_name, method):
    """Create and return the local output directories for one parsing run.

    The output root is ``<get_local_dir()>/magic-pdf/<pdf_file_name>/<method>``.
    Markdown and the other text artifacts are written to the root itself,
    images to its ``images`` sub-directory. Both directories are created if
    they do not exist yet.

    Args:
        pdf_file_name: Name used for the per-document directory (callers in
            this file pass the PDF file stem).
        method: Parse method name, used as the last path component.

    Returns:
        A ``(local_image_dir, local_md_dir)`` tuple of directory paths.
    """
    output_root = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
    images_root = os.path.join(str(output_root), "images")

    for directory in (images_root, output_root):
        os.makedirs(directory, exist_ok=True)

    return images_root, output_root
kernel.h@qq.com's avatar
kernel.h@qq.com committed
63
64


65
66
67
68
69
70
71
72
73
def write_to_csv(csv_file_path, csv_data):
    """Append one row to a CSV file and report success on stdout.

    Args:
        csv_file_path: Path of the CSV file; it is opened in append mode, so
            it is created when missing and existing rows are kept.
        csv_data: Iterable of cell values forming the row to append.
    """
    # newline='' lets the csv module control line endings itself.
    with open(csv_file_path, mode='a', newline='', encoding='utf-8') as out:
        csv.writer(out).writerow(csv_data)
    print(f"数据已成功追加到 '{csv_file_path}'")


赵小蒙's avatar
赵小蒙 committed
74
def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir, local_md_dir):
    """Run the selected pipeline on one PDF and write its outputs.

    Steps, in order: build the pipe for ``parse_method``, classify, run the
    built-in analysis when ``model_list`` is empty, parse, draw the layout and
    span bounding boxes, then write ``<pdf_file_name>.md``,
    ``<pdf_file_name>.json`` and ``<pdf_file_name>.txt`` through ``md_writer``.

    Args:
        pdf_file_name: Base name (no extension) used for the output files.
        pdf_bytes: Raw content of the PDF.
        model_list: Model output for the PDF; an empty sequence triggers
            ``pipe.pipe_analyze()``.
        parse_method: One of ``"auto"``, ``"txt"`` or ``"ocr"``.
        image_writer: Writer handed to the pipe constructor.
        md_writer: Writer used for the markdown / JSON / text outputs.
        image_dir: Image directory handed to the markdown and uni-format
            builders (callers pass the basename of the image directory).
        local_md_dir: Directory handed to the bounding-box drawing helpers.

    Note:
        An unknown ``parse_method`` prints a message and terminates the
        process with exit status 1.
    """
    # Pick the pipeline implementation matching the requested method.
    if parse_method == "txt":
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    elif parse_method == "ocr":
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    elif parse_method == "auto":
        pipe = UNIPipe(
            pdf_bytes,
            {"_pdf_type": "", "model_list": model_list},
            image_writer,
            is_debug=True,
        )
    else:
        print("unknow parse method")
        sys.exit(1)

    pipe.pipe_classify()

    # No usable model data was supplied: fall back to the built-in paddle
    # analysis.
    if len(model_list) == 0:
        pipe.pipe_analyze()

    pipe.pipe_parse()

    # Both drawing helpers receive the same page info, PDF bytes and target dir.
    pages = pipe.pdf_mid_data['pdf_info']
    for draw in (draw_layout_bbox, draw_span_bbox):
        draw(pages, pdf_bytes, local_md_dir)

    markdown = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
    md_writer.write(
        content=markdown,
        path=f"{pdf_file_name}.md",
        mode=AbsReaderWriter.MODE_TXT,
    )

    mid_json = json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
    md_writer.write(
        content=mid_json,
        path=f"{pdf_file_name}.json",
        mode=AbsReaderWriter.MODE_TXT,
    )

    # The unified content list is stored as its Python string representation.
    uni_format = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
    md_writer.write(str(uni_format), f"{pdf_file_name}.txt", AbsReaderWriter.MODE_TXT)
许瑞's avatar
许瑞 committed
117
118


kernel.h@qq.com's avatar
kernel.h@qq.com committed
119
@click.group()
@click.version_option(
    __version__,
    "--version",
    "-v",
    help="显示版本信息",
)
@click.help_option(
    "--help",
    "-h",
    help="显示帮助信息",
)
def cli():
    # Root click group; the sub-commands below register themselves on it via
    # ``@cli.command()``. Documented with a comment rather than a docstring
    # because click would display a docstring as the group's help text.
    pass

许瑞's avatar
许瑞 committed
125

kernel.h@qq.com's avatar
kernel.h@qq.com committed
126
@cli.command()
@click.option("--json", type=str, help="输入一个S3路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def json_command(json, method):
    # Parse the PDF described by one JSONL record stored on S3.
    #
    # json:   S3 path of the record, optionally followed by a
    #         ``?bytes=<offset>,<length>`` range.
    # method: one of the ``parse_pdf_methods`` choices.
    #
    # Documented with comments rather than a docstring because click would
    # display a docstring as the command's help text.

    # ``--json`` is optional for click, so it can be None here; treat that the
    # same as a non-S3 path instead of crashing on ``None.startswith``.
    if not json or not json.startswith("s3://"):
        print("usage: python magicpdf.py json-command --json s3://some_bucket/some_path")
        sys.exit(1)

    def read_s3_path(s3path):
        """Read the object (or the requested byte range of it) at ``s3path``.

        Credentials and endpoint are looked up by bucket via ``get_s3_config``.
        Returns whatever ``S3ReaderWriter.read_jsonl`` returns in binary mode.
        """
        bucket, _ = parse_s3path(s3path)
        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)

        # Path without the non-official query arguments (e.g. the byte range).
        clean_path = remove_non_official_s3_args(s3path)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", clean_path)

        range_params = parse_s3_range_params(s3path)
        if range_params is None or len(range_params) != 2:
            # No usable range: read from the start with no end bound.
            byte_start, byte_end = 0, None
        else:
            # The two values are (offset, length); convert them to the end
            # position that read_jsonl expects.
            byte_start, length = int(range_params[0]), int(range_params[1])
            byte_end = byte_start + length - 1

        return s3_rw.read_jsonl(
            clean_path,
            byte_start,
            byte_end,
            AbsReaderWriter.MODE_BIN,
        )

    jso = json_parse.loads(read_s3_path(json).decode("utf-8"))

    # The PDF location is stored under "file_location", with "path" as fallback.
    s3_file_path = jso.get("file_location")
    if s3_file_path is None:
        s3_file_path = jso.get("path")
    if s3_file_path is None:
        logger.error(f"record {json} has neither 'file_location' nor 'path'")
        sys.exit(1)

    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)
    local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)

    local_image_rw = DiskReaderWriter(local_image_dir)
    local_md_rw = DiskReaderWriter(local_md_dir)

    _do_parse(
        pdf_file_name,
        pdf_data,
        jso["doc_layout_result"],
        method,
        local_image_rw,
        local_md_rw,
        os.path.basename(local_image_dir),
        local_md_dir,
    )
许瑞's avatar
许瑞 committed
181

kernel.h@qq.com's avatar
kernel.h@qq.com committed
182

赵小蒙's avatar
赵小蒙 committed
183
184
185
186
187
188
189
190
191
192
193
@cli.command()
@click.option("--local_json", type=str, help="输入一个本地jsonl路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def local_json_command(local_json, method):
    # Parse every PDF referenced by a local JSONL file (one record per line).
    #
    # local_json: path of the local JSONL file.
    # method:     one of the ``parse_pdf_methods`` choices.
    #
    # Documented with comments rather than a docstring because click would
    # display a docstring as the command's help text.

    # ``--local_json`` is optional for click, so it can be None here; fail
    # with a usage hint instead of a TypeError from ``open(None)``.
    if not local_json:
        print("usage: python magicpdf.py local-json-command --local_json /path/to/file.jsonl")
        sys.exit(1)

    def read_s3_path(s3path):
        """Read the object (or the requested byte range of it) at ``s3path``.

        Credentials and endpoint are looked up by bucket via ``get_s3_config``.
        Returns whatever ``S3ReaderWriter.read_jsonl`` returns in binary mode.
        """
        bucket, _ = parse_s3path(s3path)
        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)

        # Path without the non-official query arguments (e.g. the byte range).
        clean_path = remove_non_official_s3_args(s3path)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", clean_path)

        range_params = parse_s3_range_params(s3path)
        if range_params is None or len(range_params) != 2:
            # No usable range: read from the start with no end bound.
            byte_start, byte_end = 0, None
        else:
            # The two values are (offset, length); convert them to the end
            # position that read_jsonl expects.
            byte_start, length = int(range_params[0]), int(range_params[1])
            byte_end = byte_start + length - 1

        return s3_rw.read_jsonl(
            clean_path,
            byte_start,
            byte_end,
            AbsReaderWriter.MODE_BIN,
        )

    with open(local_json, "r", encoding="utf-8") as f:
        for line_no, json_line in enumerate(f, start=1):
            # Blank lines are not records; json.loads would fail on them.
            if not json_line.strip():
                continue
            jso = json_parse.loads(json_line)

            # The PDF location is stored under "file_location", with "path"
            # as fallback.
            s3_file_path = jso.get("file_location")
            if s3_file_path is None:
                s3_file_path = jso.get("path")
            if s3_file_path is None:
                # Nothing to download for this record; report it and move on
                # to the remaining records.
                logger.error(
                    f"line {line_no} of {local_json} has neither "
                    f"'file_location' nor 'path'; skipped"
                )
                continue

            pdf_file_name = Path(s3_file_path).stem
            pdf_data = read_s3_path(s3_file_path)
            local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)

            local_image_rw = DiskReaderWriter(local_image_dir)
            local_md_rw = DiskReaderWriter(local_md_dir)

            _do_parse(
                pdf_file_name,
                pdf_data,
                jso["doc_layout_result"],
                method,
                local_image_rw,
                local_md_rw,
                os.path.basename(local_image_dir),
                local_md_dir,
            )
赵小蒙's avatar
赵小蒙 committed
237
238


kernel.h@qq.com's avatar
kernel.h@qq.com committed
239
@cli.command()
@click.option(
    "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
)
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def pdf_command(pdf, model, method):
    # Parse a local PDF, optionally using a local model JSON file.
    #
    # pdf:    path of the PDF file (must exist).
    # model:  optional path of the model JSON; when omitted, a ``.json`` file
    #         next to the PDF with the same base name is used if present.
    # method: one of the ``parse_pdf_methods`` choices.
    #
    # Documented with comments rather than a docstring because click would
    # display a docstring as the command's help text.

    def read_fn(path):
        """Return the content of the local file at ``path`` as bytes."""
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    pdf_data = read_fn(pdf)

    def get_model_json(model_path):
        """Return the model JSON text, or ``"[]"`` when none is available."""
        if model_path is None:
            # Swap only the final extension. ``str.replace(".pdf", ".json")``
            # would rewrite every ".pdf" occurrence in the path and leave
            # "X.PDF" or extension-less names unchanged, so the PDF itself
            # would then be read as the model JSON.
            model_path = os.path.splitext(pdf)[0] + ".json"
            if not os.path.exists(model_path):
                logger.warning(f"not found json {model_path} existed, use paddle analyze")
                # No local model data: pass an empty list so the built-in
                # paddle analysis is triggered downstream.
                return "[]"
        return read_fn(model_path).decode("utf-8")

    jso = json_parse.loads(get_model_json(model))
    pdf_file_name = Path(pdf).stem
    local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)

    local_image_rw = DiskReaderWriter(local_image_dir)
    local_md_rw = DiskReaderWriter(local_md_dir)

    _do_parse(
        pdf_file_name,
        pdf_data,
        jso,
        method,
        local_image_rw,
        local_md_rw,
        os.path.basename(local_image_dir),
        local_md_dir,
    )
许瑞's avatar
许瑞 committed
288

kernel.h@qq.com's avatar
kernel.h@qq.com committed
289

许瑞's avatar
许瑞 committed
290
291
if __name__ == "__main__":
    # Example invocation:
    #   python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
    cli()