pipeline.py 24.1 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
4
5
# coding=utf8
import sys
import time
from urllib.parse import quote

from loguru import logger

from magic_pdf.dict2md.ocr_mkcontent import (
    ocr_mk_nlp_markdown,
    ocr_mk_mm_markdown,
    ocr_mk_mm_standard_format,
    ocr_mk_mm_markdown_with_para,
)
from magic_pdf.libs.commons import (
    read_file,
    join_path,
    parse_bucket_key,
    formatted_time,
    s3_image_save_path,
)
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.dict2md.mkcontent import mk_nlp_markdown, mk_universal_format
from magic_pdf.pdf_parse_by_model import parse_pdf_by_model
from magic_pdf.filter.pdf_classify_by_type import classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
from magic_pdf.pdf_parse_for_train import parse_pdf_for_train
from magic_pdf.train_utils.convert_to_train_format import convert_to_train_format
from app.common.s3 import get_s3_config, get_s3_client
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
赵小蒙's avatar
赵小蒙 committed
30
31
32
33


def exception_handler(jso: dict, e):
    """Log an exception and mark the record as dropped because of it.

    Args:
        jso: Record dict of the PDF being processed; mutated in place.
        e: The exception caught by the calling pipeline stage.

    Returns:
        The same ``jso`` dict with ``need_drop`` set to ``True``,
        ``drop_reason`` set to ``DropReason.Exception`` and ``exception``
        holding a string rendering of ``e``.
    """
    logger.exception(e)
    jso["need_drop"] = True
    jso["drop_reason"] = DropReason.Exception
    jso["exception"] = f"ERROR: {e}"
    return jso


40
def get_data_type(jso: dict):
    """Return the record's data type.

    Reads ``jso["data_type"]``; when that key is absent or ``None`` the value
    of ``jso["file_type"]`` is used instead.

    Args:
        jso: Record dict of the PDF being processed.

    Returns:
        The value found, or ``None`` when neither key yields one.
    """
    data_type = jso.get("data_type")
    # Only a missing/None value triggers the fallback; other falsy values
    # (e.g. an empty string) are returned as they are.
    if data_type is None:
        data_type = jso.get("file_type")
    return data_type


def get_bookid(jso: dict):
    """Return the record's book id.

    Reads ``jso["bookid"]``; when that key is absent or ``None`` the value of
    ``jso["original_file_id"]`` is used instead.

    Args:
        jso: Record dict of the PDF being processed.

    Returns:
        The value found, or ``None`` when neither key yields one.
    """
    book_id = jso.get("bookid")
    # Only a missing/None value triggers the fallback.
    if book_id is None:
        book_id = jso.get("original_file_id")
    return book_id


def get_data_source(jso: dict):
    """Return the record's data source.

    Reads ``jso["data_source"]``; when that key is absent or ``None`` the
    value of ``jso["file_source"]`` is used instead.

    Args:
        jso: Record dict of the PDF being processed.

    Returns:
        The value found, or ``None`` when neither key yields one.
    """
    data_source = jso.get("data_source")
    # Only a missing/None value triggers the fallback.
    if data_source is None:
        data_source = jso.get("file_source")
    return data_source


赵小蒙's avatar
赵小蒙 committed
61
def meta_scan(jso: dict, doc_layout_check=True) -> dict:
    """Read the PDF referenced by the record and attach its scanned metadata.

    Args:
        jso: Record dict; ``file_location`` is the path handed to
            ``get_s3_config``, ``read_file`` and ``pdf_meta_scan``. Mutated
            in place.
        doc_layout_check: When true, a record without a
            ``doc_layout_result`` key is marked as dropped and returned
            without reading the file.

    Returns:
        The record. On success it carries ``pdf_meta`` (the scan result),
        empty ``content`` / ``remark`` / ``data_url`` strings and the
        ``read_file_time`` / ``meta_scan_time`` timings in whole seconds.
        When the scan itself asks for a drop, ``need_drop`` and
        ``drop_reason`` are copied from the scan result. Any exception raised
        inside the scan section is routed through ``exception_handler``.
    """
    s3_pdf_path = jso.get("file_location")
    s3_config = get_s3_config(s3_pdf_path)

    # Without model (layout) output the PDF cannot be processed further.
    if doc_layout_check and "doc_layout_result" not in jso:
        jso["need_drop"] = True
        jso["drop_reason"] = DropReason.MISS_DOC_LAYOUT_RESULT
        return jso

    try:
        data_source = get_data_source(jso)
        file_id = jso.get("file_id")
        book_name = f"{data_source}/{file_id}"

        # NOTE: a former special case that dropped "zlib/zlib_21822650"
        # (excessive drawings on the first page) is disabled.

        start_time = time.time()
        # No extra keyword arguments are passed to loguru: with any args or
        # kwargs it would run str.format on the already-interpolated message
        # and fail on literal braces.
        logger.info(
            f"book_name is:{book_name},start_time is:{formatted_time(start_time)}"
        )
        file_content = read_file(s3_pdf_path, s3_config)
        read_file_time = int(time.time() - start_time)

        start_time = time.time()
        res = pdf_meta_scan(s3_pdf_path, file_content)
        if res.get("need_drop", False):
            # The scan asked for a drop: propagate its reason and stop here.
            jso["need_drop"] = True
            jso["drop_reason"] = res["drop_reason"]
        else:
            jso["pdf_meta"] = res
            jso["content"] = ""
            jso["remark"] = ""
            jso["data_url"] = ""
        end_time = time.time()
        meta_scan_time = int(end_time - start_time)

        logger.info(
            f"book_name is:{book_name},end_time is:{formatted_time(end_time)},read_file_time is:{read_file_time},meta_scan_time is:{meta_scan_time}"
        )
        jso["read_file_time"] = read_file_time
        jso["meta_scan_time"] = meta_scan_time
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def classify_by_type(jso: dict, debug_mode=False) -> dict:
    """Classify the PDF as text-based or not and flag records to drop.

    Args:
        jso: Record dict carrying ``pdf_meta``; mutated in place.
        debug_mode: When false, a record already flagged ``need_drop`` is
            returned untouched. When true, that check is skipped.

    Returns:
        The record. Encrypted or password-protected PDFs are dropped with
        ``DropReason.ENCRYPTED``. Otherwise ``pdf_meta["is_text_pdf"]`` and
        ``classify_time`` (whole seconds) are stored; text PDFs whose
        language is not ``zh``/``en`` are dropped with
        ``DropReason.NOT_ALLOW_LANGUAGE``; non-text PDFs are dropped with
        ``DropReason.NOT_IS_TEXT_PDF`` and the failed classification rules
        are listed under ``extra_info["classify_rules"]``. Exceptions are
        routed through ``exception_handler``.
    """
    if not debug_mode and jso.get("need_drop", False):
        return jso

    try:
        pdf_meta = jso.get("pdf_meta")
        total_page = pdf_meta["total_page"]
        page_width = pdf_meta["page_width_pts"]
        page_height = pdf_meta["page_height_pts"]
        img_sz_list = pdf_meta["image_info_per_page"]
        img_num_list = pdf_meta["imgs_per_page"]
        text_len_list = pdf_meta["text_len_per_page"]
        text_layout_list = pdf_meta["text_layout_per_page"]
        text_language = pdf_meta["text_language"]
        pdf_path = pdf_meta["pdf_path"]
        is_encrypted = pdf_meta["is_encrypted"]
        is_needs_password = pdf_meta["is_needs_password"]

        if is_encrypted or is_needs_password:
            # Encrypted or password-protected files are not processed.
            jso["need_drop"] = True
            jso["drop_reason"] = DropReason.ENCRYPTED
        else:
            start_time = time.time()
            is_text_pdf, results = classify(
                pdf_path,
                total_page,
                page_width,
                page_height,
                img_sz_list,
                text_len_list,
                img_num_list,
                text_layout_list,
            )
            classify_time = int(time.time() - start_time)

            # Recorded for both outcomes of the classification.
            pdf_meta["is_text_pdf"] = is_text_pdf
            jso["pdf_meta"] = pdf_meta
            jso["classify_time"] = classify_time

            if is_text_pdf:
                # Only Simplified Chinese and English are allowed for now.
                allow_language = ["zh", "en"]
                if text_language not in allow_language:
                    jso["need_drop"] = True
                    jso["drop_reason"] = DropReason.NOT_ALLOW_LANGUAGE
            else:
                jso["need_drop"] = True
                jso["drop_reason"] = DropReason.NOT_IS_TEXT_PDF
                # Keep the names of the rules that evaluated falsy.
                jso["extra_info"] = {
                    "classify_rules": [
                        condition
                        for condition, result in results.items()
                        if not result
                    ]
                }
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def save_tables_to_s3(jso: dict, debug_mode=False) -> dict:
    """Copy the table images of a ``scihub`` record to the ``table_new`` prefix.

    Args:
        jso: Record dict carrying ``pdf_intermediate_dict`` (compressed),
            ``title`` and ``file_id``; mutated in place.
        debug_mode: When false, a record flagged ``need_drop`` is marked
            ``dropped`` and returned. When true, that check is skipped, and
            the source/target object keys are built from the book name and
            the table's own file name instead of the page/table counters.

    Returns:
        The record. Records whose data source is not ``"scihub"`` are
        returned unchanged. For scihub records every table image whose path
        ends in ``.jpg`` is fetched and re-uploaded, after which
        ``doc_layout_result``, ``pdf_intermediate_dict`` and ``pdf_meta`` are
        blanked. Exceptions are routed through ``exception_handler``.
    """
    if not debug_mode and jso.get("need_drop", False):
        logger.info(
            f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop"
        )
        jso["dropped"] = True
        return jso

    source_root = "s3://mllm-raw-media/pdf2md_img/"
    target_root = "s3://mllm-raw-media/pdf2md_img/table_new/"
    table_prefix = "tables/"

    try:
        data_source = get_data_source(jso)
        # Check the source first so that records from other sources are not
        # flagged as failed merely because they have no title to URL-encode.
        if data_source != "scihub":
            return jso

        file_id = jso.get("file_id")
        book_name = f"{data_source}/{file_id}"
        url_encode_title = quote(jso.get("title"), safe="")

        # The intermediate dict is stored compressed on the record.
        pdf_intermediate_dict = JsonCompressor.decompress_json(
            jso["pdf_intermediate_dict"]
        )

        for page_idx, page in enumerate(pdf_intermediate_dict.values()):
            tables = page.get("tables")
            if not tables:
                continue

            jpg_count = 0  # 1-based index of the .jpg tables on this page
            for table in tables:
                if debug_mode:
                    image_path = join_path(
                        source_root, book_name, table["image_path"]
                    )
                else:
                    image_path = join_path(source_root, table["image_path"])

                if not image_path.endswith(".jpg"):
                    continue
                jpg_count += 1

                # Load the image into memory.
                s3_client = get_s3_client(image_path)
                bucket_name, bucket_key = parse_bucket_key(image_path)
                image_bytes = s3_client.get_object(
                    Bucket=bucket_name, Key=bucket_key
                )["Body"].read()

                # Build the new location of the image.
                if debug_mode:
                    # Remove the leading "tables/" directory as a prefix.
                    # str.lstrip would strip a *set of characters* and eat
                    # the start of names such as "table_1.jpg".
                    table_name = table["image_path"]
                    if table_name.startswith(table_prefix):
                        table_name = table_name[len(table_prefix):]
                    new_image_path = join_path(
                        target_root, url_encode_title + "_" + table_name
                    )
                else:
                    new_image_path = join_path(
                        target_root,
                        url_encode_title + f"_page{page_idx}_{jpg_count}.jpg",
                    )

                # No extra kwargs: loguru would run str.format on the path.
                logger.info(new_image_path)
                bucket_name, bucket_key = parse_bucket_key(new_image_path)
                s3_client.put_object(
                    Bucket=bucket_name, Key=bucket_key, Body=image_bytes
                )

        # Blank the bulky fields that are no longer needed.
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def drop_needdrop_pdf(jso: dict) -> dict:
    """Mark a record as ``dropped`` when it is flagged ``need_drop``.

    Args:
        jso: Record dict; mutated in place.

    Returns:
        The same record. When ``need_drop`` is truthy a log line is written
        and ``dropped`` is set to ``True``; otherwise nothing changes.
    """
    if jso.get("need_drop", False):
        # No extra kwargs: with any, loguru would run str.format on the
        # already-interpolated message.
        logger.info(
            f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop"
        )
        jso["dropped"] = True
    return jso


def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
    """Convert the record's intermediate dict into the universal content list.

    Despite the name, this stage currently stores the output of
    ``mk_universal_format`` under ``content_list``; it does not write
    markdown to ``content``.

    Args:
        jso: Record dict carrying a compressed ``pdf_intermediate_dict``;
            mutated in place.
        debug_mode: When false, a record flagged ``need_drop`` is marked
            ``dropped`` and returned. When true, that check is skipped.

    Returns:
        The record with ``content_list`` set and ``doc_layout_result``,
        ``pdf_intermediate_dict`` and ``pdf_meta`` blanked. Exceptions are
        routed through ``exception_handler``.
    """
    if not debug_mode and jso.get("need_drop", False):
        book_name = join_path(get_data_source(jso), jso["file_id"])
        logger.info(f"book_name is:{book_name} need drop")
        jso["dropped"] = True
        return jso

    try:
        # The intermediate dict is stored compressed on the record.
        pdf_intermediate_dict = JsonCompressor.decompress_json(
            jso["pdf_intermediate_dict"]
        )
        jso["content_list"] = mk_universal_format(pdf_intermediate_dict)
        logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']}")

        # Blank the bulky fields that are no longer needed.
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    """Parse a text PDF with ``parse_pdf_by_model`` and store the result.

    Args:
        jso: Record dict carrying ``file_location``, ``doc_layout_result``,
            ``file_id`` and ``pdf_meta``; mutated in place.
        start_page_id: Forwarded to ``parse_pdf_by_model``.
        debug_mode: When false, a record flagged ``need_drop`` is returned
            untouched. When true, that check is skipped. Also forwarded to
            the parser.

    Returns:
        The record. PDFs with more than 3000 SVGs on any page are dropped
        with ``DropReason.HIGH_COMPUTATIONAL_lOAD_BY_SVGS``. Otherwise the
        compressed parse result is stored under ``pdf_intermediate_dict``
        (or the parser's drop reason is propagated) and ``parse_time`` is set
        in whole seconds. Exceptions raised while parsing are routed through
        ``exception_handler``.
    """
    if not debug_mode and jso.get("need_drop", False):
        return jso

    s3_pdf_path = jso.get("file_location")
    s3_config = get_s3_config(s3_pdf_path)
    model_output_json_list = jso.get("doc_layout_result")
    data_source = get_data_source(jso)
    file_id = jso.get("file_id")
    book_name = f"{data_source}/{file_id}"

    # NOTE: a former special case for "zlib/zlib_21929367" is disabled (the
    # original comment marks it as fixed), as is a limit on total pages.

    junk_img_bojids = jso["pdf_meta"]["junk_img_bojids"]

    # Guard against pages with an excessive number of SVGs. default=0 keeps
    # an empty per-page list from raising ValueError before the try block.
    svgs_per_page_list = jso["pdf_meta"]["svgs_per_page"]
    max_svgs = max(svgs_per_page_list, default=0)
    if max_svgs > 3000:
        jso["need_drop"] = True
        jso["drop_reason"] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_SVGS
        return jso

    try:
        save_path = s3_image_save_path
        image_s3_config = get_s3_config(save_path)
        start_time = time.time()
        # Log the book name and the parse start time. No extra kwargs are
        # passed: loguru would otherwise run str.format on the message.
        logger.info(
            f"book_name is:{book_name},start_time is:{formatted_time(start_time)}"
        )
        pdf_info_dict = parse_pdf_by_model(
            s3_pdf_path,
            s3_config,
            model_output_json_list,
            save_path,
            book_name,
            pdf_model_profile=None,
            image_s3_config=image_s3_config,
            start_page_id=start_page_id,
            junk_img_bojids=junk_img_bojids,
            debug_mode=debug_mode,
        )
        if pdf_info_dict.get("need_drop", False):
            # The parser asked for a drop: propagate its reason.
            jso["need_drop"] = True
            jso["drop_reason"] = pdf_info_dict["drop_reason"]
        else:
            # Normal result: compress it and store it on the record.
            jso["pdf_intermediate_dict"] = JsonCompressor.compress_json(
                pdf_info_dict
            )
        end_time = time.time()
        parse_time = int(end_time - start_time)
        # Log the book name and the elapsed time once parsing is done.
        logger.info(
            f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}"
        )
        jso["parse_time"] = parse_time
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso

"""
Unified processing logic:
1. First call parse_pdf to process text-based PDFs.
2. Then call ocr_dropped_parse_pdf to process the PDFs that were dropped earlier.
"""


398
399
400
401
402
def uni_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    """Run the text parser, then the OCR fallback for dropped records.

    Args:
        jso: Record dict; mutated in place by both stages.
        start_page_id: Forwarded to both stages.
        debug_mode: Forwarded to both stages.

    Returns:
        The record after ``parse_pdf`` followed by ``ocr_dropped_parse_pdf``.
    """
    jso = parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
    # Records that ended up flagged need_drop get a second chance via OCR.
    jso = ocr_dropped_parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode)
    return jso

许瑞's avatar
许瑞 committed
403

404
405
# Dedicated to PDFs that were dropped earlier; once they have been run
# through OCR the need_drop field is reset to False.
def ocr_dropped_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    """Run the OCR parser on a record that is flagged ``need_drop``.

    Args:
        jso: Record dict; mutated in place.
        start_page_id: Forwarded to ``ocr_parse_pdf_core``.
        debug_mode: Forwarded to ``ocr_parse_pdf_core``.

    Returns:
        The record unchanged when ``need_drop`` is not set; otherwise the
        record after OCR parsing with ``need_drop`` reset to ``False``.
    """
    if not jso.get("need_drop", False):
        return jso

    jso = ocr_parse_pdf_core(
        jso, start_page_id=start_page_id, debug_mode=debug_mode
    )
    # NOTE(review): the flag is cleared even when the OCR pass itself went
    # through exception_handler - confirm that this is intended.
    jso["need_drop"] = False
    return jso

赵小蒙's avatar
赵小蒙 committed
415

416
def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    """Run the OCR parser unless the record is flagged ``need_drop``.

    Args:
        jso: Record dict; mutated in place.
        start_page_id: Forwarded to ``ocr_parse_pdf_core``.
        debug_mode: When false, a record flagged ``need_drop`` is returned
            untouched. When true, that check is skipped. Also forwarded to
            ``ocr_parse_pdf_core``.

    Returns:
        The record after OCR parsing, or the untouched record when skipped.
    """
    if not debug_mode and jso.get("need_drop", False):
        return jso

    return ocr_parse_pdf_core(
        jso, start_page_id=start_page_id, debug_mode=debug_mode
    )


def ocr_parse_pdf_core(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    """Parse the PDF with ``parse_pdf_by_ocr`` and store the compressed result.

    Args:
        jso: Record dict carrying ``file_location``, ``doc_layout_result``
            and ``file_id``; mutated in place.
        start_page_id: Forwarded to ``parse_pdf_by_ocr``.
        debug_mode: Forwarded to ``parse_pdf_by_ocr``.

    Returns:
        The record with ``pdf_intermediate_dict`` (compressed parse result)
        and ``parse_time`` (whole seconds) set. Exceptions raised while
        parsing are routed through ``exception_handler``.
    """
    s3_pdf_path = jso.get("file_location")
    s3_config = get_s3_config(s3_pdf_path)
    model_output_json_list = jso.get("doc_layout_result")
    data_source = get_data_source(jso)
    file_id = jso.get("file_id")
    book_name = f"{data_source}/{file_id}"

    try:
        save_path = s3_image_save_path
        image_s3_config = get_s3_config(save_path)
        start_time = time.time()
        # Log the book name and the parse start time. No extra kwargs are
        # passed: loguru would otherwise run str.format on the message.
        logger.info(
            f"book_name is:{book_name},start_time is:{formatted_time(start_time)}"
        )
        pdf_info_dict = parse_pdf_by_ocr(
            s3_pdf_path,
            s3_config,
            model_output_json_list,
            save_path,
            book_name,
            pdf_model_profile=None,
            image_s3_config=image_s3_config,
            start_page_id=start_page_id,
            debug_mode=debug_mode,
        )
        # Compress the parse result before storing it on the record.
        jso["pdf_intermediate_dict"] = JsonCompressor.compress_json(pdf_info_dict)
        end_time = time.time()
        parse_time = int(end_time - start_time)
        # Log the book name and the elapsed time once parsing is done.
        logger.info(
            f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}"
        )
        jso["parse_time"] = parse_time
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
    """Render the OCR intermediate dict to markdown via ``ocr_mk_mm_markdown``.

    Args:
        jso: Record dict carrying a compressed ``pdf_intermediate_dict``;
            mutated in place.
        debug_mode: When false, a record flagged ``need_drop`` is marked
            ``dropped`` and returned. When true, that check is skipped.

    Returns:
        The record with the markdown stored under ``content`` and
        ``doc_layout_result``, ``pdf_intermediate_dict`` and ``pdf_meta``
        blanked. Exceptions are routed through ``exception_handler``.
    """
    if not debug_mode and jso.get("need_drop", False):
        book_name = join_path(get_data_source(jso), jso["file_id"])
        logger.info(f"book_name is:{book_name} need drop")
        jso["dropped"] = True
        return jso

    try:
        # The intermediate dict is stored compressed on the record.
        pdf_intermediate_dict = JsonCompressor.decompress_json(
            jso["pdf_intermediate_dict"]
        )
        markdown_content = ocr_mk_mm_markdown(pdf_intermediate_dict)
        jso["content"] = markdown_content
        # No extra kwargs: loguru would run str.format on the message.
        logger.info(
            f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}"
        )

        # Blank the bulky fields that are no longer needed.
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso
497
498


许瑞's avatar
许瑞 committed
499
500
501
def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(
    jso: dict, debug_mode=False
) -> dict:
    """Render the OCR intermediate dict via ``ocr_mk_mm_markdown_with_para``.

    Args:
        jso: Record dict carrying a compressed ``pdf_intermediate_dict``;
            mutated in place.
        debug_mode: When false, a record flagged ``need_drop`` is marked
            ``dropped`` and returned. When true, that check is skipped.

    Returns:
        The record with the markdown stored under ``content_ocr`` and
        ``doc_layout_result``, ``pdf_intermediate_dict`` and ``pdf_meta``
        blanked. Exceptions are routed through ``exception_handler``.
    """
    if not debug_mode and jso.get("need_drop", False):
        book_name = join_path(get_data_source(jso), jso["file_id"])
        logger.info(f"book_name is:{book_name} need drop")
        jso["dropped"] = True
        return jso

    try:
        # The intermediate dict is stored compressed on the record.
        pdf_intermediate_dict = JsonCompressor.decompress_json(
            jso["pdf_intermediate_dict"]
        )
        markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
        jso["content_ocr"] = markdown_content
        # No extra kwargs: loguru would run str.format on the message.
        logger.info(
            f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}"
        )

        # Blank the bulky fields that are no longer needed.
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


赵小蒙's avatar
赵小蒙 committed
530
531
532
533
534
def ocr_pdf_intermediate_dict_to_standard_format(jso: dict, debug_mode=False) -> dict:
    """Convert the OCR intermediate dict via ``ocr_mk_mm_standard_format``.

    Args:
        jso: Record dict carrying a compressed ``pdf_intermediate_dict``;
            mutated in place.
        debug_mode: When false, a record flagged ``need_drop`` is marked
            ``dropped`` and returned. When true, that check is skipped.

    Returns:
        The record with the converted result stored under ``content_list``
        and ``doc_layout_result``, ``pdf_intermediate_dict`` and ``pdf_meta``
        blanked. Exceptions are routed through ``exception_handler``.
    """
    if not debug_mode and jso.get("need_drop", False):
        book_name = join_path(get_data_source(jso), jso["file_id"])
        logger.info(f"book_name is:{book_name} need drop")
        jso["dropped"] = True
        return jso

    try:
        # The intermediate dict is stored compressed on the record.
        pdf_intermediate_dict = JsonCompressor.decompress_json(
            jso["pdf_intermediate_dict"]
        )
        standard_format = ocr_mk_mm_standard_format(pdf_intermediate_dict)
        jso["content_list"] = standard_format
        # No extra kwargs: loguru would run str.format on the message.
        logger.info(
            f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}"
        )

        # Blank the bulky fields that are no longer needed.
        jso["doc_layout_result"] = ""
        jso["pdf_intermediate_dict"] = ""
        jso["pdf_meta"] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


许瑞's avatar
许瑞 committed
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    """Parse a PDF with ``parse_pdf_for_train`` and attach training output.

    Args:
        jso: Record dict carrying ``file_location``, ``doc_layout_result``,
            ``file_id`` and ``pdf_meta``; mutated in place.
        start_page_id: Forwarded to ``parse_pdf_for_train``.
        debug_mode: When false, a record flagged ``need_drop`` is returned
            untouched. When true, that check is skipped. Also forwarded to
            the parser.

    Returns:
        The record. PDFs with more than 3000 SVGs on any page are dropped
        with ``DropReason.HIGH_COMPUTATIONAL_lOAD_BY_SVGS``. Otherwise the
        compressed parse result is stored under ``pdf_intermediate_dict``,
        the converted training data under ``parsed_results`` (or the parser's
        drop reason is propagated), and ``parse_time`` is set in whole
        seconds. Exceptions raised while parsing are routed through
        ``exception_handler``.
    """
    if not debug_mode and jso.get("need_drop", False):
        return jso

    s3_pdf_path = jso.get("file_location")
    s3_config = get_s3_config(s3_pdf_path)
    model_output_json_list = jso.get("doc_layout_result")
    data_source = get_data_source(jso)
    file_id = jso.get("file_id")
    book_name = f"{data_source}/{file_id}"

    # NOTE: a former special case for "zlib/zlib_21929367" is disabled (the
    # original comment marks it as fixed), as is a limit on total pages.

    junk_img_bojids = jso["pdf_meta"]["junk_img_bojids"]

    # Guard against pages with an excessive number of SVGs. default=0 keeps
    # an empty per-page list from raising ValueError before the try block.
    svgs_per_page_list = jso["pdf_meta"]["svgs_per_page"]
    max_svgs = max(svgs_per_page_list, default=0)
    if max_svgs > 3000:
        jso["need_drop"] = True
        jso["drop_reason"] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_SVGS
        return jso

    try:
        save_path = s3_image_save_path
        image_s3_config = get_s3_config(save_path)
        start_time = time.time()
        # Log the book name and the parse start time. No extra kwargs are
        # passed: loguru would otherwise run str.format on the message.
        logger.info(
            f"book_name is:{book_name},start_time is:{formatted_time(start_time)}"
        )
        pdf_info_dict = parse_pdf_for_train(
            s3_pdf_path,
            s3_config,
            model_output_json_list,
            save_path,
            book_name,
            pdf_model_profile=None,
            image_s3_config=image_s3_config,
            start_page_id=start_page_id,
            junk_img_bojids=junk_img_bojids,
            debug_mode=debug_mode,
        )
        if pdf_info_dict.get("need_drop", False):
            # The parser asked for a drop: propagate its reason.
            jso["need_drop"] = True
            jso["drop_reason"] = pdf_info_dict["drop_reason"]
        else:
            # Convert the *uncompressed* parse result first; previously the
            # variable had already been rebound to the compressed payload
            # when it was handed to convert_to_train_format.
            # NOTE(review): assumes the converter expects the parser's dict,
            # not the compressed form - confirm against its definition.
            parsed_results = convert_to_train_format(pdf_info_dict)
            jso["pdf_intermediate_dict"] = JsonCompressor.compress_json(
                pdf_info_dict
            )
            jso["parsed_results"] = parsed_results
        end_time = time.time()
        parse_time = int(end_time - start_time)
        # Log the book name and the elapsed time once parsing is done.
        logger.info(
            f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}"
        )
        jso["parse_time"] = parse_time
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso


赵小蒙's avatar
赵小蒙 committed
639
640
# Script entry point: running this module directly is a no-op; the pipeline
# stages above are meant to be imported and called.
if __name__ == "__main__":
    pass