pipeline_txt.bak 2.65 KB
Newer Older
1
2
3
4
"""
文本型pdf转化为统一清洗格式
"""

5
# TODO 移动到spark/目录下
6
7

from loguru import logger
8
from magic_pdf.dict2md.mkcontent import mk_mm_markdown, mk_universal_format
9
10
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.json_compressor import JsonCompressor
赵小蒙's avatar
赵小蒙 committed
11
from magic_pdf.spark.base import exception_handler, get_data_source
12
13
14


def txt_pdf_to_standard_format(jso: dict, debug_mode=False) -> dict:
    """
    Convert a text-type PDF record into the unified standard format.

    Outside debug mode, a record whose ``need_drop`` field is truthy is
    marked with ``jso["dropped"] = True`` and returned without conversion.
    Otherwise ``jso["pdf_intermediate_dict"]`` is decompressed, passed
    through ``mk_universal_format`` and the result is stored under
    ``jso["content_list"]``. The ``doc_layout_result``,
    ``pdf_intermediate_dict`` and ``pdf_meta`` fields are then overwritten
    with empty strings.

    Any exception raised during conversion is handed to
    ``exception_handler`` and whatever it returns is returned to the caller.
    """
    # The need_drop check only applies when debug mode is off.
    if not debug_mode and jso.get("need_drop", False):
        book_name = join_path(get_data_source(jso), jso["file_id"])
        logger.info(f"book_name is:{book_name} need drop")
        jso["dropped"] = True
        return jso

    try:
        # Decompress the intermediate dict before building the output.
        intermediate = JsonCompressor.decompress_json(jso["pdf_intermediate_dict"])
        standard_format = mk_universal_format(intermediate)
        jso["content_list"] = standard_format
        logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}")
        # Blank out the fields that are no longer needed.
        for stale_key in ("doc_layout_result", "pdf_intermediate_dict", "pdf_meta"):
            jso[stale_key] = ""
    except Exception as e:
        return exception_handler(jso, e)
    return jso
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59


def txt_pdf_to_mm_markdown_format(jso: dict, debug_mode=False) -> dict:
    """
    Convert a text-type PDF record into multimodal markdown.

    Outside debug mode, a record whose ``need_drop`` field is truthy is
    marked with ``jso["dropped"] = True`` and returned without conversion.
    Otherwise ``jso["pdf_intermediate_dict"]`` is decompressed, passed
    through ``mk_universal_format`` and then ``mk_mm_markdown``; the
    markdown result is stored under ``jso["content"]``. The
    ``doc_layout_result``, ``pdf_intermediate_dict``, ``pdf_meta`` and
    ``parsed_result`` keys are then removed from the record.

    Any exception raised during conversion is handed to
    ``exception_handler`` and whatever it returns is returned to the caller.
    """
    # The need_drop check only applies when debug mode is off.
    if not debug_mode and jso.get("need_drop", False):
        book_name = join_path(get_data_source(jso), jso["file_id"])
        logger.info(f"book_name is:{book_name} need drop")
        jso["dropped"] = True
        return jso

    try:
        # Decompress the intermediate dict before building the output.
        intermediate = JsonCompressor.decompress_json(jso["pdf_intermediate_dict"])
        standard_format = mk_universal_format(intermediate)
        jso["content"] = mk_mm_markdown(standard_format)
        logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}")
        # Remove the fields that are no longer needed. pop() with a default
        # also removes keys whose value is falsy (e.g. already blanked to "")
        # and is a no-op for keys that are absent.
        for key in ("doc_layout_result", "pdf_intermediate_dict", "pdf_meta", "parsed_result"):
            jso.pop(key, None)
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso