pipeline_txt.py 2.57 KB
Newer Older
1
2
3
4
"""
Convert text-based PDFs into the unified cleaned format.
"""

5
# TODO: move this module into the spark/ directory
6
7

from loguru import logger
8
from magic_pdf.dict2md.mkcontent import mk_mm_markdown, mk_universal_format
9
10
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.json_compressor import JsonCompressor
赵小蒙's avatar
赵小蒙 committed
11
from magic_pdf.spark.base import exception_handler, get_data_source
12
13
14


def txt_pdf_to_standard_format(jso: dict, debug_mode=False) -> dict:
    """
    Convert a record's PDF intermediate result into the unified standard format.

    Args:
        jso: Record dict. Reads ``need_drop`` (optional), ``file_id`` and
            ``pdf_intermediate_dict``; writes ``content_list`` and blanks
            ``doc_layout_result``, ``pdf_intermediate_dict`` and ``pdf_meta``.
        debug_mode: When truthy, the ``need_drop`` check is skipped and the
            record is always processed.

    Returns:
        The record dict, mutated in place. If the record is flagged with
        ``need_drop`` (and debug mode is off) it is returned early with
        ``dropped`` set to True. If conversion raises, the value returned by
        ``exception_handler(jso, e)`` is returned instead.
    """
    # Outside debug mode, records flagged as need_drop are marked and skipped.
    if not debug_mode and jso.get("need_drop", False):
        book_name = join_path(get_data_source(jso), jso["file_id"])
        logger.info(f"book_name is:{book_name} need drop")
        jso["dropped"] = True
        return jso

    try:
        # The intermediate dict is stored compressed; decompress before use.
        decompressed = JsonCompressor.decompress_json(jso["pdf_intermediate_dict"])
        standard_format = mk_universal_format(decompressed)
        jso["content_list"] = standard_format
        logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}")
        # Blank out the fields that are no longer needed after conversion.
        for stale_key in ("doc_layout_result", "pdf_intermediate_dict", "pdf_meta"):
            jso[stale_key] = ""
    except Exception as err:
        jso = exception_handler(jso, err)
    return jso
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68


def txt_pdf_to_mm_markdown_format(jso: dict, debug_mode=False) -> dict:
    """
    Convert a record's PDF intermediate result into multimodal markdown.

    Args:
        jso: Record dict. Reads ``need_drop`` (optional), ``file_id`` and
            ``pdf_intermediate_dict``; writes ``content_list`` and blanks
            ``doc_layout_result``, ``pdf_intermediate_dict`` and ``pdf_meta``.
        debug_mode: When truthy, the ``need_drop`` check is skipped and the
            record is always processed.

    Returns:
        The record dict, mutated in place. If the record is flagged with
        ``need_drop`` (and debug mode is off) it is returned early with
        ``dropped`` set to True. If conversion raises, the value returned by
        ``exception_handler(jso, e)`` is returned instead.
    """
    # Outside debug mode, records flagged as need_drop are marked and skipped.
    if not debug_mode and jso.get("need_drop", False):
        book_name = join_path(get_data_source(jso), jso["file_id"])
        logger.info(f"book_name is:{book_name} need drop")
        jso["dropped"] = True
        return jso

    try:
        # The intermediate dict is stored compressed; decompress before use.
        decompressed = JsonCompressor.decompress_json(jso["pdf_intermediate_dict"])
        standard_format = mk_universal_format(decompressed)
        markdown_content = mk_mm_markdown(standard_format)
        jso["content_list"] = markdown_content
        # NOTE(review): the logged length is that of the universal-format
        # result, not of the markdown stored in content_list - confirm intended.
        logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}")
        # Blank out the fields that are no longer needed after conversion.
        for stale_key in ("doc_layout_result", "pdf_intermediate_dict", "pdf_meta"):
            jso[stale_key] = ""
    except Exception as err:
        jso = exception_handler(jso, err)
    return jso