"macapp/package.json" did not exist on "7ed5a39bc7d41d809d0bcbc010344286cad7a6bc"
AbsPipe.py 3.84 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
from abc import ABC, abstractmethod

from magic_pdf.dict2md.mkcontent import mk_universal_format, mk_mm_markdown
from magic_pdf.dict2md.ocr_mkcontent import make_standard_format_with_para, ocr_mk_mm_markdown_with_para, union_make
from magic_pdf.filter.pdf_classify_by_type import classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
from magic_pdf.libs.MakeContentConfig import MakeMode, DropMode
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.json_compressor import JsonCompressor


class AbsPipe(ABC):
    """
    Abstract base class for the text ("txt") and OCR ("ocr") processing pipes.

    Concrete pipes implement the stateful ``pipe_*`` steps. The static helpers
    (``classify``, ``mk_uni_format``, ``mk_markdown``) work purely on their
    arguments and need no instance.
    """

    # Identifiers returned by ``classify`` to name the pipe type.
    PIP_OCR = "ocr"
    PIP_TXT = "txt"

    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
        """
        Store the inputs of a pipe run.

        Args:
            pdf_bytes: Raw bytes of the PDF to process.
            model_list: Model data for the PDF; it is only stored here.
                NOTE(review): presumably per-page model output; confirm with callers.
            image_writer: Reader/writer object kept for later use.
                NOTE(review): presumably used by subclasses to persist images; confirm.
            is_debug: Debug flag, stored as-is.
        """
        self.pdf_bytes = pdf_bytes
        self.model_list = model_list
        self.image_writer = image_writer
        # Uncompressed intermediate data; stays None until something assigns it.
        self.pdf_mid_data = None
        self.is_debug = is_debug

    def get_compress_pdf_mid_data(self):
        """
        Return ``self.pdf_mid_data`` passed through ``JsonCompressor.compress_json``.

        NOTE(review): ``pdf_mid_data`` is initialised to None in ``__init__``;
        this is presumably meant to be called after parsing has filled it in.
        """
        return JsonCompressor.compress_json(self.pdf_mid_data)

    @abstractmethod
    def pipe_classify(self):
        """
        Stateful classification step.
        """
        raise NotImplementedError

    @abstractmethod
    def pipe_parse(self):
        """
        Stateful parsing step.
        """
        raise NotImplementedError

    @abstractmethod
    def pipe_mk_uni_format(self, img_parent_path, drop_mode):
        """
        Stateful step that assembles the unified content format.
        """
        raise NotImplementedError

    @abstractmethod
    def pipe_mk_markdown(self, img_parent_path, drop_mode):
        """
        Stateful step that assembles markdown.
        """
        raise NotImplementedError

    @staticmethod
    def classify(pdf_bytes: bytes) -> str:
        """
        Decide from the PDF metadata whether this is a text PDF or an OCR PDF.

        Args:
            pdf_bytes: Raw bytes of the PDF.

        Returns:
            ``AbsPipe.PIP_TXT`` when the classifier reports a text PDF,
            otherwise ``AbsPipe.PIP_OCR``.

        Raises:
            Exception: If the metadata scan sets ``_need_drop``, or the PDF is
                encrypted or needs a password.
        """
        pdf_meta = pdf_meta_scan(pdf_bytes)

        # The scan itself asked for the document to be dropped.
        if pdf_meta.get("_need_drop", False):
            raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")

        # Read both flags before testing so a missing key fails the same way
        # regardless of the other flag's value.
        is_encrypted = pdf_meta["is_encrypted"]
        is_needs_password = pdf_meta["is_needs_password"]
        # Encrypted or password-protected PDFs are not processed.
        if is_encrypted or is_needs_password:
            raise Exception(f"pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}")

        # ``classify`` here is the module-level function imported from
        # pdf_classify_by_type, not this static method. Its second return
        # value is not used.
        is_text_pdf, _ = classify(
            pdf_meta["total_page"],
            pdf_meta["page_width_pts"],
            pdf_meta["page_height_pts"],
            pdf_meta["image_info_per_page"],
            pdf_meta["text_len_per_page"],
            pdf_meta["imgs_per_page"],
            pdf_meta["text_layout_per_page"],
        )
        return AbsPipe.PIP_TXT if is_text_pdf else AbsPipe.PIP_OCR

    @staticmethod
    def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
        """
        Build the unified-format content list from compressed intermediate data.

        Args:
            compressed_pdf_mid_data: Intermediate data as accepted by
                ``JsonCompressor.decompress_json``.
            img_buket_path: Image path argument forwarded to ``union_make``.
            drop_mode: Drop mode forwarded to ``union_make``.

        Returns:
            The result of ``union_make`` in ``MakeMode.STANDARD_FORMAT``.
        """
        pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
        return union_make(pdf_mid_data["pdf_info"], MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)

    @staticmethod
    def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
        """
        Build markdown content from compressed intermediate data.

        Args:
            compressed_pdf_mid_data: Intermediate data as accepted by
                ``JsonCompressor.decompress_json``.
            img_buket_path: Image path argument forwarded to ``union_make``.
            drop_mode: Drop mode forwarded to ``union_make``.

        Returns:
            The result of ``union_make`` in ``MakeMode.MM_MD``.
        """
        # NOTE(review): the return annotation says ``list``; confirm what
        # union_make actually returns for MakeMode.MM_MD.
        pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
        return union_make(pdf_mid_data["pdf_info"], MakeMode.MM_MD, drop_mode, img_buket_path)