TXTPipe.py 1.12 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from magic_pdf.io.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_txt_pdf


class TXTPipe(AbsPipe):

    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_bucket_path: str):
        self.compressed_pdf_mid_data = None
        self.pdf_mid_data = None
        self.pdf_bytes = pdf_bytes
        self.model_list = model_list
        self.image_writer = image_writer
        self.img_bucket_path = img_bucket_path

    def pipe_classify(self):
        pass

    def pipe_parse(self):
        self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer)
        self.compressed_pdf_mid_data = JsonCompressor.compress_json(self.pdf_mid_data)

    def pipe_mk_uni_format(self):
        content_list = AbsPipe.mk_uni_format(self.compressed_pdf_mid_data, self.img_bucket_path)
        return content_list

    def pipe_mk_markdown(self):
        md_content = AbsPipe.mk_markdown(self.compressed_pdf_mid_data, self.img_bucket_path)
        return md_content