# pdf_parse_by_txt.py
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.dataset import PymuDocDataset
3
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union

# Thin wrapper: runs pdf_parse_union with SupportedPdfParseMethod.TXT.

def parse_pdf_by_txt(
    pdf_bytes,
    model_list,
    imageWriter,
    start_page_id=0,
    end_page_id=None,
    debug_mode=False,
    lang=None,
):
    """Run ``pdf_parse_union`` in ``SupportedPdfParseMethod.TXT`` mode.

    Wraps ``pdf_bytes`` in a ``PymuDocDataset`` and delegates all work to
    ``pdf_parse_union``; this function adds no logic of its own beyond
    selecting the parse method.

    Args:
        pdf_bytes: Passed directly to ``PymuDocDataset``; presumably the raw
            bytes of the PDF file -- TODO confirm against ``PymuDocDataset``.
        model_list: Forwarded unchanged to ``pdf_parse_union``.
        imageWriter: Forwarded unchanged to ``pdf_parse_union``. The
            camelCase name is kept because callers may pass it by keyword.
        start_page_id: Forwarded as ``start_page_id``; defaults to ``0``.
        end_page_id: Forwarded as ``end_page_id``; defaults to ``None``.
        debug_mode: Forwarded as ``debug_mode``; defaults to ``False``.
        lang: Forwarded as ``lang``; defaults to ``None``.

    Returns:
        Whatever ``pdf_parse_union`` returns for the given arguments.
    """
    dataset = PymuDocDataset(pdf_bytes)
    return pdf_parse_union(
        dataset,
        model_list,
        imageWriter,
        SupportedPdfParseMethod.TXT,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        debug_mode=debug_mode,
        lang=lang,
    )