pdf_parse_by_ocr.py 840 Bytes
Newer Older
1
from magic_pdf.config.enums import SupportedPdfParseMethod
2
from magic_pdf.data.dataset import Dataset
3
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
赵小蒙's avatar
赵小蒙 committed
4

赵小蒙's avatar
赵小蒙 committed
5

6
def parse_pdf_by_ocr(
    dataset: Dataset,
    model_list,
    imageWriter,
    start_page_id=0,
    end_page_id=None,
    debug_mode=False,
    lang=None,
):
    """Parse *dataset* through the shared union pipeline in OCR mode.

    This is a thin wrapper: it delegates to ``pdf_parse_union`` with the
    parse method fixed to ``SupportedPdfParseMethod.OCR`` and passes every
    other argument through unchanged.

    Args:
        dataset: The ``Dataset`` to parse.
        model_list: Forwarded as the first positional argument of
            ``pdf_parse_union`` (presumably per-page model output --
            TODO confirm against ``pdf_parse_union``).
        imageWriter: Forwarded as-is (presumably the writer used to persist
            extracted images -- TODO confirm).
        start_page_id: Forwarded as-is; defaults to ``0``.
        end_page_id: Forwarded as-is; defaults to ``None``.
        debug_mode: Forwarded as-is; defaults to ``False``.
        lang: Forwarded as-is; defaults to ``None``.

    Returns:
        Whatever ``pdf_parse_union`` returns for these arguments.
    """
    # Keep the pass-through options together so the delegating call stays
    # short; note that the positional order expected by pdf_parse_union
    # (model_list first, then dataset) differs from this function's own
    # signature.
    passthrough_options = {
        'start_page_id': start_page_id,
        'end_page_id': end_page_id,
        'debug_mode': debug_mode,
        'lang': lang,
    }
    return pdf_parse_union(
        model_list,
        dataset,
        imageWriter,
        SupportedPdfParseMethod.OCR,
        **passthrough_options,
    )