cli.py 3.34 KB
Newer Older
icecraft's avatar
icecraft committed
1
import os
2
3
from pathlib import Path

icecraft's avatar
icecraft committed
4
5
import click
from loguru import logger
6

icecraft's avatar
icecraft committed
7
import magic_pdf.model as model_config
8
from magic_pdf.libs.version import __version__
9
10
11
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.tools.common import do_parse, parse_pdf_methods
icecraft's avatar
icecraft committed
12
13
14


@click.command()
@click.version_option(__version__,
                      '--version',
                      '-v',
                      help='display the version and exit')
@click.option(
    '-p',
    '--path',
    'path',
    type=click.Path(exists=True),
    required=True,
    help='local pdf filepath or directory',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='output local directory',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperform ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
    default='auto',
)
@click.option(
    '-l',
    '--lang',
    'lang',
    type=str,
    help="""
    Input the languages in the pdf (if known) to improve OCR accuracy.  Optional.
    You should input "Abbreviation" with language form url:
    https://paddlepaddle.github.io/PaddleOCR/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
    """,
    default=None,
)
@click.option(
    '-d',
    '--debug',
    'debug_able',
    type=bool,
    help='Enables detailed debugging information during the execution of the CLI commands.',
    default=False,
)
@click.option(
    '-s',
    '--start',
    'start_page_id',
    type=int,
    help='The starting page for PDF parsing, beginning from 0.',
    default=0,
)
@click.option(
    '-e',
    '--end',
    'end_page_id',
    type=int,
    help='The ending page for PDF parsing, beginning from 0.',
    default=None,
)
def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
    """Parse a local PDF file, or every *.pdf file directly inside a directory."""
    # Parameters (all supplied by the click options declared above):
    #   path          -- existing PDF file or directory (-p/--path).
    #   output_dir    -- directory handed to do_parse; created if missing.
    #   method        -- parsing method, 'auto' by default (-m/--method).
    #   lang          -- optional language abbreviation, None by default.
    #   debug_able    -- debug flag forwarded to do_parse, False by default.
    #   start_page_id -- first page to parse, 0 by default.
    #   end_page_id   -- last page to parse, None by default.
    #
    # NOTE(review): the three merge-conflict hunks that used to live in this
    # block were resolved in favour of HEAD, the only side that carried
    # content (the --lang option and its plumbing). This assumes do_parse
    # accepts a ``lang`` keyword -- confirm against magic_pdf.tools.common.

    # Module-level switches on magic_pdf.model, set before any parsing starts.
    model_config.__use_inside_model__ = True
    model_config.__model_mode__ = 'full'
    os.makedirs(output_dir, exist_ok=True)

    def read_fn(file_path):
        """Return the bytes of ``file_path``, read through DiskReaderWriter."""
        # The reader is rooted at the parent directory and asked for the
        # base name, so together they address ``file_path`` itself.
        disk_rw = DiskReaderWriter(os.path.dirname(file_path))
        return disk_rw.read(os.path.basename(file_path), AbsReaderWriter.MODE_BIN)

    def parse_doc(doc_path: str):
        """Read one document and pass it to do_parse, logging any failure."""
        # In directory mode ``doc_path`` is a pathlib.Path produced by glob();
        # Path() and the os.path helpers accept both str and Path.
        try:
            file_name = str(Path(doc_path).stem)
            pdf_data = read_fn(doc_path)
            do_parse(
                output_dir,
                file_name,
                pdf_data,
                [],  # NOTE(review): meaning of this empty list is defined by do_parse
                method,
                debug_able,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                lang=lang,
            )

        except Exception as e:
            # Best effort: a failing document is logged with its traceback
            # and does not stop the remaining documents from being handled.
            logger.exception(e)

    if os.path.isdir(path):
        # Non-recursive: only *.pdf entries directly inside the directory.
        for doc_path in Path(path).glob('*.pdf'):
            parse_doc(doc_path)
    else:
        parse_doc(path)


128
if __name__ == '__main__':
    # Invoke the click command when this module is executed as a script;
    # click reads the arguments from the command line.
    cli()