cli.py 2.67 KB
Newer Older
icecraft's avatar
icecraft committed
1
import os
drunkpig's avatar
drunkpig committed
2
3
from pathlib import Path

icecraft's avatar
icecraft committed
4
5
import click
from loguru import logger
6

icecraft's avatar
icecraft committed
7
import magic_pdf.model as model_config
8
from magic_pdf.libs.version import __version__
drunkpig's avatar
drunkpig committed
9
10
11
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.tools.common import do_parse, parse_pdf_methods
icecraft's avatar
icecraft committed
12
13
14


# Command-line entry point for PDF parsing.
#
# Accepts either a single file or a directory via -p/--path. For a directory,
# every ``*.pdf`` located directly inside it (non-recursive glob) is handed to
# ``do_parse`` one after another; otherwise the given path itself is parsed.
#
# NOTE: this command is documented with comments rather than a function
# docstring on purpose: click uses a command function's docstring as its
# ``--help`` text, so adding one would change the user-visible output.
@click.command()
@click.version_option(__version__,
                      '--version',
                      '-v',
                      help='display the version and exit')
@click.option(
    '-p',
    '--path',
    'path',
    type=click.Path(exists=True),
    required=True,
    help='local pdf filepath or directory',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='output local directory',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperform ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
    default='auto',
)
@click.option(
    '-d',
    '--debug',
    'debug_able',
    type=bool,
    help='Enables detailed debugging information during the execution of the CLI commands.',
    default=False,
)
@click.option(
    '-s',
    '--start',
    'start_page_id',
    type=int,
    help='The starting page for PDF parsing, beginning from 0.',
    default=0,
)
@click.option(
    '-e',
    '--end',
    'end_page_id',
    type=int,
    help='The ending page for PDF parsing, beginning from 0.',
    default=None,
)
def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
    # Module-level switches on magic_pdf.model; their exact effect is defined
    # by that package (presumably: use the bundled model in its full mode).
    model_config.__use_inside_model__ = True
    model_config.__model_mode__ = 'full'

    # The output directory may not exist yet (click.Path() does not require it).
    os.makedirs(output_dir, exist_ok=True)

    def read_fn(file_path):
        """Read ``file_path`` through a DiskReaderWriter rooted at its parent directory.

        The file is read with ``AbsReaderWriter.MODE_BIN`` and the reader's
        result is returned unchanged.
        """
        disk_rw = DiskReaderWriter(os.path.dirname(file_path))
        return disk_rw.read(os.path.basename(file_path), AbsReaderWriter.MODE_BIN)

    def parse_doc(doc_path: str):
        """Parse one document and log, rather than propagate, any failure.

        Swallowing the exception after logging is deliberate: when a whole
        directory is processed, one bad file must not abort the remaining ones.
        """
        try:
            # The file name without its extension is passed to do_parse as the
            # document's name.
            file_name = Path(doc_path).stem
            pdf_data = read_fn(doc_path)
            do_parse(
                output_dir,
                file_name,
                pdf_data,
                [],  # fourth positional argument of do_parse; always empty here
                method,
                debug_able,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
            )

        except Exception as e:
            logger.exception(e)

    if os.path.isdir(path):
        for doc_path in Path(path).glob('*.pdf'):
            # glob() yields Path objects; convert so parse_doc always
            # receives the ``str`` its annotation promises.
            parse_doc(str(doc_path))
    else:
        parse_doc(path)


drunkpig's avatar
drunkpig committed
105
# Invoke the click command when this file is executed directly as a script.
if __name__ == '__main__':
    cli()