# cli.py 1.92 KB
# Newer Older
# icecraft's avatar
# icecraft committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os
import click
from loguru import logger
from pathlib import Path
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
import magic_pdf.model as model_config
from magic_pdf.tools.common import parse_pdf_methods, do_parse


@click.command()
@click.option(
    "-p",
    "--path",
    "path",
    required=True,
    type=click.Path(exists=True),
    help="local pdf filepath or directory",
)
@click.option(
    "-o",
    "--output-dir",
    "output_dir",
    default="",
    type=str,
    help="output local directory",
)
@click.option(
    "-m",
    "--method",
    "method",
    default="auto",
    type=parse_pdf_methods,
    # Adjacent literals with explicit "\n" keep the exact help text
    # (including the trailing spaces after two of the sentences).
    help=(
        "the method for parsing pdf. \n"
        "ocr: using ocr technique to extract information from pdf.\n"
        "txt: suitable for the text-based pdf only and outperform ocr. \n"
        "auto: automatically choose the best method for parsing pdf from ocr and txt"
    ),
)
def cli(path, output_dir, method):
    # Command entry point: parse one PDF, or every "*.pdf" directly inside a
    # directory, and hand each document to do_parse().
    #
    #   path       -- existing file or directory (validated by click.Path).
    #   output_dir -- destination directory; "" means "<input dir>/output".
    #   method     -- value produced by parse_pdf_methods (default "auto").
    #
    # NOTE: kept as comments on purpose -- click shows a command function's
    # docstring as the command description in --help output.

    # Module-level switches on magic_pdf.model, set before any parsing runs.
    # NOTE(review): presumably selects the built-in "full" model pipeline --
    # confirm against magic_pdf.model.
    model_config.__use_inside_model__ = True
    model_config.__model_mode__ = "full"

    source_is_dir = os.path.isdir(path)

    # No explicit output directory: use an "output" folder next to the input
    # (inside the directory itself when a directory was given).
    if output_dir == "":
        base_dir = path if source_is_dir else os.path.dirname(path)
        output_dir = os.path.join(base_dir, "output")

    def load_bytes(pdf_path):
        # Read the file through DiskReaderWriter rooted at its parent folder.
        parent, leaf = os.path.split(pdf_path)
        reader = DiskReaderWriter(parent)
        return reader.read(leaf, AbsReaderWriter.MODE_BIN)

    def parse_one(doc_path: str):
        # Failures are logged and swallowed so that one bad document does not
        # stop the remaining documents of a directory run.
        try:
            stem = str(Path(doc_path).stem)
            payload = load_bytes(doc_path)
            # The empty list is do_parse's fourth positional argument.
            # NOTE(review): presumably "no precomputed model output" -- confirm.
            do_parse(output_dir, stem, payload, [], method)
        except Exception as err:
            logger.exception(err)

    # Directory input expands to its top-level "*.pdf" entries (non-recursive);
    # a single file is processed as given.
    targets = Path(path).glob("*.pdf") if source_is_dir else [path]
    for target in targets:
        parse_one(target)


# Invoke the command only when this file is executed directly, not on import.
if __name__ == "__main__":
    cli()