cli.py 4.43 KB
Newer Older
icecraft's avatar
icecraft committed
1
import os
2
3
import shutil
import tempfile
icecraft's avatar
icecraft committed
4
5
from pathlib import Path

icecraft's avatar
icecraft committed
6
import click
7
import fitz
icecraft's avatar
icecraft committed
8
from loguru import logger
9

icecraft's avatar
icecraft committed
10
import magic_pdf.model as model_config
icecraft's avatar
icecraft committed
11
from magic_pdf.data.batch_build_dataset import batch_build_dataset
icecraft's avatar
icecraft committed
12
from magic_pdf.data.data_reader_writer import FileBasedDataReader
icecraft's avatar
icecraft committed
13
from magic_pdf.data.dataset import Dataset
14
from magic_pdf.libs.version import __version__
icecraft's avatar
icecraft committed
15
from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
16
17
18
19
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf

# File suffixes (lower-case, leading dot included) that the CLI accepts.
# They are kept as lists because the directory scan concatenates them with `+`.

# Inputs that are already PDF and are read as-is.
pdf_suffixes = [
    '.pdf',
]

# Office documents, converted to PDF before parsing.
ms_office_suffixes = [
    '.ppt',
    '.pptx',
    '.doc',
    '.docx',
]

# Raster images, wrapped into a PDF before parsing.
image_suffixes = [
    '.png',
    '.jpeg',
    '.jpg',
]
icecraft's avatar
icecraft committed
21
22
23


@click.command()
@click.version_option(__version__,
                      '--version',
                      '-v',
                      help='display the version and exit')
@click.option(
    '-p',
    '--path',
    'path',
    type=click.Path(exists=True),
    required=True,
    help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='output local directory',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperform ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
    default='auto',
)
@click.option(
    '-l',
    '--lang',
    'lang',
    type=str,
    help="""
    Input the languages in the pdf (if known) to improve OCR accuracy.  Optional.
    You should input "Abbreviation" with language form url:
    https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
    """,
    default=None,
)
@click.option(
    '-d',
    '--debug',
    'debug_able',
    type=bool,
    help='Enables detailed debugging information during the execution of the CLI commands.',
    default=False,
)
@click.option(
    '-s',
    '--start',
    'start_page_id',
    type=int,
    help='The starting page for PDF parsing, beginning from 0.',
    default=0,
)
@click.option(
    '-e',
    '--end',
    'end_page_id',
    type=int,
    help='The ending page for PDF parsing, beginning from 0.',
    default=None,
)
def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
    """Parse one document, or every supported document in a directory.
    \f

    Everything after the form feed above is hidden from ``--help`` by click;
    the per-option help strings remain the user-facing documentation.

    Args:
        path: Existing file or directory (validated by ``click.Path``).
            For a directory, only its direct children whose suffix is in
            ``pdf_suffixes``, ``image_suffixes`` or ``ms_office_suffixes``
            are processed.
        output_dir: Directory handed to the parsers; created if missing.
        method: Parse method passed through to ``do_parse`` /
            ``batch_do_parse`` (defaults to ``'auto'``).
        lang: Optional language hint passed through to the parsers.
        debug_able: Debug flag passed through to the parsers.
        start_page_id: First page to parse; forwarded to ``do_parse`` only.
        end_page_id: Last page to parse; forwarded to ``do_parse`` only.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Scratch directory for PDFs produced from office documents and images.
    temp_dir = tempfile.mkdtemp()

    def read_fn(src_path: Path):
        """Read `src_path` as a PDF, converting it into `temp_dir` first if needed.

        Returns whatever ``FileBasedDataReader.read`` yields for the PDF file
        (presumably its raw bytes -- confirm against the reader).

        Raises:
            Exception: If the suffix is not one of the supported suffixes.
        """
        suffix = src_path.suffix
        if suffix in ms_office_suffixes:
            convert_file_to_pdf(str(src_path), temp_dir)
            fn = os.path.join(temp_dir, f'{src_path.stem}.pdf')
        elif suffix in image_suffixes:
            with open(str(src_path), 'rb') as f:
                bits = f.read()
            # Close the in-memory document as soon as the PDF bytes exist.
            with fitz.open(stream=bits) as image_doc:
                pdf_bytes = image_doc.convert_to_pdf()
            fn = os.path.join(temp_dir, f'{src_path.stem}.pdf')
            with open(fn, 'wb') as f:
                f.write(pdf_bytes)
        elif suffix in pdf_suffixes:
            fn = str(src_path)
        else:
            raise Exception(f'Unknown file suffix: {src_path.suffix}')

        disk_rw = FileBasedDataReader(os.path.dirname(fn))
        return disk_rw.read(os.path.basename(fn))

    def parse_doc(doc_path: Path, dataset: Dataset | None = None):
        """Parse a single document; failures are logged, not raised.

        Args:
            doc_path: Source document; its stem names the output.
            dataset: Pre-built dataset to parse instead of reading `doc_path`.
        """
        try:
            file_name = str(Path(doc_path).stem)
            if dataset is None:
                pdf_data_or_dataset = read_fn(doc_path)
            else:
                pdf_data_or_dataset = dataset
            do_parse(
                output_dir,
                file_name,
                pdf_data_or_dataset,
                [],
                method,
                debug_able,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                lang=lang
            )

        except Exception as e:
            # Best-effort CLI: report the failure with its traceback and return.
            logger.exception(e)

    try:
        if os.path.isdir(path):
            # Build the accepted-suffix list once instead of on every iteration.
            supported_suffixes = pdf_suffixes + image_suffixes + ms_office_suffixes
            doc_paths = [
                doc_path
                for doc_path in Path(path).glob('*')
                if doc_path.suffix in supported_suffixes
            ]
            if doc_paths:
                # NOTE(review): start_page_id / end_page_id are not forwarded on
                # this batch path -- confirm whether batch_do_parse accepts them.
                datasets = batch_build_dataset(doc_paths, 4, lang)
                batch_do_parse(
                    output_dir,
                    [str(doc_path.stem) for doc_path in doc_paths],
                    datasets,
                    method,
                    debug_able,
                    lang=lang,
                )
            else:
                # Nothing to do; avoid driving the batch pipeline with no input.
                logger.warning(f'No supported documents found in directory: {path}')
        else:
            parse_doc(Path(path))
    finally:
        # Always remove the scratch directory, even when batch parsing raises.
        shutil.rmtree(temp_dir)

icecraft's avatar
icecraft committed
148

149
# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    cli()