# batch_demo.py: demo script that batch-parses the PDFs in a directory with magic_pdf.
import os
from pathlib import Path
from magic_pdf.data.batch_build_dataset import batch_build_dataset
4
from magic_pdf.tools.common import batch_do_parse


def batch(pdf_dir, output_dir, method, lang):
    """Parse every PDF located directly inside ``pdf_dir`` in one batch.

    Args:
        pdf_dir: Directory scanned (non-recursively) for PDF files.
        output_dir: Directory handed to ``batch_do_parse``; created if it
            does not exist yet.
        method: Parse method, forwarded unchanged to ``batch_do_parse``.
        lang: Language hint, forwarded unchanged to ``batch_build_dataset``.

    Returns:
        None. Returns early, after creating ``output_dir``, when ``pdf_dir``
        contains no PDF files.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Keep regular files only (a directory named "x.pdf" is not a document),
    # accept any letter case of the extension, and sort so the processing
    # order does not depend on the filesystem's listing order.
    doc_paths = sorted(
        path
        for path in Path(pdf_dir).glob('*')
        if path.is_file() and path.suffix.lower() == '.pdf'
    )
    if not doc_paths:
        # Nothing to parse; skip building an empty dataset.
        return

    # The second argument is presumably the worker count (the earlier comment
    # spoke of "workers" but said 2 while the code passed 4) -- TODO confirm
    # against batch_build_dataset's signature.
    datasets = batch_build_dataset(doc_paths, 4, lang)

    # Optional tuning: setting
    #   os.environ["MINERU_MIN_BATCH_INFERENCE_SIZE"] = "200"
    # before this call makes every 200 pages get parsed in one batch.
    batch_do_parse(output_dir, [path.stem for path in doc_paths], datasets, method)


if __name__ == '__main__':
    # Demo invocation: read PDFs from ./pdfs and write results under ./output.
    # "auto" and the empty language string are passed through to batch() as-is.
    batch(pdf_dir="pdfs", output_dir="output", method="auto", lang="")