test_common.py 1.46 KB
Newer Older
icecraft's avatar
icecraft committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import tempfile
import os
import shutil

import pytest

from magic_pdf.tools.common import do_parse


@pytest.mark.parametrize("method", ["auto", "txt", "ocr"])
def test_common_do_parse(method):
    """Run ``do_parse`` on the sample PDF and sanity-check what it writes.

    The test feeds the bytes of ``cli_test_01.pdf`` to ``do_parse`` with
    content-list dumping enabled, then verifies that every expected output
    file under ``<output>/<filename>/<method>`` exceeds a minimum size and
    that an ``images`` directory was created next to them.

    Args:
        method: Parse method forwarded to ``do_parse``; parametrized over
            ``"auto"``, ``"txt"`` and ``"ocr"``.

    Note:
        The PDF asset is opened via a relative path, so the test has to be
        started from a working directory in which
        ``tests/test_tools/assets/common/`` resolves.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "fake"
    os.makedirs(unitest_dir, exist_ok=True)

    # A fresh directory per parametrized run keeps the runs isolated.
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # run
        with open("tests/test_tools/assets/common/cli_test_01.pdf", "rb") as f:
            bits = f.read()
        do_parse(temp_output_dir, filename, bits, [], method, f_dump_content_list=True)

        # check
        base_output_dir = os.path.join(temp_output_dir, filename, method)

        # Lower bounds (in bytes) that each generated file must exceed.
        min_sizes = {
            "content_list.json": 5000,
            f"{filename}.md": 7000,
            "middle.json": 200000,
            "model.json": 15000,
            "origin.pdf": 500000,
            "layout.pdf": 500000,
            "spans.pdf": 500000,
        }
        for name, min_size in min_sizes.items():
            # os.stat raises FileNotFoundError when the file is missing,
            # which fails the test just like the size assertion would.
            size = os.stat(os.path.join(base_output_dir, name)).st_size
            assert size > min_size, (
                f"{name} is {size} bytes, expected more than {min_size}"
            )

        # isdir() is False for a missing path too, so this single assertion
        # covers both existence and directory-ness.
        images_dir = os.path.join(base_output_dir, "images")
        assert os.path.isdir(images_dir), f"missing images directory: {images_dir}"
    finally:
        # teardown: always remove the temp dir, even when a check fails,
        # so failing runs do not pile up output under /tmp.
        shutil.rmtree(temp_output_dir)