test_cli_dev.py 3.21 KB
Newer Older
yyy's avatar
yyy committed
1
import tempfile
icecraft's avatar
icecraft committed
2
3
4
5
6
7
8
9
10
import os
import shutil
from click.testing import CliRunner

from magic_pdf.tools import cli_dev


def test_cli_pdf():
    """Run the ``pdf`` sub-command of ``cli_dev.cli`` and check its outputs.

    The command is invoked through click's ``CliRunner`` with a sample PDF
    (``-p``), a model JSON file (``-j``) and a fresh temporary output
    directory (``-o``).  The test passes when the command exits with code 0,
    every expected artefact under ``<output>/cli_test_01/auto`` is larger
    than its minimum size, and an ``images`` directory was created.

    The temporary output directory is always removed, even when an
    assertion fails, so failed runs do not pile up under ``/tmp``.
    """
    # setup
    unittest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "cli_test_01"
    os.makedirs(unittest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unittest_dir)

    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli_dev.cli,
            [
                "pdf",
                "-p",
                "tests/test_tools/assets/cli/pdf/cli_test_01.pdf",
                "-j",
                "tests/test_tools/assets/cli_dev/cli_test_01.model.json",
                "-o",
                temp_output_dir,
            ],
        )

        # check
        # Attach the captured CLI output so a failure is diagnosable.
        assert result.exit_code == 0, result.output

        base_output_dir = os.path.join(temp_output_dir, filename, "auto")

        # Each artefact must be strictly larger than this many bytes.
        min_sizes = (
            ("content_list.json", 5000),
            (f"{filename}.md", 7000),
            ("middle.json", 200000),
            ("model.json", 15000),
            ("origin.pdf", 500000),
            ("layout.pdf", 500000),
            ("spans.pdf", 500000),
        )
        for artefact, min_size in min_sizes:
            # os.stat raises FileNotFoundError if the artefact is missing,
            # which fails the test just like the size check would.
            size = os.stat(os.path.join(base_output_dir, artefact)).st_size
            assert size > min_size, (
                f"{artefact}: expected more than {min_size} bytes, got {size}"
            )

        images_dir = os.path.join(base_output_dir, "images")
        assert os.path.exists(images_dir)
        assert os.path.isdir(images_dir)
    finally:
        # teardown
        shutil.rmtree(temp_output_dir)


def test_cli_jsonl():
    """Run the ``jsonl`` sub-command of ``cli_dev.cli`` and check its outputs.

    ``cli_dev.read_s3_path`` is temporarily replaced by a function that reads
    the given path from the local filesystem, so the test does not depend on
    whatever the real implementation accesses.  The replacement is undone
    when the test finishes, so other tests see the original function.

    The command is invoked through click's ``CliRunner`` with a sample JSONL
    file (``-j``) and a fresh temporary output directory (``-o``).  The test
    passes when the command exits with code 0, every expected artefact under
    ``<output>/cli_test_01/auto`` is larger than its minimum size, and an
    ``images`` directory was created.

    The temporary output directory is always removed, even when an
    assertion fails.
    """
    # Local import: the stdlib mock helper is only needed by this test.
    from unittest import mock

    # setup
    unittest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "cli_test_01"
    os.makedirs(unittest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unittest_dir)

    def mock_read_s3_path(s3path):
        """Return the raw bytes of *s3path*, treated as a local file path."""
        with open(s3path, "rb") as f:
            return f.read()

    try:
        # run
        # patch.object restores cli_dev.read_s3_path on exit; create=True
        # keeps the old "plain assignment" behaviour if the attribute is
        # absent.
        with mock.patch.object(
            cli_dev, "read_s3_path", mock_read_s3_path, create=True
        ):
            runner = CliRunner()
            result = runner.invoke(
                cli_dev.cli,
                [
                    "jsonl",
                    "-j",
                    "tests/test_tools/assets/cli_dev/cli_test_01.jsonl",
                    "-o",
                    temp_output_dir,
                ],
            )

        # check
        # Attach the captured CLI output so a failure is diagnosable.
        assert result.exit_code == 0, result.output

        base_output_dir = os.path.join(temp_output_dir, filename, "auto")

        # Each artefact must be strictly larger than this many bytes.
        min_sizes = (
            ("content_list.json", 5000),
            (f"{filename}.md", 7000),
            ("middle.json", 200000),
            ("model.json", 15000),
            ("origin.pdf", 500000),
            ("layout.pdf", 500000),
            ("spans.pdf", 500000),
        )
        for artefact, min_size in min_sizes:
            # os.stat raises FileNotFoundError if the artefact is missing,
            # which fails the test just like the size check would.
            size = os.stat(os.path.join(base_output_dir, artefact)).st_size
            assert size > min_size, (
                f"{artefact}: expected more than {min_size} bytes, got {size}"
            )

        images_dir = os.path.join(base_output_dir, "images")
        assert os.path.exists(images_dir)
        assert os.path.isdir(images_dir)
    finally:
        # teardown
        shutil.rmtree(temp_output_dir)