# Tests for the magic_pdf.tools.cli_dev command-line entry point.
import os
import shutil
import tempfile

from click.testing import CliRunner

from magic_pdf.tools import cli_dev


def test_cli_pdf():
    """Run the ``pdf`` sub-command of ``cli_dev.cli`` and check its outputs.

    The command is invoked through click's ``CliRunner`` with a sample PDF,
    its model JSON and a fresh temporary output directory.  The test passes
    when the command exits with code 0 and every expected artifact under
    ``<output>/cli_test_01/auto`` exists and exceeds a minimum size.
    """
    # setup
    unitest_dir = '/tmp/magic_pdf/unittest/tools'
    filename = 'cli_test_01'
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    # (file-name suffix, size in bytes the file must exceed)
    expected_outputs = (
        ('_content_list.json', 5000),
        ('.md', 7000),
        ('_middle.json', 200000),
        ('_model.json', 15000),
        ('_origin.pdf', 400000),
        ('_layout.pdf', 400000),
        ('_spans.pdf', 400000),
    )

    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli_dev.cli,
            [
                'pdf',
                '-p',
                'tests/unittest/test_tools/assets/cli/pdf/cli_test_01.pdf',
                '-j',
                'tests/unittest/test_tools/assets/cli_dev/cli_test_01.model.json',
                '-o',
                temp_output_dir,
            ],
        )

        # check
        # Surface the captured CLI output when the command fails.
        assert result.exit_code == 0, result.output

        base_output_dir = os.path.join(temp_output_dir, filename, 'auto')

        for suffix, min_size in expected_outputs:
            path = os.path.join(base_output_dir, f'{filename}{suffix}')
            # os.stat raises if the file is missing, which also fails the test.
            size = os.stat(path).st_size
            assert size > min_size, (
                f'{path} is {size} bytes, expected more than {min_size}'
            )

        # isdir() is False for a missing path, so it covers existence too.
        assert os.path.isdir(os.path.join(base_output_dir, 'images'))
    finally:
        # teardown: runs even when an assertion above fails, so the
        # temporary directory is never left behind.
        shutil.rmtree(temp_output_dir)


def test_cli_jsonl():
    """Run the ``jsonl`` sub-command of ``cli_dev.cli`` and check its outputs.

    ``cli_dev.read_s3_path`` is replaced for the duration of the test by a
    stub that reads the given path from the local filesystem, and is put
    back afterwards.  The test passes when the command exits with code 0 and
    every expected artifact under ``<output>/cli_test_01/auto`` exists and
    exceeds a minimum size.
    """
    # setup
    unitest_dir = '/tmp/magic_pdf/unittest/tools'
    filename = 'cli_test_01'
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    # (file-name suffix, size in bytes the file must exceed)
    expected_outputs = (
        ('_content_list.json', 5000),
        ('.md', 7000),
        ('_middle.json', 200000),
        ('_model.json', 15000),
        ('_origin.pdf', 400000),
        ('_layout.pdf', 400000),
        ('_spans.pdf', 400000),
    )

    def mock_read_s3_path(s3path):
        # Treat the "s3" path as a local file and return its raw bytes.
        with open(s3path, 'rb') as f:
            return f.read()

    try:
        # Remember the real function so the patch does not leak into other
        # tests that run later in the same process.
        original_read_s3_path = cli_dev.read_s3_path
        cli_dev.read_s3_path = mock_read_s3_path  # mock

        try:
            # run
            runner = CliRunner()
            result = runner.invoke(
                cli_dev.cli,
                [
                    'jsonl',
                    '-j',
                    'tests/unittest/test_tools/assets/cli_dev/cli_test_01.jsonl',
                    '-o',
                    temp_output_dir,
                ],
            )

            # check
            # Surface the captured CLI output when the command fails.
            assert result.exit_code == 0, result.output

            base_output_dir = os.path.join(temp_output_dir, filename, 'auto')

            for suffix, min_size in expected_outputs:
                path = os.path.join(base_output_dir, f'{filename}{suffix}')
                # os.stat raises if the file is missing, which also fails
                # the test.
                size = os.stat(path).st_size
                assert size > min_size, (
                    f'{path} is {size} bytes, expected more than {min_size}'
                )

            # isdir() is False for a missing path, so it covers existence too.
            assert os.path.isdir(os.path.join(base_output_dir, 'images'))
        finally:
            # Undo the monkey-patch even when an assertion above fails.
            cli_dev.read_s3_path = original_read_s3_path
    finally:
        # teardown: always remove the temporary directory, including when
        # setting up the patch itself fails.
        shutil.rmtree(temp_output_dir)