test_common.py 1.71 KB
Newer Older
icecraft's avatar
icecraft committed
1
2
import os
import shutil
3
import tempfile
icecraft's avatar
icecraft committed
4
5
6

import pytest

7
import magic_pdf.model as model_config
icecraft's avatar
icecraft committed
8
9
10
from magic_pdf.tools.common import do_parse


11
@pytest.mark.parametrize('method', ['auto', 'txt', 'ocr'])
def test_common_do_parse(method):
    """Run ``do_parse`` on the sample PDF and check the files it writes.

    For each parse ``method`` the test parses
    ``tests/test_tools/assets/common/cli_test_01.pdf`` into a fresh temporary
    directory, then verifies that every expected output file exists and is
    larger than a minimum size, and that an ``images`` directory was created.
    The temporary directory is removed even when an assertion fails.

    Args:
        method: Parse method forwarded to ``do_parse``; supplied by the
            ``parametrize`` decorator ('auto', 'txt' or 'ocr').
    """
    # setup
    # NOTE(review): this module-level flag is set and never restored, so it
    # stays enabled for any test that runs afterwards -- confirm that is
    # intended.
    model_config.__use_inside_model__ = True
    unitest_dir = '/tmp/magic_pdf/unittest/tools'
    filename = 'fake'
    os.makedirs(unitest_dir, exist_ok=True)

    # mkdtemp creates the directory itself, so no further makedirs is needed.
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    # Output file name (relative to the per-method output directory) mapped
    # to the minimum size in bytes that the file must exceed.
    min_sizes = {
        f'{filename}_content_list.json': 5000,
        f'{filename}.md': 7000,
        f'{filename}_middle.json': 200000,
        f'{filename}_model.json': 15000,
        f'{filename}_origin.pdf': 500000,
        f'{filename}_layout.pdf': 500000,
        f'{filename}_spans.pdf': 500000,
    }

    try:
        # run
        with open('tests/test_tools/assets/common/cli_test_01.pdf', 'rb') as f:
            bits = f.read()
        do_parse(temp_output_dir,
                 filename,
                 bits, [],
                 method,
                 f_dump_content_list=True)

        # check
        base_output_dir = os.path.join(temp_output_dir, filename, method)

        for name, min_size in min_sizes.items():
            # os.stat raises FileNotFoundError when the file is missing,
            # which fails the test with the offending path in the traceback.
            size = os.stat(os.path.join(base_output_dir, name)).st_size
            assert size > min_size, (
                f'{name} is {size} bytes, expected more than {min_size}')

        # The original called exists()/isdir() without asserting on the
        # result, so the check was a no-op; isdir() implies exists().
        images_dir = os.path.join(base_output_dir, 'images')
        assert os.path.isdir(images_dir), f'{images_dir} is not a directory'
    finally:
        # teardown: always clean up so failed runs do not leak temp dirs
        shutil.rmtree(temp_output_dir)