import tempfile
import os
import shutil

import pytest

from magic_pdf.tools.common import do_parse


@pytest.mark.parametrize("method", ["auto", "txt", "ocr"])
def test_common_do_parse(method):
    """Run ``do_parse`` on a sample PDF and check the artifacts it writes.

    The sample PDF is parsed into a fresh temporary directory once for each
    parametrized ``method``. The test then verifies that every expected
    output file exists under ``<tmp>/<filename>/<method>`` and exceeds a
    minimum size, and that an ``images`` directory was created.

    Args:
        method: Parse method forwarded to ``do_parse``; also the name of the
            sub-directory in which the outputs are expected.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "fake"
    os.makedirs(unitest_dir, exist_ok=True)

    # Each parametrized run gets its own directory so runs cannot see each
    # other's outputs.
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # run
        with open("tests/test_tools/assets/common/cli_test_01.pdf", "rb") as f:
            bits = f.read()

        # NOTE(review): the meaning of the positional `[]` and `False`
        # arguments is defined by do_parse's signature, which is not visible
        # here -- confirm against magic_pdf.tools.common.
        do_parse(temp_output_dir,
                 filename,
                 bits, [],
                 method,
                 False,
                 f_dump_content_list=True)

        # check
        base_output_dir = os.path.join(temp_output_dir, filename, method)

        # Output file name -> size in bytes that the file must exceed.
        min_sizes = {
            "content_list.json": 5000,
            f"{filename}.md": 7000,
            "middle.json": 200000,
            "model.json": 15000,
            "origin.pdf": 500000,
            "layout.pdf": 500000,
            "spans.pdf": 500000,
        }
        for name, min_size in min_sizes.items():
            # os.stat raises FileNotFoundError when the output is missing,
            # which fails the test just like a too-small file does.
            size = os.stat(os.path.join(base_output_dir, name)).st_size
            assert size > min_size, (
                f"{name} is {size} bytes, expected more than {min_size}")

        # The original code computed these checks but discarded the results;
        # isdir() is only true for an existing directory, so one assert
        # covers both the exists() and isdir() checks.
        images_dir = os.path.join(base_output_dir, "images")
        assert os.path.isdir(images_dir), f"{images_dir} is not a directory"
    finally:
        # teardown: always remove the scratch directory, even on failure
        shutil.rmtree(temp_output_dir)