# test_api.py (1.64 KB) -- committed by drunkpig
import json
import os
import shutil
import tempfile

from magic_pdf.integrations.rag.api import DataReader, RagDocumentReader
from magic_pdf.integrations.rag.type import CategoryType
from magic_pdf.integrations.rag.utils import \
    convert_middle_json_to_layout_elements


def test_rag_document_reader():
    """Check RagDocumentReader against the sample ``middle.json`` fixture.

    The fixture is loaded, converted with
    ``convert_middle_json_to_layout_elements`` and wrapped in a
    ``RagDocumentReader``. The test then asserts that the document yields
    one page, that the page yields ten items, that the page's relation map
    has three entries, and that the first item's category is
    ``CategoryType.text``.
    """
    # setup
    unitest_dir = '/tmp/magic_pdf/unittest/integrations/rag'
    os.makedirs(unitest_dir, exist_ok=True)
    # mkdtemp creates the directory itself, so no extra makedirs is needed.
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # test
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open('tests/test_integrations/test_rag/assets/middle.json',
                  encoding='utf-8') as f:
            json_data = json.load(f)
        res = convert_middle_json_to_layout_elements(json_data,
                                                     temp_output_dir)

        doc = RagDocumentReader(res)
        pages = list(doc)
        assert len(pages) == 1

        page = pages[0]
        items = list(page)
        assert len(items) == 10
        assert len(page.get_rel_map()) == 3

        assert items[0].category_type == CategoryType.text
    finally:
        # teardown: runs even when an assertion above fails, so the scratch
        # directory is not left behind under unitest_dir.
        shutil.rmtree(temp_output_dir)


def test_data_reader():
    """Check DataReader over the RAG test assets directory.

    A ``DataReader`` is built from the assets directory, the ``'ocr'``
    argument and a scratch output directory. The test asserts that it
    reports two documents and that ``get_document_result`` returns a
    non-None value for every document index.
    """
    # setup
    unitest_dir = '/tmp/magic_pdf/unittest/integrations/rag'
    os.makedirs(unitest_dir, exist_ok=True)
    # mkdtemp creates the directory itself, so no extra makedirs is needed.
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # test
        data_reader = DataReader('tests/test_integrations/test_rag/assets',
                                 'ocr', temp_output_dir)

        documents_count = data_reader.get_documents_count()
        assert documents_count == 2
        for idx in range(documents_count):
            document = data_reader.get_document_result(idx)
            assert document is not None
    finally:
        # teardown: runs even when an assertion above fails, so the scratch
        # directory is not left behind under unitest_dir.
        shutil.rmtree(temp_output_dir)