test_pdf2text_recogPara_TitleProcessor.py 2.02 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
4
import json
import unittest

from utils_for_test_para import UtilsForTestPara
赵小蒙's avatar
赵小蒙 committed
5
from magic_pdf.post_proc.detect_para import TitleProcessor
赵小蒙's avatar
赵小蒙 committed
6
7
8
9
10
11

# from ... pdf2text_recogPara import * # another way to import

"""
Run this test module from the `code-clean` directory with either:

    python -m tests.test_para.test_pdf2text_recogPara_TitleProcessor

or

    pytest -v -s app/pdf_toolbox/tests/test_para/test_pdf2text_recogPara_TitleProcessor.py
"""


class TestTitleProcessor(unittest.TestCase):
    """Smoke tests that run ``TitleProcessor`` over the preprocessed JSON fixtures.

    Each test loads every fixture path returned by
    ``UtilsForTestPara.read_preproc_out_jfiles()``, resets its ``"statistics"``
    entry to an empty dict and feeds the resulting dict to one batch method of
    ``TitleProcessor``. A test passes when that call does not raise.

    NOTE(review): the return value of the batch methods is not inspected.
    Add assertions on the ``page_*`` entries once the expected output
    structure is pinned down.
    """

    def setUp(self):
        """Create the processor under test and collect the fixture file paths."""
        self.title_processor = TitleProcessor()
        self.utils = UtilsForTestPara()
        self.preproc_out_jsons = self.utils.read_preproc_out_jfiles()

    @staticmethod
    def _load_preproc_dict(json_path):
        """Load one fixture file and reset its ``"statistics"`` entry.

        Args:
            json_path: Path of a UTF-8 encoded JSON fixture file.

        Returns:
            The decoded JSON object with ``"statistics"`` set to ``{}``.
        """
        # Close the file as soon as it is parsed; the processor only needs
        # the decoded data, not the open handle.
        with open(json_path, "r", encoding="utf-8") as f:
            preproc_dict = json.load(f)
        preproc_dict["statistics"] = {}
        return preproc_dict

    def _run_on_each_fixture(self, process):
        """Call ``process`` once per fixture with the freshly loaded dict.

        Args:
            process: Callable taking the fixture dict as its only argument.
        """
        for json_path in self.preproc_out_jsons:
            # subTest labels a failure with the fixture that triggered it when
            # the active runner supports sub-tests; otherwise it is a no-op.
            with self.subTest(fixture=json_path):
                process(self._load_preproc_dict(json_path))

    def test_batch_process_blocks_detect_titles(self):
        """
        Run batch_detect_titles on every preprocessed output JSON
        """
        self._run_on_each_fixture(self.title_processor.batch_detect_titles)

    def test_batch_process_blocks_recog_title_level(self):
        """
        Run batch_recog_title_level on every preprocessed output JSON
        """
        self._run_on_each_fixture(self.title_processor.batch_recog_title_level)

# When the file is executed as a script (rather than imported), hand control
# to unittest's command-line runner so the tests in this module are run.
if __name__ == "__main__":
    unittest.main()