test_cli.py 3.22 KB
Newer Older
quyuan's avatar
quyuan committed
1
2
3
import pytest
import os
from conf import conf
quyuan's avatar
add ci  
quyuan committed
4
5
6
7
import os
import json
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
quyuan's avatar
add ci  
quyuan committed
8
9
from lib import common

quyuan's avatar
quyuan committed
10
11
12
13
# Values looked up from the shared ``conf.conf`` mapping; the key names
# suggest filesystem locations used by the tests below.
pdf_res_path, code_path, pdf_dev_path = (
    conf.conf[key] for key in ("pdf_res_path", "code_path", "pdf_dev_path")
)
class TestCli:
    """Test cases that drive the PDF parsing pipeline."""

    def test_pdf_sdk(self):
        """Parse every PDF under ``<pdf_dev_path>/pdf`` through the SDK.

        For each ``<name>.pdf`` found there, the matching
        ``<pdf_dev_path>/<name>_model.json`` is loaded and passed to
        ``UNIPipe`` as ``model_list``. The pipe is run through classify,
        parse and markdown generation, and the resulting markdown is written
        to ``<pdf_dev_path>/mineru/<name>.md``. That output path is then
        handed to ``common.count_folders_and_check_contents``.
        """
        pdf_dir = os.path.join(pdf_dev_path, "pdf")
        # splitext keeps dots that are part of the stem ("a.b.pdf" -> "a.b");
        # split(".")[0] would truncate it to "a" and break the paths below.
        # Sorting makes the processing order independent of os.listdir order.
        demo_names = sorted(
            os.path.splitext(pdf_file)[0]
            for pdf_file in os.listdir(pdf_dir)
            if pdf_file.endswith('.pdf')
        )

        # Loop-invariant: image directory name passed to the markdown step.
        image_dir = str(os.path.basename(pdf_dev_path))
        out_dir = os.path.join(pdf_dev_path, "mineru")

        for demo_name in demo_names:
            model_path = os.path.join(pdf_dev_path, f"{demo_name}_model.json")
            pdf_path = os.path.join(pdf_dir, f"{demo_name}.pdf")

            # Context managers guarantee the handles are closed even if
            # reading or JSON decoding fails.
            with open(pdf_path, "rb") as pdf_file:
                pdf_bytes = pdf_file.read()
            with open(model_path, "r", encoding="utf-8") as model_file:
                model_json = json.load(model_file)

            image_writer = DiskReaderWriter(pdf_dev_path)
            jso_useful_key = {"_pdf_type": "", "model_list": model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
            pipe.pipe_classify()
            pipe.pipe_parse()
            md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")

            # exist_ok=True already tolerates an existing directory, so no
            # separate existence check is needed.
            os.makedirs(out_dir, exist_ok=True)
            res_path = os.path.join(out_dir, f"{demo_name}.md")
            with open(res_path, "w+", encoding="utf-8") as f:
                f.write(md_content)
            common.count_folders_and_check_contents(res_path)
quyuan's avatar
add ci  
quyuan committed
45
        
myhloli's avatar
myhloli committed
46
47
48
49
50
51
52
53
    # def test_pdf_specify_jsonl(self):
    #     """
    #     输入jsonl, 默认方式解析
    #     """
    #     cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py  json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972'" % (code_path)
    #     logging.info(cmd)
    #     common.check_shell(cmd)
    #     #common.count_folders_and_check_contents(pdf_res_path)
quyuan's avatar
quyuan committed
54

myhloli's avatar
myhloli committed
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
    # def test_pdf_specify_jsonl_txt(self):
    #     """
    #     输入jsonl, txt方式解析
    #     """
    #     cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py  json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method txt" % (code_path)
    #     logging.info(cmd)
    #     common.check_shell(cmd)
    #     #common.count_folders_and_check_contents(pdf_res_path)
    #
    # def test_pdf_specify_jsonl_ocr(self):
    #     """
    #     输入jsonl, ocr方式解析
    #     """
    #     cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py  json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method ocr" % (code_path)
    #     logging.info(cmd)
    #     common.check_shell(cmd)
    #     #common.count_folders_and_check_contents(pdf_res_path)
quyuan's avatar
quyuan committed
72
 
quyuan's avatar
quyuan committed
73
74
75
 
# Invoke pytest's entry point when this file is executed directly as a script.
if __name__ == "__main__":
    pytest.main()