test_cli_sdk.py 14 KB
Newer Older
yyy's avatar
yyy committed
1
2
3
4
5
6
"""test cli and sdk."""
import logging
import os
import shutil
import subprocess

import pytest

from conf import conf
from lib import common
yyy's avatar
yyy committed
7
import time
yyy's avatar
yyy committed
8
9
10
import magic_pdf.model as model_config
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
quyuan's avatar
quyuan committed
11
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
yyy's avatar
yyy committed
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# Flag on magic_pdf.model that the SDK tests below consult: when it is true
# and no model output was supplied, they call pipe.pipe_analyze().
model_config.__use_inside_model__ = True
# Locations read from the project's conf module.
pdf_res_path = conf.conf['pdf_res_path']  # output dir of the table-mode CLI tests
code_path = conf.conf['code_path']  # not referenced in the visible part of this file
pdf_dev_path = conf.conf['pdf_dev_path']  # root holding 'pdf' inputs and 'mineru' outputs


class TestCli:
    """test cli."""

    @pytest.mark.P0
    def test_pdf_auto_sdk(self):
        """Run the SDK pipeline on every sample PDF with an empty ``_pdf_type``.

        Every ``*.pdf`` found in ``<pdf_dev_path>/pdf`` is read, passed through
        ``UNIPipe`` (classify, analyze when no model output is supplied, parse)
        and rendered to markdown.  The markdown is written to
        ``<pdf_dev_path>/mineru/<name>.md`` and that path is handed to
        ``common.sdk_count_folders_and_check_contents``.
        """
        pdf_dir = os.path.join(pdf_dev_path, 'pdf')
        # splitext keeps dots that belong to the stem ('a.b.pdf' -> 'a.b'), so
        # the path rebuilt in the loop always names the file that was listed.
        demo_names = [
            os.path.splitext(pdf_file)[0]
            for pdf_file in os.listdir(pdf_dir)
            if pdf_file.endswith('.pdf')
        ]
        for demo_name in demo_names:
            pdf_path = os.path.join(pdf_dir, f'{demo_name}.pdf')
            print(pdf_path)
            # Close the handle deterministically instead of leaking it.
            with open(pdf_path, 'rb') as fh:
                pdf_bytes = fh.read()
            local_image_dir = os.path.join(pdf_dir, 'images')
            # Only the folder name ('images') is passed to the markdown step.
            image_dir = os.path.basename(local_image_dir)
            image_writer = DiskReaderWriter(local_image_dir)
            model_json = []
            jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
            pipe.pipe_classify()
            if not model_json:
                if not model_config.__use_inside_model__:
                    # Report an explicit reason rather than raising a bare
                    # SystemExit through exit(1).
                    pytest.fail('no model output supplied and the inside '
                                'model is disabled')
                pipe.pipe_analyze()
            pipe.pipe_parse()
            md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
            dir_path = os.path.join(pdf_dev_path, 'mineru')
            os.makedirs(dir_path, exist_ok=True)
            res_path = os.path.join(dir_path, f'{demo_name}.md')
            common.delete_file(res_path)
            with open(res_path, 'w+', encoding='utf-8') as f:
                f.write(md_content)
            common.sdk_count_folders_and_check_contents(res_path)

    @pytest.mark.P0
    def test_pdf_ocr_sdk(self):
        """Run the SDK pipeline on every sample PDF with ``_pdf_type`` 'ocr'.

        Every ``*.pdf`` found in ``<pdf_dev_path>/pdf`` is read, passed through
        ``UNIPipe`` (classify, analyze when no model output is supplied, parse)
        and rendered to markdown.  The markdown is written to
        ``<pdf_dev_path>/mineru/<name>.md`` and that path is handed to
        ``common.sdk_count_folders_and_check_contents``.
        """
        time.sleep(2)
        pdf_dir = os.path.join(pdf_dev_path, 'pdf')
        # splitext keeps dots that belong to the stem ('a.b.pdf' -> 'a.b'), so
        # the path rebuilt in the loop always names the file that was listed.
        demo_names = [
            os.path.splitext(pdf_file)[0]
            for pdf_file in os.listdir(pdf_dir)
            if pdf_file.endswith('.pdf')
        ]
        for demo_name in demo_names:
            pdf_path = os.path.join(pdf_dir, f'{demo_name}.pdf')
            print(pdf_path)
            # Close the handle deterministically instead of leaking it.
            with open(pdf_path, 'rb') as fh:
                pdf_bytes = fh.read()
            local_image_dir = os.path.join(pdf_dir, 'images')
            # Only the folder name ('images') is passed to the markdown step.
            image_dir = os.path.basename(local_image_dir)
            image_writer = DiskReaderWriter(local_image_dir)
            model_json = []
            jso_useful_key = {'_pdf_type': 'ocr', 'model_list': model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
            pipe.pipe_classify()
            if not model_json:
                if not model_config.__use_inside_model__:
                    # Report an explicit reason rather than raising a bare
                    # SystemExit through exit(1).
                    pytest.fail('no model output supplied and the inside '
                                'model is disabled')
                pipe.pipe_analyze()
            pipe.pipe_parse()
            md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
            dir_path = os.path.join(pdf_dev_path, 'mineru')
            os.makedirs(dir_path, exist_ok=True)
            res_path = os.path.join(dir_path, f'{demo_name}.md')
            common.delete_file(res_path)
            with open(res_path, 'w+', encoding='utf-8') as f:
                f.write(md_content)
            common.sdk_count_folders_and_check_contents(res_path)
yyy's avatar
yyy committed
91
    
yyy's avatar
yyy committed
92
93
94
    @pytest.mark.P0
    def test_pdf_txt_sdk(self):
        """Run the SDK pipeline on every sample PDF with ``_pdf_type`` 'txt'.

        Every ``*.pdf`` found in ``<pdf_dev_path>/pdf`` is read, passed through
        ``UNIPipe`` (classify, analyze when no model output is supplied, parse)
        and rendered to markdown.  The markdown is written to
        ``<pdf_dev_path>/mineru/<name>.md`` and that path is handed to
        ``common.sdk_count_folders_and_check_contents``.
        """
        time.sleep(2)
        pdf_dir = os.path.join(pdf_dev_path, 'pdf')
        # splitext keeps dots that belong to the stem ('a.b.pdf' -> 'a.b'), so
        # the path rebuilt in the loop always names the file that was listed.
        demo_names = [
            os.path.splitext(pdf_file)[0]
            for pdf_file in os.listdir(pdf_dir)
            if pdf_file.endswith('.pdf')
        ]
        for demo_name in demo_names:
            pdf_path = os.path.join(pdf_dir, f'{demo_name}.pdf')
            # Close the handle deterministically instead of leaking it.
            with open(pdf_path, 'rb') as fh:
                pdf_bytes = fh.read()
            local_image_dir = os.path.join(pdf_dir, 'images')
            # Only the folder name ('images') is passed to the markdown step.
            image_dir = os.path.basename(local_image_dir)
            image_writer = DiskReaderWriter(local_image_dir)
            model_json = []
            jso_useful_key = {'_pdf_type': 'txt', 'model_list': model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
            pipe.pipe_classify()
            if not model_json:
                if not model_config.__use_inside_model__:
                    # Report an explicit reason rather than raising a bare
                    # SystemExit through exit(1).
                    pytest.fail('no model output supplied and the inside '
                                'model is disabled')
                pipe.pipe_analyze()
            pipe.pipe_parse()
            md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')
            dir_path = os.path.join(pdf_dev_path, 'mineru')
            os.makedirs(dir_path, exist_ok=True)
            res_path = os.path.join(dir_path, f'{demo_name}.md')
            common.delete_file(res_path)
            with open(res_path, 'w+', encoding='utf-8') as f:
                f.write(md_content)
            common.sdk_count_folders_and_check_contents(res_path)
yyy's avatar
yyy committed
126
    
yyy's avatar
yyy committed
127
128
129
    @pytest.mark.P0
    def test_pdf_cli_auto(self):
        """Run the ``magic-pdf`` CLI with ``-m auto`` on every sample PDF.

        For each ``*.pdf`` in ``<pdf_dev_path>/pdf`` the output directory
        ``<pdf_dev_path>/mineru`` is cleared through ``common.delete_file``,
        the CLI is invoked, its exit status must be zero, and
        ``<output>/<name>/auto`` is handed to
        ``common.cli_count_folders_and_check_contents``.
        """
        time.sleep(2)
        pdf_dir = os.path.join(pdf_dev_path, 'pdf')
        # splitext keeps dots that belong to the stem ('a.b.pdf' -> 'a.b').
        demo_names = [
            os.path.splitext(pdf_file)[0]
            for pdf_file in os.listdir(pdf_dir)
            if pdf_file.endswith('.pdf')
        ]
        res_path = os.path.join(pdf_dev_path, 'mineru')
        for demo_name in demo_names:
            common.delete_file(res_path)
            # An argument list (no shell) keeps paths with spaces or shell
            # metacharacters intact.
            cmd = ['magic-pdf',
                   '-p', os.path.join(pdf_dir, f'{demo_name}.pdf'),
                   '-o', res_path,
                   '-m', 'auto']
            logging.info(' '.join(cmd))
            completed = subprocess.run(cmd)
            assert completed.returncode == 0, (
                f'magic-pdf exited with status {completed.returncode}')
            common.cli_count_folders_and_check_contents(
                os.path.join(res_path, demo_name, 'auto'))
yyy's avatar
yyy committed
145
   
yyy's avatar
yyy committed
146
    @pytest.mark.P0
    def test_pdf_cli_txt(self):
        """Run the ``magic-pdf`` CLI with ``-m txt`` on every sample PDF.

        For each ``*.pdf`` in ``<pdf_dev_path>/pdf`` the output directory
        ``<pdf_dev_path>/mineru`` is cleared through ``common.delete_file``,
        the CLI is invoked, its exit status must be zero, and
        ``<output>/<name>/txt`` is handed to
        ``common.cli_count_folders_and_check_contents``.
        """
        time.sleep(2)
        pdf_dir = os.path.join(pdf_dev_path, 'pdf')
        # splitext keeps dots that belong to the stem ('a.b.pdf' -> 'a.b').
        demo_names = [
            os.path.splitext(pdf_file)[0]
            for pdf_file in os.listdir(pdf_dir)
            if pdf_file.endswith('.pdf')
        ]
        res_path = os.path.join(pdf_dev_path, 'mineru')
        for demo_name in demo_names:
            common.delete_file(res_path)
            # An argument list (no shell) keeps paths with spaces or shell
            # metacharacters intact.
            cmd = ['magic-pdf',
                   '-p', os.path.join(pdf_dir, f'{demo_name}.pdf'),
                   '-o', res_path,
                   '-m', 'txt']
            logging.info(' '.join(cmd))
            completed = subprocess.run(cmd)
            assert completed.returncode == 0, (
                f'magic-pdf exited with status {completed.returncode}')
            common.cli_count_folders_and_check_contents(
                os.path.join(res_path, demo_name, 'txt'))
yyy's avatar
yyy committed
164
   
yyy's avatar
yyy committed
165
    @pytest.mark.P0
    def test_pdf_cli_ocr(self):
        """Run the ``magic-pdf`` CLI with ``-m ocr`` on every sample PDF.

        For each ``*.pdf`` in ``<pdf_dev_path>/pdf`` the output directory
        ``<pdf_dev_path>/mineru`` is cleared through ``common.delete_file``,
        the CLI is invoked, its exit status must be zero, and
        ``<output>/<name>/ocr`` is handed to
        ``common.cli_count_folders_and_check_contents``.
        """
        time.sleep(2)
        pdf_dir = os.path.join(pdf_dev_path, 'pdf')
        # splitext keeps dots that belong to the stem ('a.b.pdf' -> 'a.b').
        demo_names = [
            os.path.splitext(pdf_file)[0]
            for pdf_file in os.listdir(pdf_dir)
            if pdf_file.endswith('.pdf')
        ]
        res_path = os.path.join(pdf_dev_path, 'mineru')
        for demo_name in demo_names:
            common.delete_file(res_path)
            # An argument list (no shell) keeps paths with spaces or shell
            # metacharacters intact.
            cmd = ['magic-pdf',
                   '-p', os.path.join(pdf_dir, f'{demo_name}.pdf'),
                   '-o', res_path,
                   '-m', 'ocr']
            logging.info(' '.join(cmd))
            completed = subprocess.run(cmd)
            assert completed.returncode == 0, (
                f'magic-pdf exited with status {completed.returncode}')
            common.cli_count_folders_and_check_contents(
                os.path.join(res_path, demo_name, 'ocr'))
yyy's avatar
yyy committed
183
184
    
    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_local_jsonl_txt(self):
        """Run ``magic-pdf-dev --jsonl <pdf_dev_path>/line1.jsonl --method txt``.

        The test requires the command to exit with status zero.
        """
        time.sleep(2)
        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
        # An argument list (no shell) keeps the path intact if it has spaces.
        cmd = ['magic-pdf-dev', '--jsonl', jsonl_path, '--method', 'txt']
        logging.info(' '.join(cmd))
        completed = subprocess.run(cmd)
        # Without this check the test could never fail.
        assert completed.returncode == 0, (
            f'magic-pdf-dev exited with status {completed.returncode}')

yyy's avatar
yyy committed
194
    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_local_jsonl_ocr(self):
        """Run ``magic-pdf-dev --jsonl <pdf_dev_path>/line1.jsonl --method ocr``.

        The test requires the command to exit with status zero.
        """
        time.sleep(2)
        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
        # An argument list (no shell) keeps the path intact if it has spaces.
        cmd = ['magic-pdf-dev', '--jsonl', jsonl_path, '--method', 'ocr']
        logging.info(' '.join(cmd))
        completed = subprocess.run(cmd)
        # Without this check the test could never fail.
        assert completed.returncode == 0, (
            f'magic-pdf-dev exited with status {completed.returncode}')

yyy's avatar
yyy committed
204
    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_local_jsonl_auto(self):
        """Run ``magic-pdf-dev --jsonl <pdf_dev_path>/line1.jsonl --method auto``.

        The test requires the command to exit with status zero.
        """
        time.sleep(2)
        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
        # An argument list (no shell) keeps the path intact if it has spaces.
        cmd = ['magic-pdf-dev', '--jsonl', jsonl_path, '--method', 'auto']
        logging.info(' '.join(cmd))
        completed = subprocess.run(cmd)
        # Without this check the test could never fail.
        assert completed.returncode == 0, (
            f'magic-pdf-dev exited with status {completed.returncode}')
yyy's avatar
yyy committed
213
214
    
    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_s3_jsonl_txt(self):
        """Run ``magic-pdf-dev --jsonl <pdf_dev_path>/line1.jsonl --method txt``.

        The test requires the command to exit with status zero.
        """
        time.sleep(2)
        # NOTE(review): this is the same local line1.jsonl the local-jsonl
        # tests use; confirm whether an S3-backed jsonl was intended here.
        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
        # An argument list (no shell) keeps the path intact if it has spaces.
        cmd = ['magic-pdf-dev', '--jsonl', jsonl_path, '--method', 'txt']
        logging.info(' '.join(cmd))
        completed = subprocess.run(cmd)
        # Without this check the test could never fail.
        assert completed.returncode == 0, (
            f'magic-pdf-dev exited with status {completed.returncode}')

yyy's avatar
yyy committed
224
    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_s3_jsonl_ocr(self):
        """Run ``magic-pdf-dev --jsonl <pdf_dev_path>/line1.jsonl --method ocr``.

        The test requires the command to exit with status zero.
        """
        time.sleep(2)
        # NOTE(review): this is the same local line1.jsonl the local-jsonl
        # tests use; confirm whether an S3-backed jsonl was intended here.
        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
        # An argument list (no shell) keeps the path intact if it has spaces.
        cmd = ['magic-pdf-dev', '--jsonl', jsonl_path, '--method', 'ocr']
        logging.info(' '.join(cmd))
        completed = subprocess.run(cmd)
        # Without this check the test could never fail.
        assert completed.returncode == 0, (
            f'magic-pdf-dev exited with status {completed.returncode}')

yyy's avatar
yyy committed
234
    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_s3_jsonl_auto(self):
        """Run ``magic-pdf-dev --jsonl <pdf_dev_path>/line1.jsonl --method auto``.

        The test requires the command to exit with status zero.
        """
        time.sleep(2)
        # NOTE(review): this is the same local line1.jsonl the local-jsonl
        # tests use; confirm whether an S3-backed jsonl was intended here.
        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
        # An argument list (no shell) keeps the path intact if it has spaces.
        cmd = ['magic-pdf-dev', '--jsonl', jsonl_path, '--method', 'auto']
        logging.info(' '.join(cmd))
        completed = subprocess.run(cmd)
        # Without this check the test could never fail.
        assert completed.returncode == 0, (
            f'magic-pdf-dev exited with status {completed.returncode}')

quyuan's avatar
quyuan committed
244
245
246
    @pytest.mark.P1
    def test_pdf_dev_cli_pdf_json_auto(self):
        """Run ``magic-pdf-dev`` on a PDF plus model JSON with ``--method auto``.

        Inputs are ``<pdf_dev_path>/pdf/test_rearch_report.pdf`` and
        ``<pdf_dev_path>/test_model.json``.  The test requires the command to
        exit with status zero.
        """
        time.sleep(2)
        json_path = os.path.join(pdf_dev_path, 'test_model.json')
        pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
        # An argument list (no shell) keeps the paths intact if they have spaces.
        cmd = ['magic-pdf-dev', '--pdf', pdf_path, '--json', json_path,
               '--method', 'auto']
        logging.info(' '.join(cmd))
        completed = subprocess.run(cmd)
        # Without this check the test could never fail.
        assert completed.returncode == 0, (
            f'magic-pdf-dev exited with status {completed.returncode}')
yyy's avatar
yyy committed
253
254
   
    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_pdf_json_ocr(self):
        """Run ``magic-pdf-dev`` on a PDF plus model JSON with ``--method ocr``.

        Inputs are ``<pdf_dev_path>/pdf/test_rearch_report.pdf`` and
        ``<pdf_dev_path>/test_model.json``.  The test requires the command to
        exit with status zero.
        """
        time.sleep(2)
        json_path = os.path.join(pdf_dev_path, 'test_model.json')
        pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
        # The method is 'ocr' to match this test's name and docstring; the
        # previous code passed 'auto', duplicating the auto test.
        cmd = ['magic-pdf-dev', '--pdf', pdf_path, '--json', json_path,
               '--method', 'ocr']
        logging.info(' '.join(cmd))
        completed = subprocess.run(cmd)
        # Without this check the test could never fail.
        assert completed.returncode == 0, (
            f'magic-pdf-dev exited with status {completed.returncode}')
yyy's avatar
yyy committed
264
    
quyuan's avatar
quyuan committed
265
266
    @pytest.mark.P1
    def test_s3_sdk_suto(self):
        """Run the SDK pipeline on a PDF read from S3 with an empty ``_pdf_type``.

        Credentials, bucket and endpoint come from the environment variables
        ``pdf_ak``, ``pdf_sk``, ``bucket`` and ``pdf_endpoint``; the PDF
        location comes from ``conf.conf["s3_pdf_path"]``.  Images are written
        under ``s3://<bucket>/mineru/test/output`` and the test asserts that
        the generated markdown is non-empty.
        """
        time.sleep(2)
        # Credentials are read but never printed, so they cannot end up in
        # captured test output or CI logs.
        pdf_ak = os.getenv('pdf_ak')
        pdf_sk = os.environ.get('pdf_sk', "")
        pdf_bucket = os.environ.get('bucket', "")
        pdf_endpoint = os.environ.get('pdf_endpoint', "")
        s3_pdf_path = conf.conf["s3_pdf_path"]
        image_dir = "s3://" + pdf_bucket + "/mineru/test/output"
        print(image_dir)
        s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
        s3image_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint,
                                     parent_path=image_dir)
        pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
        jso_useful_key = {"_pdf_type": "", "model_list": []}
        pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
        pipe.pipe_classify()
        pipe.pipe_analyze()
        pipe.pipe_parse()
        md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
        assert len(md_content) > 0
quyuan's avatar
quyuan committed
289

yyy's avatar
yyy committed
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
    @pytest.mark.P1
    def test_local_magic_pdf_open_st_table(self):
        """Run ``magic-pdf`` with the ``magic_pdf_st.json`` configuration.

        ``~/magic_pdf_st.json`` is copied over ``~/magic-pdf.json``, the CLI
        is run on ``test_rearch_report.pdf`` and the test asserts that
        ``common.check_latex_table_exists`` returns ``True`` for the
        generated markdown.
        """
        time.sleep(2)
        # NOTE: this overwrites ~/magic-pdf.json and does not restore it.
        src = os.path.expanduser("~/magic_pdf_st.json")
        dst = os.path.expanduser("~/magic-pdf.json")
        print(f"cp {src} {dst}")
        # copyfile raises if the source is missing, where a failed `cp`
        # through os.system() was silently ignored.
        shutil.copyfile(src, dst)
        pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
        common.delete_file(pdf_res_path)
        # An argument list (no shell) keeps paths with spaces intact.
        completed = subprocess.run(
            ["magic-pdf", "-p", pdf_path, "-o", pdf_res_path])
        assert completed.returncode == 0, (
            f"magic-pdf exited with status {completed.returncode}")
        res = common.check_latex_table_exists(os.path.join(
            pdf_res_path, "test_rearch_report", "auto",
            "test_rearch_report.md"))
        assert res is True
  
    @pytest.mark.P1
    def test_local_magic_pdf_open_html_table(self):
        """Run ``magic-pdf`` with the ``magic_pdf_html.json`` configuration.

        ``~/magic_pdf_html.json`` is copied over ``~/magic-pdf.json``, the CLI
        is run on ``test_rearch_report.pdf`` and the test asserts that
        ``common.check_html_table_exists`` returns ``True`` for the generated
        markdown.
        """
        time.sleep(2)
        # NOTE: this overwrites ~/magic-pdf.json and does not restore it.
        # copyfile raises if the source is missing, where a failed `cp`
        # through os.system() was silently ignored.
        shutil.copyfile(os.path.expanduser("~/magic_pdf_html.json"),
                        os.path.expanduser("~/magic-pdf.json"))
        pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
        common.delete_file(pdf_res_path)
        # An argument list (no shell) keeps paths with spaces intact.
        completed = subprocess.run(
            ["magic-pdf", "-p", pdf_path, "-o", pdf_res_path])
        assert completed.returncode == 0, (
            f"magic-pdf exited with status {completed.returncode}")
        res = common.check_html_table_exists(os.path.join(
            pdf_res_path, "test_rearch_report", "auto",
            "test_rearch_report.md"))
        assert res is True
    
    @pytest.mark.P1
    def test_magic_pdf_close_html_table_cpu(self):
        """Run ``magic-pdf`` with the ``magic_pdf_html_table_cpu.json`` configuration.

        ``~/magic_pdf_html_table_cpu.json`` is copied over
        ``~/magic-pdf.json``, the CLI is run on ``test_rearch_report.pdf`` and
        the test asserts that ``common.check_html_table_exists`` returns
        ``True`` for the generated markdown.
        """
        time.sleep(2)
        # NOTE: this overwrites ~/magic-pdf.json and does not restore it.
        # copyfile raises if the source is missing, where a failed `cp`
        # through os.system() was silently ignored.
        shutil.copyfile(os.path.expanduser("~/magic_pdf_html_table_cpu.json"),
                        os.path.expanduser("~/magic-pdf.json"))
        pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
        common.delete_file(pdf_res_path)
        # An argument list (no shell) keeps paths with spaces intact.
        completed = subprocess.run(
            ["magic-pdf", "-p", pdf_path, "-o", pdf_res_path])
        assert completed.returncode == 0, (
            f"magic-pdf exited with status {completed.returncode}")
        res = common.check_html_table_exists(os.path.join(
            pdf_res_path, "test_rearch_report", "auto",
            "test_rearch_report.md"))
        assert res is True

    @pytest.mark.P1
    def test_local_magic_pdf_close_html_table(self):
        """Run ``magic-pdf`` with the ``magic_pdf_close_table.json`` configuration.

        ``~/magic_pdf_close_table.json`` is copied over ``~/magic-pdf.json``,
        the CLI is run on ``test_rearch_report.pdf`` and the test asserts that
        ``common.check_close_tables`` returns ``True`` for the generated
        markdown.
        """
        time.sleep(2)
        # NOTE: this overwrites ~/magic-pdf.json and does not restore it.
        # copyfile raises if the source is missing, where a failed `cp`
        # through os.system() was silently ignored.
        shutil.copyfile(os.path.expanduser("~/magic_pdf_close_table.json"),
                        os.path.expanduser("~/magic-pdf.json"))
        pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
        common.delete_file(pdf_res_path)
        # An argument list (no shell) keeps paths with spaces intact.
        completed = subprocess.run(
            ["magic-pdf", "-p", pdf_path, "-o", pdf_res_path])
        assert completed.returncode == 0, (
            f"magic-pdf exited with status {completed.returncode}")
        res = common.check_close_tables(os.path.join(
            pdf_res_path, "test_rearch_report", "auto",
            "test_rearch_report.md"))
        assert res is True
    
quyuan's avatar
quyuan committed
343

yyy's avatar
yyy committed
344
 
yyy's avatar
yyy committed
345
346
# Allow running this file directly as a script; pytest.main() is called with
# no explicit arguments.
if __name__ == '__main__':
    pytest.main()