benchmark.py 3.29 KB
Newer Older
quyuan's avatar
quyuan committed
1
2
3
import zipfile
import os
import shutil
quyuan's avatar
quyuan committed
4
5
import json
import markdown_calculate
quyuan's avatar
quyuan committed
6
# Root of the repository checkout; the shell commands in this file `cd` into it.
# os.environ.get() yields None when GITHUB_WORKSPACE is not set.
code_path = os.environ.get('GITHUB_WORKSPACE')
#code_path = "/home/quyuan/actions-runner/_work/Magic-PDF/Magic-PDF.bk"

# Directory where the evaluation dataset is stored.
pdf_dev_path = "/home/quyuan/data"

# Directory holding the results of the magicpdf run.
pdf_res_path = "/home/quyuan/code/Magic-PDF/Magic-PDF/Magic-PDF/ci/magic-pdf"

# Document categories. Result files are matched to a category by file-name
# prefix, and the same list is handed to the scorer.
file_types = [
    "academic_literature",
    "atlas",
    "courseware",
    "colorful_textbook",
    "historical_documents",
    "notes",
    "ordinary_books",
    "ordinary_exam_paper",
    "ordinary_textbook",
    "research_report",
    "special_exam_paper",
]
#file_types = ["academic_literature"]

quyuan's avatar
quyuan committed
15
def test_cli():
    """Run the magicpdf CLI over the benchmark PDFs and collect its Markdown.

    Steps:
      1. Recreate ``pdf_res_path`` as an empty directory.
      2. From ``code_path``, invoke ``magic_pdf/cli/magicpdf.py pdf-command``
         once for every ``*.pdf`` found under ``<pdf_dev_path>/output``.
      3. Copy every ``.md`` file found under ``pdf_res_path`` whose name
         starts with one of ``file_types`` into
         ``<pdf_dev_path>/ci/<file_type>/magicpdf/``.

    A non-zero exit status from the CLI run is reported on stdout but does
    not abort the function.
    """
    magicpdf_path = os.path.join(pdf_dev_path, "output")

    # Remove whatever is at pdf_res_path (directory, file or symlink) without
    # going through a shell, then create it fresh. Like the previous
    # `rm -rf`, a missing path is not an error.
    if os.path.isdir(pdf_res_path) and not os.path.islink(pdf_res_path):
        shutil.rmtree(pdf_res_path)
    elif os.path.lexists(pdf_res_path):
        os.remove(pdf_res_path)
    os.makedirs(pdf_res_path)

    cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py  pdf-command  --pdf {}' % (code_path, magicpdf_path)
    status = os.system(cmd)
    if status != 0:
        # Best effort: keep going, but make the failure visible in the CI log.
        print(f"warning: magicpdf run exited with status {status}")

    for root, _dirs, files in os.walk(pdf_res_path):
        for magic_file in files:
            if not magic_file.endswith(".md"):
                continue
            source_file = os.path.join(root, magic_file)
            # No break: the file is copied for every category prefix it matches.
            for file_type in file_types:
                if not magic_file.startswith(file_type):
                    continue
                target_dir = os.path.join(pdf_dev_path, "ci", file_type, "magicpdf")
                os.makedirs(target_dir, exist_ok=True)
                shutil.copy(source_file, os.path.join(target_dir, magic_file))
quyuan's avatar
quyuan committed
32
33

def calculate_score():
    """Clean the annotation and magicpdf data, then score the magicpdf output.

    Runs ``tools/clean_photo.py`` from ``code_path`` once per tool name
    (``annotations``, then ``magicpdf``) with ``<pdf_dev_path>/ci`` as the
    download directory. It then builds a ``markdown_calculate.Scoring`` on
    ``<pdf_dev_path>/ci/result.json`` and computes the similarity totals for
    ``magicpdf`` across ``file_types``.

    A non-zero exit status from a cleaning command is reported on stdout but
    does not abort scoring.

    Returns:
        The value returned by ``Scoring.summary_scores()``.
    """
    data_path = os.path.join(pdf_dev_path, "ci")

    for tool_name in ("annotations", "magicpdf"):
        cmd = (
            f"cd {code_path} && export PYTHONPATH=. && "
            f"python tools/clean_photo.py --tool_name {tool_name} "
            f"--download_dir {data_path}"
        )
        status = os.system(cmd)
        if status != 0:
            print(f"warning: clean_photo.py for {tool_name} exited with status {status}")

    score = markdown_calculate.Scoring(os.path.join(data_path, "result.json"))
    score.calculate_similarity_total("magicpdf", file_types, data_path)
    return score.summary_scores()
quyuan's avatar
quyuan committed
43
44
45
46
47
48
49
50
51
52
53


def extrat_zip(zip_file_path, extract_to_path):
    """Extract a zip archive into a directory, reporting the outcome on stdout.

    Args:
        zip_file_path: Path of the archive to unpack.
        extract_to_path: Directory that receives the archive contents.

    If ``zip_file_path`` is not a zip file, a message is printed and nothing
    is extracted.
    """
    if not zipfile.is_zipfile(zip_file_path):
        print(f'{zip_file_path} is not a zip file')
        return

    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        archive.extractall(extract_to_path)
    print(f'Files extracted to {extract_to_path}')


quyuan's avatar
quyuan committed
54
def ci_ben():
    """Fail when the current benchmark scores drop below the recorded ones.

    The baseline is the last non-blank line of
    ``<pdf_dev_path>/ci/result.json``, parsed as JSON. The function then
    extracts ``<pdf_dev_path>/output.zip`` into ``pdf_dev_path``, runs
    ``test_cli()`` and ``calculate_score()``, and asserts that each of
    ``average_sim_score``, ``average_edit_distance`` and
    ``average_bleu_score`` is at least its baseline value.

    Raises:
        ValueError: If ``result.json`` contains no non-blank line, or its
            last non-blank line is not valid JSON.
        AssertionError: If any current metric is lower than the baseline.
    """
    result_path = os.path.join(pdf_dev_path, "ci", "result.json")
    # Read the baseline before scoring runs, and close the file right away.
    with open(result_path, "r") as fr:
        records = [line.strip() for line in fr if line.strip()]
    if not records:
        raise ValueError(f"{result_path} contains no previous score record")

    last_score = json.loads(records[-1])
    print("last_score:", last_score)
    last_simscore = last_score["average_sim_score"]
    last_editdistance = last_score["average_edit_distance"]
    last_bleu = last_score["average_bleu_score"]

    extrat_zip(os.path.join(pdf_dev_path, 'output.zip'), os.path.join(pdf_dev_path))
    test_cli()

    now_score = calculate_score()
    print("now_score:", now_score)
    now_simscore = now_score["average_sim_score"]
    now_editdistance = now_score["average_edit_distance"]
    now_bleu = now_score["average_bleu_score"]

    assert last_simscore <= now_simscore, (
        f"average_sim_score regressed: {last_simscore} -> {now_simscore}"
    )
    # NOTE(review): edit distance is required not to decrease, same as the
    # other metrics. If a lower edit distance is better in markdown_calculate,
    # this comparison is inverted -- confirm before changing.
    assert last_editdistance <= now_editdistance, (
        f"average_edit_distance regressed: {last_editdistance} -> {now_editdistance}"
    )
    assert last_bleu <= now_bleu, (
        f"average_bleu_score regressed: {last_bleu} -> {now_bleu}"
    )


# Script entry point: run the benchmark regression check.
if __name__ == "__main__":
    ci_ben()