"official/vision/modeling/layers/deeplab.py" did not exist on "44f942b4b0f6bc0e72b7b93448ab5b2b5066aab5"
markdown_calculate.py 6.85 KB
Newer Older
quyuan's avatar
quyuan committed
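"""Score tool-extracted markdown against annotated ground truth.

For each selected document type, every annotated .md file is compared with the
file of the same name produced by the evaluated tool, using a normalized
Levenshtein distance, an NLTK sentence-level BLEU score, and the similarity
score returned by the local `scoring` module (scoring.score_text). Per-type and
overall averages are printed and written to the --results file.
"""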
import os
import re
import json
import argparse

from Levenshtein import distance
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize

import scoring

parser = argparse.ArgumentParser(description="Score tool-extracted markdown against annotated ground truth")
parser.add_argument(
    '--document_types',
    nargs='+',
    choices=["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"],
    help='Choose one or more document types to evaluate',
    default=["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
)

parser.add_argument(
    "--tool_name",
    type=str,
    required=True,
    help="name of the extraction tool to evaluate",
)
parser.add_argument(
    "--download_dir",
    type=str,
    required=True,
    help="root directory of the downloaded benchmark data",
)
parser.add_argument(
    "--results",
    type=str,
    required=True,
    help="path of the results file (ends with .json)",
)
args = parser.parse_args()
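
# Example invocation (a sketch; the tool name, directories and document types
# below are illustrative placeholders, not values required by the script):
#   python markdown_calculate.py \
#       --tool_name marker \
#       --download_dir ./download \
#       --results ./results.json \
#       --document_types academic_literature notes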
fw = open(args.results, 'w', encoding='utf-8')
# Initialize lists to store per-file edit distances and BLEU scores
class Scoring:
    def __init__(self):
        self.edit_distances = []
        self.bleu_scores = []
        self.sim_scores = []
        self.filenames = []
        self.score_dict = {}
        self.annotation_cnt = 0

    def simple_bleu_score(self, candidate, reference):  
        candidate_tokens = word_tokenize(candidate)  
        reference_tokens = word_tokenize(reference) 
        return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method1) 


    def preprocess_string(self, s):
        # Collapse runs of newlines and runs of spaces before comparison
        sub_enter = re.sub(r'\n+', '\n', s)
        return re.sub(r' {2,}', ' ', sub_enter)
    
    def calculate_similarity(self, annotation, actual, file_type):
        class_dict = {}
        edit_distances = []
        bleu_scores = []
        sim_scores = []
        total_file = 0
        for filename in os.listdir(annotation):
            if filename.endswith('.md') and not filename.startswith('.'):  # ignore hidden files
                total_file = total_file + 1
                # Read the annotated ground-truth file
                with open(os.path.join(annotation, filename), 'r', encoding='utf-8') as file_a:
                    content_a = file_a.read()
                self.annotation_cnt = self.annotation_cnt + 1
                filepath_b = os.path.join(actual, filename)
                if os.path.exists(filepath_b):
                    with open(filepath_b, 'r', encoding='utf-8') as file_b:
                        content_b = file_b.read()
                        self.filenames.append(filename)
                        # Normalized Levenshtein edit distance
                        norm = max(len(content_a), len(content_b))
                        edit_dist = distance(self.preprocess_string(content_b), self.preprocess_string(content_a)) / norm if norm else 0.0
                        self.edit_distances.append(edit_dist)
                        edit_distances.append(edit_dist)
                        # BLEU score
                        bleu_score = self.simple_bleu_score(content_b, content_a)
                        bleu_scores.append(bleu_score)
                        self.bleu_scores.append(bleu_score)
                        # marker similarity score
                        score = scoring.score_text(content_b, content_a)
                        sim_scores.append(score)
                        self.sim_scores.append(score)
                        class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
                        self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
                else:
                    print(f"File {filename} not found in actual directory.")
        # Per-type averages
        class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0
        class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
        class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
        fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
        ratio = len(class_dict) / total_file if total_file else 0
        fw.write(f"{file_type} extract ratio: {ratio}" + "\n")
        fw.write(f"{file_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
        fw.write(f"{file_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
        fw.write(f"{file_type} Average Sim Score: {class_average_sim_score}" + "\n")

        print(f"{file_type} extract ratio: {ratio}")
        print(f"{file_type} Average Levenshtein Distance: {class_average_edit_distance}")
        print(f"{file_type} Average BLEU Score: {class_average_bleu_score}")
        print(f"{file_type} Average Sim Score: {class_average_sim_score}")
        return self.score_dict
    def summary_scores(self):
        # Overall averages across all document types
        average_edit_distance = sum(self.edit_distances) / len(self.edit_distances) if self.edit_distances else 0
        average_bleu_score = sum(self.bleu_scores) / len(self.bleu_scores) if self.bleu_scores else 0
        average_sim_score = sum(self.sim_scores) / len(self.sim_scores) if self.sim_scores else 0
        overall_ratio = len(self.score_dict) / self.annotation_cnt if self.annotation_cnt else 0
        fw.write(f"Overall extract ratio: {overall_ratio}" + "\n")
        fw.write(f"Overall Average Levenshtein Distance: {average_edit_distance}" + "\n")
        fw.write(f"Overall Average BLEU Score: {average_bleu_score}" + "\n")
        fw.write(f"Overall Average Marker Score: {average_sim_score}" + "\n")
        print(f"Overall extract ratio: {overall_ratio}")
        print(f"Overall Average Levenshtein Distance: {average_edit_distance}")
        print(f"Overall Average BLEU Score: {average_bleu_score}")
        print(f"Overall Average Marker Score: {average_sim_score}")
        fw.close()

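    # Expected data layout under --download_dir, reconstructed from the path
    # joins in calculate_similarity_total below:
    #   <download_dir>/<document_type>/annotations/cleaned/*.md   annotated ground truth
    #   <download_dir>/<document_type>/<tool_name>/cleaned/*.md   output of the evaluated tool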
    def calculate_similarity_total(self, tool_type, file_types, download_dir):
        for file_type in file_types:
            annotation = os.path.join(download_dir, file_type, "annotations", "cleaned")
            actual = os.path.join(download_dir, file_type, tool_type, "cleaned")
            self.calculate_similarity(annotation, actual, file_type)

if __name__ == "__main__":
    tool_type = args.tool_name
    download_dir = args.download_dir
    if args.document_types:
        print("Selected types:", args.document_types)
        file_types = list(args.document_types)
    else:
        print("No types selected")
        file_types = []
    print(f"Type {file_types} is selected. Executing related operations...")
    score = Scoring()
    score.calculate_similarity_total(tool_type, file_types, download_dir)
    score.summary_scores()