# Score Perception and Cognition results: each <task>.txt under --results_dir holds
# tab-separated lines of "image \t question \t gt_answer \t pred_answer", with two
# questions per image. The per-task score is (acc + acc_plus) * 100.
import argparse
import os

from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score,
                             recall_score)

parser = argparse.ArgumentParser()
parser.add_argument('--results_dir', default='./LaVIN', type=str)

eval_type_dict = {
    'Perception': ['existence', 'count', 'position', 'color', 'posters',
                   'celebrity', 'scene', 'landmark', 'artwork', 'OCR'],
    'Cognition': ['commonsense_reasoning', 'numerical_calculation',
                  'text_translation', 'code_reasoning']
}


class calculate_metrics:

    def divide_chunks(self, l, n=2):
        # yield successive n-sized chunks of l
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def parse_pred_ans(self, pred_ans):
        # normalize a free-form prediction to 'yes', 'no', or 'other'
        if pred_ans in ['yes', 'no']:
            pred_label = pred_ans
        else:
            prefix_pred_ans = pred_ans[:4]
            if 'yes' in prefix_pred_ans:
                pred_label = 'yes'
            elif 'no' in prefix_pred_ans:
                pred_label = 'no'
            else:
                pred_label = 'other'
        return pred_label

    def compute_metric(self, gts, preds):
        assert len(gts) == len(preds)

        label_map = {
            'yes': 1,
            'no': 0,
            'other': -1,
        }
        gts = [label_map[x] for x in gts]
        preds = [label_map[x] for x in preds]

        acc = accuracy_score(gts, preds)

        # drop 'other' predictions before computing precision/recall
        clean_gts = []
        clean_preds = []
        other_num = 0
        for gt, pred in zip(gts, preds):
            if pred == -1:
                other_num += 1
                continue
            clean_gts.append(gt)
            clean_preds.append(pred)

        conf_mat = confusion_matrix(clean_gts, clean_preds, labels=[1, 0])
        precision = precision_score(clean_gts, clean_preds, average='binary')
        recall = recall_score(clean_gts, clean_preds, average='binary')
        tp, fn = conf_mat[0]
        fp, tn = conf_mat[1]

        metric_dict = {
            'TP': tp,
            'FN': fn,
            'TN': tn,
            'FP': fp,
            'precision': precision,
            'recall': recall,
            'other_num': other_num,
            'acc': acc,
        }

        return metric_dict

    def process_result(self, results_dir):
        for eval_type, task_name_list in eval_type_dict.items():
            print('===========', eval_type, '===========')

            scores = 0
            task_score_dict = dict()

            for task_name in task_name_list:
                task_txt = os.path.join(results_dir, task_name + '.txt')
                with open(task_txt, 'r') as f:
                    lines = f.readlines()
                # one image corresponds to two questions
                chunk_lines = list(self.divide_chunks(lines))

                img_num = len(chunk_lines)
                task_other_ans_num = 0
                task_score = 0
                acc_plus_correct_num = 0
                gts = []
                preds = []

                for img_items in chunk_lines:
                    assert len(img_items) == 2
                    img_correct_num = 0

                    for img_item in img_items:
                        try:
                            img_name, question, gt_ans, pred_ans = img_item.split('\t')
                        except ValueError:
                            # skip malformed lines
                            print(img_item)
                            continue

                        gt_ans = gt_ans.lower()
                        pred_ans = pred_ans.lower()

                        assert gt_ans in ['yes', 'no']  # gt can only be yes or no.

                        pred_ans = self.parse_pred_ans(pred_ans)
                        assert pred_ans in ['yes', 'no', 'other']

                        gts.append(gt_ans)
                        preds.append(pred_ans)

                        if gt_ans == pred_ans:
                            img_correct_num += 1

                        if pred_ans not in ['yes', 'no']:
                            task_other_ans_num += 1

                    # acc_plus counts images where both questions are answered correctly
                    if img_correct_num == 2:
                        acc_plus_correct_num += 1

                # compute TP, precision, acc, etc.
                metric_dict = self.compute_metric(gts, preds)
                acc_plus = acc_plus_correct_num / img_num
                metric_dict['acc_plus'] = acc_plus

                # task score = (acc + acc_plus) * 100
                for k, v in metric_dict.items():
                    if k in ['acc', 'acc_plus']:
                        task_score += v * 100

                task_score_dict[task_name] = task_score
                scores += task_score

            print('total score:', scores, '\n')
            for task_name, score in task_score_dict.items():
                print('\t', task_name, ' score:', score)
            print('\n')


if __name__ == '__main__':
    cal = calculate_metrics()
    args = parser.parse_args()
    results_dir = args.results_dir
    cal.process_result(results_dir)