# evaluate_video_mvbench.py

import argparse
import json

from .evaluate_mmmu import get_input_output_paths


def merge_input_files(input_path):
    """Merge input files to a format compatible with the evaluator."""
    input_file_paths, output_file_path = get_input_output_paths(input_path, task="MVBench")

    results = []
    collected = set()

    for input_file_path in input_file_paths:
        with open(input_file_path, "r") as input_file:
            for line in input_file:
                res = json.loads(line)
                res["question_id"] = "{}-{}".format(res['task_type'], res['sample_id'])
                # Skip duplicates in case the same sample appears in multiple partial files.
                if res['sample_id'] in collected:
                    continue
                collected.add(res['sample_id'])

                results.append(res)

    with open(output_file_path, "w") as output_file:
        json.dump(results, output_file, indent=4, sort_keys=True)

    return output_file_path
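
# For illustration only (hypothetical values): each input line is expected to be a
# JSON object carrying at least the fields used above and below, e.g.
#   {"task_type": "Action Sequence", "sample_id": 3,
#    "answer": "(A) Opened the door.", "gt_answer": ["(A) Opened the door."]}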


# The following code is adapted from
# https://github.com/OpenGVLab/Ask-Anything/blob/main/video_chat2/mvbench.ipynb
# which is licensed under the MIT license. More details on the license can be
# found at https://github.com/OpenGVLab/Ask-Anything/tree/main?tab=MIT-1-ov-file#readme
def check_ans(pred, gt):
    """Return True if the predicted option matches the ground-truth option."""
    flag = False

    # Split each answer into its option token (e.g. "(a)" or "a.") and the option text.
    pred_list = pred.lower().split(' ')
    pred_option, pred_content = pred_list[0], ' '.join(pred_list[1:])
    gt_list = gt.lower().split(' ')
    gt_option, gt_content = gt_list[0], ' '.join(gt_list[1:])
    if gt_content.endswith('.'):
        gt_content = gt_content[:-1]

    # The answers match if either option token is contained in the other.
    if pred_option.replace('.', '') in gt_option:
        flag = True
    elif gt_option in pred_option:
        flag = True

    return flag
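
# Illustrative (hypothetical) inputs for the option-matching rule above:
#   check_ans("A. opened the door.", "(A) Opened the door.")   # -> True
#   check_ans("(b) closed the door.", "(A) Opened the door.")  # -> False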

def create_result_dict(result_list):
    """Collect per-task [correct, total] counts from the merged results."""
    correct = 0
    total = 0
    acc_dict = {}

    for result_obj in result_list:
        task_type = result_obj['task_type']
        if task_type not in acc_dict:
            acc_dict[task_type] = [0, 0]  # correct, total
        acc_dict[task_type][1] += 1
        total += 1
        pred = result_obj['answer']
        gt = result_obj['gt_answer'][0]

        if check_ans(pred=pred, gt=gt):
            acc_dict[task_type][0] += 1
            correct += 1

    print(f"Total Acc: {correct / total * 100 :.2f}%")

    return acc_dict
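
# For illustration (hypothetical numbers): acc_dict maps each task type to
# [correct, total] counts, e.g.
#   {"Action Sequence": [31, 50], "Object Existence": [42, 50]}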

def combine_all_res(acc_dict):
    final_res = dict()
    correct = 0
    total = 0
    for k, v in acc_dict.items():
        final_res[k] = v[0] / v[1] * 100
        correct += v[0]
        total += v[1]
    final_res['total-acc'] = correct / total * 100

    print(final_res)

    return final_res
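
# For illustration (hypothetical numbers, consistent with the example above): the
# returned dict holds per-task accuracy percentages plus the overall score, e.g.
#   {"Action Sequence": 62.0, "Object Existence": 84.0, "total-acc": 73.0}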


def mvbench_eval(input_path):
    result_file_path = merge_input_files(input_path)

    with open(result_file_path, "r") as result_file:
        merged_results = json.load(result_file)
    acc_dict = create_result_dict(merged_results)

    return combine_all_res(acc_dict)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-path', type=str, required=True, help="Path to input file(s)")
    args = parser.parse_args()

    avg_acc_dict = mvbench_eval(args.input_path)

    print(f"MVBench accuracy: {avg_acc_dict}")
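
# Example invocation (package path is a placeholder; the relative import above means
# this module must be run from within its package):
#   python -m <package>.evaluate_video_mvbench --input-path /path/to/mvbench/results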