# open_bench.py

import torch
from torchvision import transforms
import json
from tqdm import tqdm
import os
import numpy as np
from torch.utils.data import Dataset


def get_prompt2(conv):
    """Flatten a conversation object into a single prompt string.

    The prompt starts with ``conv.system`` followed by ``conv.sep``; every
    ``(role, message)`` pair of ``conv.messages`` is then appended in order:

    * a non-final pair with a non-empty message becomes
      ``"<role>: <message><sep>"``;
    * a non-final pair with an empty/``None`` message becomes ``"<role>:"``;
    * the final pair becomes ``"<role>: <message>"`` with no trailing
      separator, so the model continues right after it.  If the final
      message is ``None`` (an empty slot left for the model to fill) it
      becomes ``"<role>:"`` instead of raising ``TypeError``.

    Args:
        conv: Object exposing ``system`` (str), ``sep`` (str) and
            ``messages`` (a sized sequence of ``(role, message)`` pairs).

    Returns:
        The assembled prompt string.
    """
    parts = [conv.system + conv.sep]
    last_index = len(conv.messages) - 1
    for index, (role, message) in enumerate(conv.messages):
        if index == last_index:
            if message is None:
                # Open slot for the model's reply: emit only the role tag.
                parts.append(role + ":")
            else:
                parts.append(role + ": " + message)
        elif message:
            parts.append(role + ": " + message + conv.sep)
        else:
            parts.append(role + ":")
    return "".join(parts)

class MLVU(Dataset):
    """Dataset over MLVU annotation files.

    Every annotation entry of every task becomes one item.  Indexing the
    dataset returns a dict with the video path, the question, the reference
    answer and the task type (see ``__getitem__``).
    """

    def __init__(self, data_dir, data_list, num_segments=8):
        """Load all annotation files listed in ``data_list``.

        Args:
            data_dir: Directory containing the annotation JSON files.
            data_list: Mapping ``task_type -> spec`` where ``spec[0]`` is the
                JSON file name (relative to ``data_dir``), ``spec[1]`` the
                prefix joined in front of each entry's ``'video'`` value and
                ``spec[2]`` the data type.  Further elements are ignored.
            num_segments: Number of frame indices ``get_index`` produces.
                The default of 8 is a convenience value, not something the
                annotations prescribe.

        Raises:
            ValueError: If ``num_segments`` is smaller than 1.
        """
        super().__init__()
        if num_segments < 1:
            raise ValueError("num_segments must be at least 1")
        # get_index() divides by this value, so it must exist and be positive.
        self.num_segments = num_segments
        self.data_list = []
        for task_type, spec in data_list.items():
            json_path = os.path.join(data_dir, spec[0])
            with open(json_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
            for data in json_data:
                self.data_list.append({
                    'task_type': task_type,
                    'prefix': spec[1],
                    'data_type': spec[2],
                    'data': data
                })

    def __str__(self):
        """Summarize item counts per task and, where available, chance accuracy.

        Tasks whose entries carry a ``'candidates'`` list are reported with
        their total option count and ``items / options`` as the random-guess
        accuracy.  Tasks without candidates (open-ended questions) are
        reported by count only and do not contribute to the total.
        """
        len_list = {}
        option_list = {}
        for item in self.data_list:
            task = item['task_type']
            len_list[task] = len_list.get(task, 0) + 1
            # Open-ended entries have no 'candidates' key; count zero options.
            candidates = item['data'].get('candidates') or ()
            option_list[task] = option_list.get(task, 0) + len(candidates)

        correct = 0
        total = 0
        lines = [f"There are {len(self.data_list)} videos as follow:"]
        for task, count in len_list.items():
            options = option_list[task]
            if options:
                correct += count
                total += options
                lines.append(
                    f"{count} for {task} ({options} options => "
                    f"{count / options * 100:.2f}%)"
                )
            else:
                lines.append(f"{count} for {task} (open-ended, no options)")
        # Only meaningful (and only safe to divide) when options exist.
        if total:
            lines.append(f"Total random accuracy: {correct / total * 100:.2f}%")
        return "\n".join(lines)

    def __len__(self):
        """Return the number of loaded annotation entries."""
        return len(self.data_list)

    def get_index(self, bound, fps, max_frame, first_idx=0):
        """Pick ``self.num_segments`` evenly spread frame indices.

        Args:
            bound: Optional ``(start, end)`` pair, multiplied by ``fps`` to
                obtain frame positions; a falsy value selects the whole
                clip.
            fps: Factor converting ``bound`` values into frame indices.
            max_frame: Upper limit for the end frame index.
            first_idx: Lower limit for the start frame index.

        Returns:
            ``numpy`` array with one index per segment, each taken at the
            middle of its segment.
        """
        if bound:
            start, end = bound[0], bound[1]
        else:
            # Sentinels far outside any clip; clamped just below.
            start, end = -100000, 100000
        start_idx = max(first_idx, round(start * fps))
        end_idx = min(round(end * fps), max_frame)
        seg_size = float(end_idx - start_idx) / self.num_segments
        frame_indices = np.array([
            int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
            for idx in range(self.num_segments)
        ])
        return frame_indices

    def qa_template(self, data):
        """Return ``(question, answer)`` for one annotation entry.

        The question is converted to ``str``; the answer is passed through
        unchanged.
        """
        question = f"{data['question']}"
        answer = data['answer']
        return question, answer

    def __getitem__(self, idx):
        """Return the video path, question, answer and task type of item ``idx``.

        An out-of-range ``idx`` raises ``IndexError`` from the underlying
        list, which is what ends plain ``for`` iteration over the dataset.
        """
        entry = self.data_list[idx]
        video_path = os.path.join(entry['prefix'], entry['data']['video'])
        question, answer = self.qa_template(entry['data'])

        return {
            'video': video_path,
            'question': question,
            'answer': answer,
            'task_type': entry['task_type']
        }


def _run_inference(video_path, question):
    """Return the model's free-form answer to ``question`` about the video.

    This is the integration point for the model under evaluation and must
    be implemented before the benchmark can run.  A typical implementation
    builds a prompt from a conversation template and calls the model:

        conv_mode = "llava_v1"
        conv = conv_templates[conv_mode].copy()
        # Different models concatenate the video token and text differently.
        inp = DEFAULT_VIDEO_TOKEN + '\n' + question
        conv.system = ("Carefully watch this video and pay attention to "
                       "every detail. Based on your observations, answer "
                       "the given questions.")
        conv.append_message(conv.roles[0], inp)
        conv.append_message(conv.roles[1], None)
        prompt = get_prompt(conv)  # your model's prompt builder
        pred = run(video_path, conv_mode, prompt, ...)

    Raises:
        NotImplementedError: Always, until replaced with real inference.
    """
    raise NotImplementedError(
        "Plug your model's inference code into _run_inference() so it "
        "returns the predicted answer for the given video and question."
    )


def main():
    """Run the open-ended MLVU tasks and dump predictions per task.

    Iterates over the ``subPlot`` and ``summary`` annotations, obtains a
    prediction for each question via ``_run_inference`` and writes the
    collected records to ``subplot_all.json`` and ``summary_all.json`` in
    the current working directory.  Each record holds the video file name
    (``video_name``), the question (``Q``), the reference answer (``A``)
    and the prediction (``pred``).
    """
    data_list = {
        "subPlot": ("8_sub_scene.json", "MLVU_all/video/subPlot", "video", False),
        "summary": ("9_summary.json", "MLVU_all/video/summary", "video", False),
    }
    data_dir = "MLVU_all/json"

    dataset = MLVU(data_dir, data_list)

    # Load your model here, once, before iterating over the dataset.

    # One result list per task type, keyed exactly like ``data_list``.
    results = {task_type: [] for task_type in data_list}
    for example in tqdm(dataset):
        video_path = example["video"]
        question = example["question"]

        pred = _run_inference(video_path, question)

        results[example["task_type"]].append({
            "video_name": os.path.basename(video_path),
            "Q": question,
            "A": example["answer"],
            "pred": pred,
        })

    with open("subplot_all.json", "w") as f:
        json.dump(results["subPlot"], f)

    with open("summary_all.json", "w") as f:
        json.dump(results["summary"], f)

      
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()