Commit 07dbc76b authored by dongchy920

MiniGemini_pytorch
"""Utils for data load, save, and process (e.g., prompt construction)"""
import os
import json
import yaml
import re
DOMAIN_CAT2SUB_CAT = {
'Art and Design': ['Art', 'Art_Theory', 'Design', 'Music'],
'Business': ['Accounting', 'Economics', 'Finance', 'Manage','Marketing'],
'Science': ['Biology', 'Chemistry', 'Geography', 'Math', 'Physics',],
'Health and Medicine': ['Basic_Medical_Science', 'Clinical_Medicine', 'Diagnostics_and_Laboratory_Medicine', 'Pharmacy', 'Public_Health'],
'Humanities and Social Science': ['History', 'Literature', 'Sociology', 'Psychology'],
'Tech and Engineering': ['Agriculture', 'Architecture_and_Engineering', 'Computer_Science', 'Electronics', 'Energy_and_Power', 'Materials', 'Mechanical_Engineering'],
}
CAT_SHORT2LONG = {
'acc': 'Accounting',
'agri': 'Agriculture',
'arch': 'Architecture_and_Engineering',
'art': 'Art',
'art_theory': 'Art_Theory',
'bas_med': 'Basic_Medical_Science',
'bio': 'Biology',
'chem': 'Chemistry',
'cli_med': 'Clinical_Medicine',
'cs': 'Computer_Science',
'design': 'Design',
'diag_med': 'Diagnostics_and_Laboratory_Medicine',
'econ': 'Economics',
'elec': 'Electronics',
'ep': 'Energy_and_Power',
'fin': 'Finance',
'geo': 'Geography',
'his': 'History',
'liter': 'Literature',
'manage': 'Manage',
'mark': 'Marketing',
'mate': 'Materials',
'math': 'Math',
'mech': 'Mechanical_Engineering',
'music': 'Music',
'phar': 'Pharmacy',
'phys': 'Physics',
'psy': 'Psychology',
'pub_health': 'Public_Health',
'socio': 'Sociology'
}
# DATA SAVING
def save_json(filename, ds):
with open(filename, 'w') as f:
json.dump(ds, f, indent=4)
def get_multi_choice_info(options):
"""
    Given the list of options for a multiple-choice question,
    return the index2ans mapping and the list of choice letters (all_choices).
"""
start_chr = 'A'
all_choices = []
index2ans = {}
for i, option in enumerate(options):
index2ans[chr(ord(start_chr) + i)] = option
all_choices.append(chr(ord(start_chr) + i))
return index2ans, all_choices
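# Usage sketch (illustrative addition, not in the original file):
# >>> get_multi_choice_info(['cat', 'dog', 'bird'])
# ({'A': 'cat', 'B': 'dog', 'C': 'bird'}, ['A', 'B', 'C'])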
def load_yaml(file_path):
with open(file_path, 'r') as stream:
try:
yaml_dict = yaml.safe_load(stream)
except yaml.YAMLError as exc:
print(exc)
return yaml_dict
def parse_img_path(text):
matches = re.findall("<img='(.*?)'>", text)
return matches
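# Example (illustrative addition): option strings embed image references as <img='...'>,
# which parse_img_path pulls out:
# >>> parse_img_path("<img='image_1.png'> a bar chart")
# ['image_1.png']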
def process_single_sample(data):
question = data['question']
o_imgs_paths = []
for option in data['options']:
current_o_imgs_paths = parse_img_path(option)
for img_path in current_o_imgs_paths:
o_imgs_paths.append(img_path)
if len(o_imgs_paths) > 1: # multiple images in options, used for random selection
return {'id': data['id'], 'question': question, 'options': data['options'], 'answer': data['answer'],
'image': None, 'question_type': data['question_type']}
else:
return {'id': data['id'], 'question': question, 'options': data['options'], 'answer': data['answer'],
'image': data['image_1'], 'question_type': data['question_type']}
# DATA SAVING
def save_json(filename, ds):
with open(filename, 'w') as f:
json.dump(ds, f, indent=4)
def save_jsonl(filename, data):
"""
Save a dictionary of data to a JSON Lines file with the filename as key and caption as value.
Args:
filename (str): The path to the file where the data should be saved.
data (dict): The dictionary containing the data to save where key is the image path and value is the caption.
"""
with open(filename, 'w', encoding='utf-8') as f:
for img_path, caption in data.items():
# Extract the base filename without the extension
base_filename = os.path.basename(img_path)
# Create a JSON object with the filename as the key and caption as the value
json_record = json.dumps({base_filename: caption}, ensure_ascii=False)
# Write the JSON object to the file, one per line
f.write(json_record + '\n')
def save_args(args, path_dir):
argsDict = args.__dict__
with open(path_dir + 'setting.txt', 'w') as f:
f.writelines('------------------ start ------------------' + '\n')
for eachArg, value in argsDict.items():
f.writelines(eachArg + ' : ' + str(value) + '\n')
f.writelines('------------------- end -------------------')
# DATA PROCESSING
def construct_prompt(sample, config):
question = sample['question']
options = eval(sample['options'])
example = ""
if sample['question_type'] == 'multiple-choice':
start_chr = 'A'
prediction_range = []
index2ans = {}
for option in options:
prediction_range.append(start_chr)
example += f"({start_chr}) {option}\n"
index2ans[start_chr] = option
start_chr = chr(ord(start_chr) + 1)
empty_prompt_sample_structure = config['multi_choice_example_format']
empty_prompt = empty_prompt_sample_structure.format(question, example)
res_dict = {}
res_dict['index2ans'] = index2ans
res_dict['correct_choice'] = sample['answer']
res_dict['all_choices'] = prediction_range
res_dict['empty_prompt'] = empty_prompt
if config['task_instructions']:
res_dict['final_input_prompt'] = config['task_instructions'].strip() + '\n\n' + empty_prompt
else:
res_dict['final_input_prompt'] = empty_prompt
res_dict['gt_content'] = options[ord(sample['answer'].upper()) - ord('A')]
else:
empty_prompt_sample_structure = config['short_ans_example_format']
empty_prompt = empty_prompt_sample_structure.format(question)
res_dict = {}
res_dict['empty_prompt'] = empty_prompt
if config['task_instructions']:
res_dict['final_input_prompt'] = config['task_instructions'].strip() + '\n\n' + empty_prompt
else:
res_dict['final_input_prompt'] = empty_prompt
res_dict['gt_content'] = sample['answer']
res_dict.update(sample)
return res_dict
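# Usage sketch (illustrative addition; the format strings below are hypothetical stand-ins
# for the real YAML config loaded with load_yaml):
# >>> config = {'task_instructions': '',
# ...           'multi_choice_example_format': '{}\n{}Answer with the option letter.',
# ...           'short_ans_example_format': '{}\nAnswer the question briefly.'}
# >>> sample = {'question': 'What is 2 + 2?', 'options': "['3', '4', '5']",
# ...           'answer': 'B', 'question_type': 'multiple-choice'}
# >>> out = construct_prompt(sample, config)
# >>> out['all_choices'], out['gt_content']
# (['A', 'B', 'C'], '4')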
"""Response Parsing and Evaluation for various models"""
from typing import Dict
import re
import random
random.seed(42)
import numpy as np
# ----------- Process Multi-choice -------------
def parse_multi_choice_response(response, all_choices, index2ans):
"""
Parse the prediction from the generated response.
Return the predicted index e.g., A, B, C, D.
"""
for char in [',', '.', '!', '?', ';', ':', "'"]:
response = response.strip(char)
response = " " + response + " " # add space to avoid partial match
index_ans = True
ans_with_brack = False
candidates = []
for choice in all_choices: # e.g., (A) (B) (C) (D)
if f'({choice})' in response:
candidates.append(choice)
ans_with_brack = True
if len(candidates) == 0:
for choice in all_choices: # e.g., A B C D
if f' {choice} ' in response:
candidates.append(choice)
    # if the above finds no candidates and the response has more than 5 tokens, try to match the option contents
if len(candidates) == 0 and len(response.split()) > 5:
for index, ans in index2ans.items():
if ans.lower() in response.lower():
candidates.append(index)
index_ans = False # it's content ans.
if len(candidates) == 0: # still not get answer, randomly choose one.
pred_index = random.choice(all_choices)
elif len(candidates) > 1:
start_indexes = []
if index_ans:
if ans_with_brack:
for can in candidates:
index = response.rfind(f'({can})')
start_indexes.append(index) # -1 will be ignored anyway
# start_indexes = [generated_response.index(f'({can})') for can in candidates]
else:
for can in candidates:
index = response.rfind(f" {can} ")
start_indexes.append(index)
else:
for can in candidates:
index = response.lower().rfind(index2ans[can].lower())
start_indexes.append(index)
# get the last one
pred_index = candidates[np.argmax(start_indexes)]
else: # if only one candidate, use it.
pred_index = candidates[0]
return pred_index
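# Example (illustrative addition): a bracketed option letter is picked up directly.
# >>> index2ans = {'A': 'cat', 'B': 'dog', 'C': 'bird', 'D': 'fish'}
# >>> parse_multi_choice_response('The answer is (B) dog.', ['A', 'B', 'C', 'D'], index2ans)
# 'B'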
# ----------- Process Open -------------
def check_is_number(string):
"""
    Check if the given string is a number.
"""
try:
float(string.replace(',', ''))
return True
except ValueError:
# check if there's comma inside
return False
def normalize_str(string):
"""
Normalize the str to lower case and make them float numbers if possible.
"""
    # check whether the string is a number; if so, convert it to a float
string = string.strip()
is_number = check_is_number(string)
if is_number:
string = string.replace(',', '')
string = float(string)
# leave 2 decimal
string = round(string, 2)
return [string]
else: # it's likely to be a string
# lower it
string = string.lower()
if len(string) == 1:
return [" " + string, string + " "] # avoid trivial matches
return [string]
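# Examples (illustrative addition):
# >>> normalize_str('12,345.678')
# [12345.68]
# >>> normalize_str('Egypt')
# ['egypt']
# >>> normalize_str('b')          # single characters are padded to avoid trivial matches
# [' b', 'b ']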
def extract_numbers(string):
"""
    Extract all forms of numbers from a string with regex.
"""
# Pattern for numbers with commas
pattern_commas = r'-?\b\d{1,3}(?:,\d{3})+\b'
# Pattern for scientific notation
pattern_scientific = r'-?\d+(?:\.\d+)?[eE][+-]?\d+'
# Pattern for simple numbers without commas
pattern_simple = r'-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])'
# Extract numbers with commas
numbers_with_commas = re.findall(pattern_commas, string)
# Extract numbers in scientific notation
numbers_scientific = re.findall(pattern_scientific, string)
# Extract simple numbers without commas
numbers_simple = re.findall(pattern_simple, string)
# Combine all extracted numbers
all_numbers = numbers_with_commas + numbers_scientific + numbers_simple
return all_numbers
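# Example (illustrative addition): plain integers and decimals are returned as strings.
# >>> extract_numbers('around 42 apples and 3.14 pies')
# ['42', '3.14']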
def parse_open_response(response):
"""
Parse the prediction from the generated response.
Return a list of predicted strings or numbers.
"""
# content = content.strip("\n").strip(".").strip(" ")
def get_key_subresponses(response):
key_responses = []
response = response.strip().strip(".").lower()
sub_responses = re.split(r'\.\s(?=[A-Z])|\n', response)
indicators_of_keys = ['could be ', 'so ', 'is ',
'thus ', 'therefore ', 'final ', 'answer ', 'result ']
key_responses = []
for index, resp in enumerate(sub_responses):
# if last one, accept it's an equation (the entire response can be just one sentence with equation)
if index == len(sub_responses) - 1:
indicators_of_keys.extend(['='])
shortest_key_response = None # the shortest response that may contain the answer (tail part of the response)
for indicator in indicators_of_keys:
if indicator in resp:
if not shortest_key_response:
shortest_key_response = resp.split(indicator)[-1].strip()
else:
if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
shortest_key_response = resp.split(indicator)[-1].strip()
# key_responses.append(resp.split(indicator)[1].strip())
if shortest_key_response:
# and it's not trivial
if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]:
key_responses.append(shortest_key_response)
        if len(key_responses) == 0: # did not find any
return [response]
return key_responses
# pdb.set_trace()
key_responses = get_key_subresponses(response)
pred_list = key_responses.copy() # keep the original string response
for resp in key_responses:
pred_list.extend(extract_numbers(resp))
tmp_pred_list = []
for i in range(len(pred_list)):
tmp_pred_list.extend(normalize_str(pred_list[i]))
pred_list = tmp_pred_list
# remove duplicates
pred_list = list(set(pred_list))
return pred_list
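# Example (illustrative addition): the key sub-response after an indicator such as
# 'is ' or 'therefore ' is kept, then normalized.
# >>> parse_open_response('Therefore the answer is 42.')
# [42.0]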
# ----------- Evaluation -------------
def eval_multi_choice(gold_i, pred_i):
"""
Evaluate a multiple choice instance.
"""
correct = False
    # only if they are exactly the same do we consider it correct
if isinstance(gold_i, list):
for answer in gold_i:
if answer == pred_i:
correct = True
break
else: # gold_i is a string
if gold_i == pred_i:
correct = True
return correct
def eval_open(gold_i, pred_i):
"""
Evaluate an open question instance
"""
correct = False
if isinstance(gold_i, list):
# use float to avoid trivial matches
norm_answers = []
for answer in gold_i:
norm_answers.extend(normalize_str(answer))
else:
norm_answers = normalize_str(gold_i)
for pred in pred_i: # pred is already normalized in parse response phase
if isinstance(pred, str): # if it's a string, then find if ans in the pred_i
for norm_ans in norm_answers:
# only see if the string answer in the string pred
if isinstance(norm_ans, str) and norm_ans in pred:
if not correct:
correct = True
break
else: # it's a float number
if pred in norm_answers:
if not correct:
correct = True
break
return correct
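# Examples (illustrative addition), chaining the parser and the checkers:
# >>> eval_open('42', parse_open_response('Therefore the answer is 42.'))
# True
# >>> eval_multi_choice('B', 'B')
# True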
# ----------- Batch Evaluation -------------
def evaluate(samples):
"""
Batch evaluation for multiple choice and open questions.
"""
pred_correct = 0
judge_dict = dict()
for sample in samples:
gold_i = sample['answer']
pred_i = sample['parsed_pred']
if sample['question_type'] == 'multiple-choice':
correct = eval_multi_choice(gold_i, pred_i)
else: # open question
correct = eval_open(gold_i, pred_i)
if correct:
judge_dict[sample['id']] = 'Correct'
pred_correct += 1
else:
judge_dict[sample['id']] = 'Wrong'
if len(samples) == 0:
        return judge_dict, {'acc': 0}  # keep the return shape consistent with the non-empty case
return judge_dict, {'acc': pred_correct / len(samples)}
# ----------- Calculate Accuracy -------------
def calculate_ins_level_acc(results: Dict):
"""Calculate the instruction level accuracy for given Subject results"""
acc = 0
ins_num = 0
for cat_results in results.values():
acc += cat_results['acc'] * cat_results['num_example']
ins_num += cat_results['num_example']
if ins_num == 0:
return 0
return acc / ins_num
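# Example (illustrative addition): accuracies are weighted by the number of examples.
# >>> calculate_ins_level_acc({'Math':    {'acc': 0.5, 'num_example': 10},
# ...                          'Physics': {'acc': 0.7, 'num_example': 30}})
# 0.65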
import random
import torch
def call_llava_engine_df(args, sample, model, tokenizer=None, processor=None):
from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from mgm.conversation import conv_templates, SeparatorStyle
def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
def insert_separator(X, sep):
return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
input_ids = []
offset = 0
if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
offset = 1
input_ids.append(prompt_chunks[0][0])
for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
input_ids.extend(x[offset:])
if return_tensors is not None:
if return_tensors == 'pt':
return torch.tensor(input_ids, dtype=torch.long)
raise ValueError(f'Unsupported tensor type: {return_tensors}')
return input_ids
def deal_with_prompt(input_text, mm_use_im_start_end, ocr_tokens):
if ocr_tokens is not None:
qs = input_text + '\n' + ocr_tokens
else:
qs = input_text
if mm_use_im_start_end:
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
return qs
prompt = sample['final_input_prompt']
ocr_tokens = sample.get('ocr', None)
prompt = deal_with_prompt(prompt, model.config.mm_use_im_start_end, ocr_tokens)
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], prompt)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
image = sample['image']
image_aux = sample['image_aux']
if image_aux is not None:
image_aux = image_aux.unsqueeze(0).half().cuda()
terminators = tokenizer.eos_token_id
if "llama_3" in args.conv_mode:
terminators = [terminators, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
if image is not None:
output_ids = model.generate(
input_ids,
images=image.unsqueeze(0).half().cuda(),
images_aux=image_aux,
do_sample=True,
temperature=1,
num_beams=5,
top_p=None,
max_new_tokens=128,
bos_token_id=tokenizer.bos_token_id, # Begin of sequence token
eos_token_id=terminators, # End of sequence token
pad_token_id=tokenizer.pad_token_id, # Pad token
use_cache=True)
response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip('\n')
else: # multiple images actually
if sample['question_type'] == 'multiple-choice':
all_choices = sample['all_choices']
response = random.choice(all_choices)
else:
response = 'INVALID GENERATION FOR MULTIPLE IMAGE INPUTS'
return response
import os
import re
import argparse
import pandas as pd
# !pip install python-Levenshtein
from Levenshtein import distance
import sys
sys.path.append('../')
from utilities import *
def get_most_similar(prediction, choices):
"""
Use the Levenshtein distance (or edit distance) to determine which of the choices is most similar to the given prediction
"""
distances = [distance(prediction, choice) for choice in choices]
ind = distances.index(min(distances))
return choices[ind]
# return min(choices, key=lambda choice: distance(prediction, choice))
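# Example (illustrative addition): the choice with the smallest edit distance wins.
# >>> get_most_similar('(B) 8/11', ['3/11', '8/11', '6/11', '3/5'])
# '8/11'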
def normalize_extracted_answer(extraction, choices, question_type, answer_type, precision):
"""
Normalize the extracted answer to match the answer type
"""
if question_type == 'multi_choice':
# make sure the extraction is a string
if isinstance(extraction, str):
extraction = extraction.strip()
else:
try:
extraction = str(extraction)
except:
extraction = ""
# extract "A" from "(A) text"
letter = re.findall(r'\(([a-zA-Z])\)', extraction)
if len(letter) > 0:
extraction = letter[0].upper()
options = [chr(ord('A') + i) for i in range(len(choices))]
if extraction in options:
# convert option letter to text, e.g. "A" -> "text"
ind = options.index(extraction)
extraction = choices[ind]
else:
# select the most similar option
extraction = get_most_similar(extraction, choices)
assert extraction in choices
elif answer_type == 'integer':
try:
extraction = str(int(float(extraction)))
except:
extraction = None
elif answer_type == 'float':
try:
extraction = str(round(float(extraction), precision))
except:
extraction = None
elif answer_type == 'list':
try:
extraction = str(extraction)
except:
extraction = None
return extraction
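# Examples (illustrative addition; 'free_form' stands in for any non-multiple-choice question type):
# >>> normalize_extracted_answer('(B) 8/11', ['3/11', '8/11', '6/11', '3/5'],
# ...                            'multi_choice', None, None)
# '8/11'
# >>> normalize_extracted_answer('3.14159', [], 'free_form', 'float', 2)
# '3.14'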
def safe_equal(prediction, answer):
"""
Check if the prediction is equal to the answer, even if they are of different types
"""
try:
if prediction == answer:
return True
return False
except Exception as e:
print(e)
return False
def get_acc_with_contion(res_pd, key, value):
if key == 'skills':
# if value in res_pd[key]:
total_pd = res_pd[res_pd[key].apply(lambda x: value in x)]
else:
total_pd = res_pd[res_pd[key] == value]
correct_pd = total_pd[total_pd['true_false'] == True]
acc = "{:.2f}".format(len(correct_pd) / len(total_pd) * 100)
return len(correct_pd), len(total_pd), acc
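# Example (illustrative addition) with a tiny hypothetical results frame:
# >>> import pandas as pd
# >>> df = pd.DataFrame({'grade': ['g1', 'g2', 'g1'],
# ...                    'skills': [['algebra'], ['geometry'], ['algebra']],
# ...                    'true_false': [True, False, True]})
# >>> get_acc_with_contion(df, 'grade', 'g1')
# (2, 2, '100.00')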
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--output_file', type=str, default='output.json')
parser.add_argument('--score_file', type=str, default='scores.json')
parser.add_argument('--gt_file', type=str, default='../data/testmini.json', help='ground truth file')
parser.add_argument('--number', type=int, default=-1, help='number of problems to run')
parser.add_argument('--rerun', action='store_true', help='rerun the evaluation')
    parser.add_argument('--caculate_gain', action='store_true', help='calculate the score gains over random guess')
parser.add_argument('--random_file', type=str, default='score_random_guess.json')
args = parser.parse_args()
# args
output_file = args.output_file
# # quick test
# output_file = '../results/llava-llama-2-13b/output_llava_llama_2_13b.json'
# read json
print(f"Reading {output_file}...")
results = read_json(output_file)
# read ground truth
print(f"Reading {args.gt_file}...")
gts = read_json(args.gt_file)
# full pids
full_pids = list(results.keys())
if args.number > 0:
full_pids = full_pids[:min(args.number, len(full_pids))]
print("Number of testing problems:", len(full_pids))
## [1] Evaluate if the prediction is true or false
print("\nEvaluating the predictions...")
update_json_flag = False
for pid in full_pids:
problem = results[pid]
# print(problem)
if args.rerun:
if 'prediction' in problem:
del problem['prediction']
if 'true_false' in problem:
del problem['true_false']
choices = problem['choices']
question_type = problem['question_type']
answer_type = problem['answer_type']
precision = problem['precision']
extraction = problem['extraction']
if 'answer' in problem:
answer = problem['answer']
else:
answer = gts[pid]['answer']
problem['answer'] = answer
# normalize the extracted answer to match the answer type
prediction = normalize_extracted_answer(extraction, choices, question_type, answer_type, precision)
# verify the prediction is true or false
true_false = safe_equal(prediction, answer)
# update the problem
if "true_false" not in problem:
update_json_flag = True
elif true_false != problem['true_false']:
update_json_flag = True
if "prediction" not in problem:
update_json_flag = True
elif prediction != problem['prediction']:
update_json_flag = True
problem['prediction'] = prediction
problem['true_false'] = true_false
# save the updated json
if update_json_flag:
print("\n!!!Some problems are updated.!!!")
print(f"\nSaving {output_file}...")
save_json(results, output_file)
## [2] Calculate the average accuracy
total = len(full_pids)
correct = 0
for pid in full_pids:
if results[pid]['true_false']:
correct += 1
accuracy = str(round(correct / total * 100, 2))
print(f"\nCorrect: {correct}, Total: {total}, Accuracy: {accuracy}%")
scores = {"average": {"accuracy": accuracy, "correct": correct, "total": total}}
## [3] Calculate the fine-grained accuracy scores
# merge the 'metadata' attribute into the data
for pid in results:
results[pid].update(results[pid].pop('metadata'))
# convert the data to a pandas DataFrame
df = pd.DataFrame(results).T
print(len(df))
print("Number of test problems:", len(df))
# assert len(df) == 1000 # Important!!!
# asign the target keys for evaluation
target_keys = ['question_type', 'answer_type', 'language', 'source', 'category', 'task', 'context', 'grade', 'skills']
for key in target_keys:
print(f"\nType: [{key}]")
# get the unique values of the key
if key == 'skills':
# the value is a list
values = []
for i in range(len(df)):
values += df[key][i]
values = list(set(values))
else:
values = df[key].unique()
#print(values)
# calculate the accuracy for each value
scores[key] = {}
for value in values:
correct, total, acc = get_acc_with_contion(df, key, value)
if total > 0:
print(f"[{value}]: {acc}% ({correct}/{total})")
scores[key][value] = {"accuracy": acc, "correct": correct, "total": total}
# sort the scores by accuracy
scores[key] = dict(sorted(scores[key].items(), key=lambda item: float(item[1]['accuracy']), reverse=True))
# save the scores
scores_file = args.score_file
print(f"\nSaving {scores_file}...")
save_json(scores, scores_file)
print("\nDone!")
# [4] Calculate the score gains over random guess
if args.caculate_gain:
random_file = args.random_file
random_scores = json.load(open(random_file))
print("\nCalculating the score gains...")
for key in scores:
if key == 'average':
gain = round(float(scores[key]['accuracy']) - float(random_scores[key]['accuracy']), 2)
scores[key]['acc_gain'] = gain
else:
for sub_key in scores[key]:
gain = round(float(scores[key][sub_key]['accuracy']) - float(random_scores[key][sub_key]['accuracy']), 2)
scores[key][sub_key]['acc_gain'] = str(gain)
# save the score gains
print(f"\nSaving {scores_file}...")
save_json(scores, scores_file)
print("\nDone!")
import os
import re
import time
import argparse
from tqdm import tqdm
import sys
sys.path.append('../')
from utilities import *
# OpenAI
import openai
# load demo prompt
from prompts.ext_ans import demo_prompt
def verify_extraction(extraction):
    if extraction is None:
        return False
    extraction = extraction.strip()
    if extraction == "":
        return False
    return True
def create_test_prompt(demo_prompt, query, response):
demo_prompt = demo_prompt.strip()
test_prompt = f"{query}\n\n{response}"
full_prompt = f"{demo_prompt}\n\n{test_prompt}\n\nExtracted answer: "
return full_prompt
def extract_answer(response, problem, quick_extract=False):
question_type = problem['question_type']
answer_type = problem['answer_type']
choices = problem['choices']
query = problem['query']
pid = problem['pid']
if response == "":
return ""
if question_type == 'multi_choice' and response in choices:
return response
if answer_type == "integer":
try:
extraction = int(response)
return str(extraction)
except:
pass
if answer_type == "float":
try:
extraction = str(float(response))
return extraction
except:
pass
# quick extraction
if quick_extract:
print("Quickly extracting answer...")
# The answer is "text". -> "text"
try:
result = re.search(r'The answer is "(.*)"\.', response)
if result:
extraction = result.group(1)
return extraction
except:
pass
# general extraction
try:
full_prompt = create_test_prompt(demo_prompt, query, response)
extraction = get_chat_response(full_prompt, openai.api_key, openai.api_base, model=args.llm_engine)
return extraction
except Exception as e:
print(e)
print(f"Error in extracting answer for {pid}")
return ""
if __name__ == '__main__':
parser = argparse.ArgumentParser()
# input
parser.add_argument('--output_file', type=str, default='answer.json')
parser.add_argument('--response_label', type=str, default='response', help='response label for the input file')
# model
parser.add_argument('--llm_engine', type=str, default='gpt-4-0613', help='llm engine',
choices = ['gpt-3.5-turbo', 'gpt-3.5', 'gpt-4', 'gpt-4-0314', 'gpt-4-0613'])
parser.add_argument('--number', type=int, default=-1, help='number of problems to run')
parser.add_argument('--quick_extract', action='store_true', help='use rules to extract answer for some problems')
parser.add_argument('--rerun', action='store_true', help='rerun the answer extraction')
# openai
parser.add_argument("--api_key", required=True, type=str, help="OpenAI API key")
parser.add_argument("--api_base", default=None, type=str, help="OpenAI API base")
# output
parser.add_argument('--save_every', type=int, default=10, help='save every n problems')
parser.add_argument('--output_label', type=str, default='', help='label for the output file')
args = parser.parse_args()
# args
label = args.response_label
result_file = args.output_file
if args.output_label != '':
output_file = result_file.replace('.json', f'_{args.output_label}.json')
else:
output_file = result_file
# read results
print(f"Reading {result_file}...")
try:
results = read_json(output_file)
except:
samples = [json.loads(line) for line in open(result_file)]
results = {}
for sample in samples:
results[sample['pid']] = sample
# full pids
full_pids = list(results.keys())
if args.number > 0:
full_pids = full_pids[:min(args.number, len(full_pids))]
print("Number of testing problems:", len(full_pids))
# test pids
if args.rerun:
test_pids = full_pids
else:
test_pids = []
for pid in full_pids:
# print(pid)
if 'extraction' not in results[pid] or not verify_extraction(results[pid]['extraction']):
test_pids.append(pid)
test_num = len(test_pids)
print("Number of problems to run:", test_num)
# print(test_pids)
# openai api
openai.api_key = args.api_key # Your API key here
if args.api_base:
openai.api_base = args.api_base # Your API base here
# tqdm, enumerate results
for i, pid in enumerate(tqdm(test_pids)):
problem = results[pid]
assert label in problem
response = problem[label]
extraction = extract_answer(response, problem, args.quick_extract)
results[pid]['extraction'] = extraction
if i % args.save_every == 0 or i == test_num - 1:
print(f"Saving results to {output_file}...")
save_json(results, output_file)
print(f"Results saved.")
# pids = 852, 104, 824, 506, 540
demo_prompt = """
Please read the following example. Then extract the answer from the model response and type it at the end of the prompt.
Hint: Please answer the question requiring an integer answer and provide the final value, e.g., 1, 2, 3, at the end.
Question: Which number is missing?
Model response: The number missing in the sequence is 14.
Extracted answer: 14
Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value, e.g., 1.2, 1.3, 1.4, at the end.
Question: What is the fraction of females facing the camera?
Model response: The fraction of females facing the camera is 0.6, which means that six out of ten females in the group are facing the camera.
Extracted answer: 0.6
Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value, e.g., 1.23, 1.34, 1.45, at the end.
Question: How much money does Luca need to buy a sour apple candy and a butterscotch candy? (Unit: $)
Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.
Extracted answer: 1.45
Hint: Please answer the question requiring a Python list as an answer and provide the final list, e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end.
Question: Between which two years does the line graph saw its maximum peak?
Model response: The line graph saw its maximum peak between 2007 and 2008.
Extracted answer: [2007, 2008]
Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.
Question: What fraction of the shape is blue?\nChoices:\n(A) 3/11\n(B) 8/11\n(C) 6/11\n(D) 3/5
Model response: The correct answer is (B) 8/11.
Extracted answer: B
"""
import os
import cv2
import json
import time
import pickle
import openai
import re
from word2number import w2n
def create_dir(output_dir):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
def read_csv(file):
data = []
with open(file, 'r') as f:
for line in f:
data.append(line.strip())
return data
def read_pandas_csv(csv_path):
# read a pandas csv sheet
import pandas as pd
df = pd.read_csv(csv_path)
return df
def read_json(path):
with open(path, 'r', encoding='utf-8') as f:
return json.load(f)
def read_jsonl(file):
with open(file, 'r') as f:
data = [json.loads(line) for line in f]
return data
def read_pickle(path):
with open(path, 'rb') as f:
return pickle.load(f)
def save_json(data, path):
with open(path, 'w') as f:
json.dump(data, f, indent=4)
def save_array_img(path, image):
cv2.imwrite(path, image)
def contains_digit(text):
# check if text contains a digit
if any(char.isdigit() for char in text):
return True
return False
def contains_number_word(text):
# check if text contains a number word
ignore_words = ["a", "an", "point"]
words = re.findall(r'\b\w+\b', text) # This regex pattern matches any word in the text
for word in words:
if word in ignore_words:
continue
try:
w2n.word_to_num(word)
return True # If the word can be converted to a number, return True
except ValueError:
continue # If the word can't be converted to a number, continue with the next word
# check if text contains a digit
if any(char.isdigit() for char in text):
return True
return False # If none of the words could be converted to a number, return False
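# Examples (illustrative addition; w2n raises ValueError for non-number words,
# so the second call falls through to the digit check):
# >>> contains_number_word('there are three apples')
# True
# >>> contains_number_word('a point on the line')
# False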
def contains_quantity_word(text, special_keep_words=[]):
# check if text contains a quantity word
quantity_words = ["most", "least", "fewest"
"more", "less", "fewer",
"largest", "smallest", "greatest",
"larger", "smaller", "greater",
"highest", "lowest", "higher", "lower",
"increase", "decrease",
"minimum", "maximum", "max", "min",
"mean", "average", "median",
"total", "sum", "add", "subtract",
"difference", "quotient", "gap",
"half", "double", "twice", "triple",
"square", "cube", "root",
"approximate", "approximation",
"triangle", "rectangle", "circle", "square", "cube", "sphere", "cylinder", "cone", "pyramid",
"multiply", "divide",
"percentage", "percent", "ratio", "proportion", "fraction", "rate",
]
quantity_words += special_keep_words # dataset specific words
words = re.findall(r'\b\w+\b', text) # This regex pattern matches any word in the text
if any(word in quantity_words for word in words):
return True
    return False  # none of the words is a quantity word
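# Example (illustrative addition):
# >>> contains_quantity_word('Which bar has the highest value?')
# True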
def is_bool_word(text):
if text in ["Yes", "No", "True", "False",
"yes", "no", "true", "false",
"YES", "NO", "TRUE", "FALSE"]:
return True
return False
def is_digit_string(text):
# remove ".0000"
text = text.strip()
text = re.sub(r'\.0+$', '', text)
try:
int(text)
return True
except ValueError:
return False
def is_float_string(text):
# text is a float string if it contains a "." and can be converted to a float
if "." in text:
try:
float(text)
return True
except ValueError:
return False
return False
def copy_image(image_path, output_image_path):
from shutil import copyfile
copyfile(image_path, output_image_path)
def copy_dir(src_dir, dst_dir):
from shutil import copytree
# copy the source directory to the target directory
copytree(src_dir, dst_dir)
import PIL.Image as Image
def get_image_size(img_path):
img = Image.open(img_path)
width, height = img.size
return width, height
def get_chat_response(promot, api_key, api_base, model="gpt-3.5-turbo", temperature=0, max_tokens=256, n=1, patience=10000000,
sleep_time=0):
messages = [
{"role": "user", "content": promot},
]
# print("I am here")
while patience > 0:
patience -= 1
try:
response = openai.ChatCompletion.create(model=model,
messages=messages,
api_key=api_key,
api_base=api_base,
temperature=temperature,
max_tokens=max_tokens,
n=n)
if n == 1:
prediction = response['choices'][0]['message']['content'].strip()
if prediction != "" and prediction != None:
return prediction
else:
prediction = [choice['message']['content'].strip() for choice in response['choices']]
if prediction[0] != "" and prediction[0] != None:
return prediction
except Exception as e:
if "Rate limit" not in str(e):
print(e)
if "Please reduce the length of the messages" in str(e):
print("!!Reduce promot size")
# reduce input prompt and keep the tail
new_size = int(len(promot) * 0.9)
new_start = len(promot) - new_size
promot = promot[new_start:]
messages = [
{"role": "user", "content": promot},
]
if sleep_time > 0:
time.sleep(sleep_time)
return ""
import argparse
import json
import os
import openai
import tqdm
import ray
import time
NUM_SECONDS_TO_SLEEP = 3
@ray.remote(num_cpus=4)
def get_eval(content: str, max_tokens: int):
while True:
try:
response = openai.ChatCompletion.create(
model='gpt-4',
messages=[{
'role': 'system',
'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
}, {
'role': 'user',
'content': content,
}],
temperature=0.2, # TODO: figure out which temperature is best for evaluation
max_tokens=max_tokens,
)
break
except openai.error.RateLimitError:
pass
except Exception as e:
print(e)
time.sleep(NUM_SECONDS_TO_SLEEP)
print('success!')
return response['choices'][0]['message']['content']
def parse_score(review):
try:
score_pair = review.split('\n')[0]
score_pair = score_pair.replace(',', ' ')
sp = score_pair.split(' ')
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
print('error', review)
return [-1, -1]
except Exception as e:
print(e)
print('error', review)
return [-1, -1]
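# Example (illustrative addition): the first line of the review is expected to hold the two scores.
# >>> parse_score('8 7\nAssistant 1 gave a clearer and more complete answer...')
# [8.0, 7.0]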
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-q', '--question')
# parser.add_argument('-a', '--answer')
parser.add_argument('-a', '--answer-list', nargs='+', default=[])
parser.add_argument('-r', '--rule')
parser.add_argument('-o', '--output')
parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
args = parser.parse_args()
ray.init()
f_q = open(os.path.expanduser(args.question))
f_ans1 = open(os.path.expanduser(args.answer_list[0]))
f_ans2 = open(os.path.expanduser(args.answer_list[1]))
rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
review_file = open(f'{args.output}', 'w')
js_list = []
handles = []
idx = 0
for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
# if idx == 1:
# break
ques = json.loads(ques_js)
ans1 = json.loads(ans1_js)
ans2 = json.loads(ans2_js)
category = json.loads(ques_js)['category']
if category in rule_dict:
rule = rule_dict[category]
else:
rule = rule_dict['default']
prompt = rule['prompt']
role = rule['role']
content = (f'[Question]\n{ques["text"]}\n\n'
f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
f'[System]\n{prompt}\n\n')
js_list.append({
'id': idx+1,
'question_id': ques['question_id'],
'answer1_id': ans1['answer_id'],
'answer2_id': ans2['answer_id'],
'category': category})
idx += 1
handles.append(get_eval.remote(content, args.max_tokens))
# To avoid the rate limit set by OpenAI
time.sleep(NUM_SECONDS_TO_SLEEP)
reviews = ray.get(handles)
for idx, review in enumerate(reviews):
scores = parse_score(review)
js_list[idx]['content'] = review
js_list[idx]['tuple'] = scores
review_file.write(json.dumps(js_list[idx]) + '\n')
review_file.close()
import argparse
import json
import os
import openai
import time
NUM_SECONDS_TO_SLEEP = 0.5
def get_eval(content: str, max_tokens: int):
while True:
try:
response = openai.ChatCompletion.create(
model='gpt-4-0314',
messages=[{
'role': 'system',
'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
}, {
'role': 'user',
'content': content,
}],
temperature=0.2, # TODO: figure out which temperature is best for evaluation
max_tokens=max_tokens,
)
break
except openai.error.RateLimitError:
pass
except Exception as e:
print(e)
time.sleep(NUM_SECONDS_TO_SLEEP)
return response['choices'][0]['message']['content']
def parse_score(review):
try:
score_pair = review.split('\n')[0]
score_pair = score_pair.replace(',', ' ')
sp = score_pair.split(' ')
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
print('error', review)
return [-1, -1]
except Exception as e:
print(e)
print('error', review)
return [-1, -1]
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-q', '--question')
parser.add_argument('-c', '--context')
parser.add_argument('-a', '--answer-list', nargs='+', default=[])
parser.add_argument('-r', '--rule')
parser.add_argument('-o', '--output')
parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
args = parser.parse_args()
f_q = open(os.path.expanduser(args.question))
f_ans1 = open(os.path.expanduser(args.answer_list[0]))
f_ans2 = open(os.path.expanduser(args.answer_list[1]))
rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
if os.path.isfile(os.path.expanduser(args.output)):
cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
else:
cur_reviews = []
review_file = open(f'{args.output}', 'a')
context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
image_to_context = {context['image']: context for context in context_list}
handles = []
idx = 0
for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
ques = json.loads(ques_js)
ans1 = json.loads(ans1_js)
ans2 = json.loads(ans2_js)
inst = image_to_context[ques['image']]
if isinstance(inst['caption'], list):
cap_str = '\n'.join(inst['caption'])
else:
cap_str = inst['caption']
category = 'llava_bench_' + json.loads(ques_js)['category']
if category in rule_dict:
rule = rule_dict[category]
else:
assert False, f"Visual QA category not found in rule file: {category}."
prompt = rule['prompt']
role = rule['role']
content = (f'[Context]\n{cap_str}\n\n'
f'[Question]\n{ques["text"]}\n\n'
f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
f'[System]\n{prompt}\n\n')
cur_js = {
'id': idx+1,
'question_id': ques['question_id'],
'answer1_id': ans1.get('answer_id', ans1['question_id']),
'answer2_id': ans2.get('answer_id', ans2['answer_id']),
'category': category
}
if idx >= len(cur_reviews):
review = get_eval(content, args.max_tokens)
scores = parse_score(review)
cur_js['content'] = review
cur_js['tuple'] = scores
review_file.write(json.dumps(cur_js) + '\n')
review_file.flush()
else:
print(f'Skipping {idx} as we already have it.')
idx += 1
print(idx)
review_file.close()
import argparse
import json
import os
import openai
import time
NUM_SECONDS_TO_SLEEP = 0.5
def get_eval(content: str, max_tokens: int):
while True:
try:
response = openai.ChatCompletion.create(
model='gpt-4-0314',
messages=[{
'role': 'system',
'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
}, {
'role': 'user',
'content': content,
}],
temperature=0.2, # TODO: figure out which temperature is best for evaluation
max_tokens=max_tokens,
)
break
except openai.error.RateLimitError:
pass
except Exception as e:
print(e)
time.sleep(NUM_SECONDS_TO_SLEEP)
return response['choices'][0]['message']['content']
def parse_score(review):
try:
score_pair = review.split('\n')[0]
score_pair = score_pair.replace(',', ' ')
sp = score_pair.split(' ')
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
print('error', review)
return [-1, -1]
except Exception as e:
print(e)
print('error', review)
return [-1, -1]
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-q', '--question')
parser.add_argument('-c', '--context')
parser.add_argument('-a', '--answer-list', nargs='+', default=[])
parser.add_argument('-r', '--rule')
parser.add_argument('-o', '--output')
parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
args = parser.parse_args()
f_q = open(os.path.expanduser(args.question))
f_ans1 = open(os.path.expanduser(args.answer_list[0]))
f_ans2 = open(os.path.expanduser(args.answer_list[1]))
rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
if os.path.isfile(os.path.expanduser(args.output)):
cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
else:
cur_reviews = []
review_file = open(f'{args.output}', 'a')
context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
image_to_context = {context['image']: context for context in context_list}
handles = []
idx = 0
for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
ques = json.loads(ques_js)
ans1 = json.loads(ans1_js)
ans2 = json.loads(ans2_js)
inst = image_to_context[ques['image']]
cap_str = '\n'.join(inst['captions'])
box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']])
category = json.loads(ques_js)['category']
if category in rule_dict:
rule = rule_dict[category]
else:
assert False, f"Visual QA category not found in rule file: {category}."
prompt = rule['prompt']
role = rule['role']
content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n'
f'[Question]\n{ques["text"]}\n\n'
f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
f'[System]\n{prompt}\n\n')
cur_js = {
'id': idx+1,
'question_id': ques['question_id'],
'answer1_id': ans1.get('answer_id', ans1['question_id']),
'answer2_id': ans2.get('answer_id', ans2['answer_id']),
'category': category
}
if idx >= len(cur_reviews):
review = get_eval(content, args.max_tokens)
scores = parse_score(review)
cur_js['content'] = review
cur_js['tuple'] = scores
review_file.write(json.dumps(cur_js) + '\n')
review_file.flush()
else:
print(f'Skipping {idx} as we already have it.')
idx += 1
print(idx)
review_file.close()
import os
import json
import argparse
def eval_pope(answers, label_file):
label_list = [json.loads(q)['label'] for q in open(label_file, 'r')]
for answer in answers:
text = answer['text']
# Only keep the first sentence
if text.find('.') != -1:
text = text.split('.')[0]
text = text.replace(',', '')
words = text.split(' ')
if 'No' in words or 'not' in words or 'no' in words:
answer['text'] = 'no'
else:
answer['text'] = 'yes'
for i in range(len(label_list)):
if label_list[i] == 'no':
label_list[i] = 0
else:
label_list[i] = 1
pred_list = []
for answer in answers:
if answer['text'] == 'no':
pred_list.append(0)
else:
pred_list.append(1)
pos = 1
neg = 0
yes_ratio = pred_list.count(1) / len(pred_list)
TP, TN, FP, FN = 0, 0, 0, 0
for pred, label in zip(pred_list, label_list):
if pred == pos and label == pos:
TP += 1
elif pred == pos and label == neg:
FP += 1
elif pred == neg and label == neg:
TN += 1
elif pred == neg and label == pos:
FN += 1
print('TP\tFP\tTN\tFN\t')
print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN))
precision = float(TP) / float(TP + FP)
recall = float(TP) / float(TP + FN)
f1 = 2*precision*recall / (precision + recall)
acc = (TP + TN) / (TP + TN + FP + FN)
print('Accuracy: {}'.format(acc))
print('Precision: {}'.format(precision))
print('Recall: {}'.format(recall))
print('F1 score: {}'.format(f1))
print('Yes ratio: {}'.format(yes_ratio))
print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) )
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--annotation-dir", type=str)
parser.add_argument("--question-file", type=str)
parser.add_argument("--result-file", type=str)
args = parser.parse_args()
questions = [json.loads(line) for line in open(args.question_file)]
questions = {question['question_id']: question for question in questions}
answers = [json.loads(q) for q in open(args.result_file)]
for file in os.listdir(args.annotation_dir):
assert file.startswith('coco_pope_')
assert file.endswith('.json')
category = file[10:-5]
cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category]
print('Category: {}, # samples: {}'.format(category, len(cur_answers)))
eval_pope(cur_answers, os.path.join(args.annotation_dir, file))
print("====================================")
import argparse
import json
import os
import re
import random
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--base-dir', type=str)
parser.add_argument('--result-file', type=str)
parser.add_argument('--output-file', type=str)
parser.add_argument('--output-result', type=str)
parser.add_argument('--split', type=str, default='test')
parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
return parser.parse_args()
def convert_caps(results):
fakecaps = []
for result in results:
image_id = result['question_id']
caption = result['text']
fakecaps.append({"image_id": int(image_id), "caption": caption})
return fakecaps
def get_pred_idx(prediction, choices, options):
"""
Get the index (e.g. 2) from the prediction (e.g. 'C')
"""
if prediction in options[:len(choices)]:
return options.index(prediction)
else:
return -1
if __name__ == "__main__":
args = get_args()
base_dir = args.base_dir
split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
problems = json.load(open(os.path.join(base_dir, "problems.json")))
predictions = [json.loads(line) for line in open(args.result_file)]
predictions = {pred['question_id']: pred for pred in predictions}
split_problems = {idx: problems[idx] for idx in split_indices}
results = {'correct': [], 'incorrect': []}
sqa_results = {}
sqa_results['acc'] = None
sqa_results['correct'] = None
sqa_results['count'] = None
sqa_results['results'] = {}
sqa_results['outputs'] = {}
for prob_id, prob in split_problems.items():
if prob_id not in predictions:
pred = {'text': 'FAILED', 'prompt': 'Unknown'}
pred_text = 'FAILED'
else:
pred = predictions[prob_id]
pred_text = pred['text']
if pred_text in args.options:
answer = pred_text
elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ":
answer = pred_text[0]
else:
pattern = re.compile(r'The answer is ([A-Z]).')
res = pattern.findall(pred_text)
if len(res) == 1:
answer = res[0] # 'A', 'B', ...
else:
answer = "FAILED"
pred_idx = get_pred_idx(answer, prob['choices'], args.options)
analysis = {
'question_id': prob_id,
'parsed_ans': answer,
'ground_truth': args.options[prob['answer']],
'question': pred['prompt'],
'pred': pred_text,
'is_multimodal': '<image>' in pred['prompt'],
}
sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options)
sqa_results['outputs'][prob_id] = pred_text
if pred_idx == prob['answer']:
results['correct'].append(analysis)
else:
results['incorrect'].append(analysis)
correct = len(results['correct'])
total = len(results['correct']) + len(results['incorrect'])
###### IMG ######
multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']])
multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']])
multimodal_total = multimodal_correct + multimodal_incorrect
###### IMG ######
print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%')
sqa_results['acc'] = correct / total * 100
sqa_results['correct'] = correct
sqa_results['count'] = total
with open(args.output_file, 'w') as f:
json.dump(results, f, indent=2)
with open(args.output_result, 'w') as f:
json.dump(sqa_results, f, indent=2)
import argparse
import json
import os
import re
import random
from collections import defaultdict
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--base-dir', type=str)
parser.add_argument('--gpt4-result', type=str)
parser.add_argument('--our-result', type=str)
parser.add_argument('--split', type=str, default='test')
parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
return parser.parse_args()
def convert_caps(results):
fakecaps = []
for result in results:
image_id = result['question_id']
caption = result['text']
fakecaps.append({"image_id": int(image_id), "caption": caption})
return fakecaps
def get_pred_idx(prediction, choices, options):
"""
Get the index (e.g. 2) from the prediction (e.g. 'C')
"""
if prediction in options[:len(choices)]:
return options.index(prediction)
else:
return random.choice(range(len(choices)))
if __name__ == "__main__":
args = get_args()
base_dir = args.base_dir
split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
problems = json.load(open(os.path.join(base_dir, "problems.json")))
our_predictions = [json.loads(line) for line in open(args.our_result)]
our_predictions = {pred['question_id']: pred for pred in our_predictions}
split_problems = {idx: problems[idx] for idx in split_indices}
gpt4_predictions = json.load(open(args.gpt4_result))['outputs']
results = defaultdict(lambda: 0)
for prob_id, prob in split_problems.items():
if prob_id not in our_predictions:
continue
if prob_id not in gpt4_predictions:
continue
our_pred = our_predictions[prob_id]['text']
gpt4_pred = gpt4_predictions[prob_id]
pattern = re.compile(r'The answer is ([A-Z]).')
our_res = pattern.findall(our_pred)
if len(our_res) == 1:
our_answer = our_res[0] # 'A', 'B', ...
else:
our_answer = "FAILED"
gpt4_res = pattern.findall(gpt4_pred)
if len(gpt4_res) == 1:
gpt4_answer = gpt4_res[0] # 'A', 'B', ...
else:
gpt4_answer = "FAILED"
our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options)
gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options)
if gpt4_answer == 'FAILED':
results['gpt4_failed'] += 1
# continue
gpt4_pred_idx = our_pred_idx
# if our_pred_idx != prob['answer']:
# print(our_predictions[prob_id]['prompt'])
# print('-----------------')
# print(f'LECTURE: {prob["lecture"]}')
# print(f'SOLUTION: {prob["solution"]}')
# print('=====================')
else:
# continue
pass
# gpt4_pred_idx = our_pred_idx
if gpt4_pred_idx == prob['answer']:
results['correct'] += 1
else:
results['incorrect'] += 1
if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']:
results['correct_upperbound'] += 1
correct = results['correct']
total = results['correct'] + results['incorrect']
print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%')
print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%')
print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%')
import argparse
import json
import os
import re
import random
from collections import defaultdict
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--base-dir', type=str)
parser.add_argument('--gpt4-result', type=str)
parser.add_argument('--requery-result', type=str)
parser.add_argument('--our-result', type=str)
parser.add_argument('--output-result', type=str)
parser.add_argument('--split', type=str, default='test')
parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
return parser.parse_args()
def convert_caps(results):
fakecaps = []
for result in results:
image_id = result['question_id']
caption = result['text']
fakecaps.append({"image_id": int(image_id), "caption": caption})
return fakecaps
def get_pred_idx(prediction, choices, options):
"""
Get the index (e.g. 2) from the prediction (e.g. 'C')
"""
if prediction in options[:len(choices)]:
return options.index(prediction)
else:
return random.choice(range(len(choices)))
if __name__ == "__main__":
args = get_args()
base_dir = args.base_dir
split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
problems = json.load(open(os.path.join(base_dir, "problems.json")))
our_predictions = [json.loads(line) for line in open(args.our_result)]
our_predictions = {pred['question_id']: pred for pred in our_predictions}
split_problems = {idx: problems[idx] for idx in split_indices}
requery_predictions = [json.loads(line) for line in open(args.requery_result)]
requery_predictions = {pred['question_id']: pred for pred in requery_predictions}
gpt4_predictions = json.load(open(args.gpt4_result))['outputs']
results = defaultdict(lambda: 0)
sqa_results = {}
sqa_results['acc'] = None
sqa_results['correct'] = None
sqa_results['count'] = None
sqa_results['results'] = {}
sqa_results['outputs'] = {}
for prob_id, prob in split_problems.items():
if prob_id not in our_predictions:
assert False
if prob_id not in gpt4_predictions:
assert False
our_pred = our_predictions[prob_id]['text']
gpt4_pred = gpt4_predictions[prob_id]
if prob_id not in requery_predictions:
results['missing_requery'] += 1
requery_pred = "MISSING"
else:
requery_pred = requery_predictions[prob_id]['text']
pattern = re.compile(r'The answer is ([A-Z]).')
our_res = pattern.findall(our_pred)
if len(our_res) == 1:
our_answer = our_res[0] # 'A', 'B', ...
else:
our_answer = "FAILED"
requery_res = pattern.findall(requery_pred)
if len(requery_res) == 1:
requery_answer = requery_res[0] # 'A', 'B', ...
else:
requery_answer = "FAILED"
gpt4_res = pattern.findall(gpt4_pred)
if len(gpt4_res) == 1:
gpt4_answer = gpt4_res[0] # 'A', 'B', ...
else:
gpt4_answer = "FAILED"
our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options)
gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options)
requery_pred_idx = get_pred_idx(requery_answer, prob['choices'], args.options)
results['total'] += 1
if gpt4_answer == 'FAILED':
results['gpt4_failed'] += 1
if gpt4_pred_idx == prob['answer']:
results['gpt4_correct'] += 1
if our_pred_idx == prob['answer']:
results['gpt4_ourvisual_correct'] += 1
elif gpt4_pred_idx == prob['answer']:
results['gpt4_correct'] += 1
results['gpt4_ourvisual_correct'] += 1
if our_pred_idx == prob['answer']:
results['our_correct'] += 1
if requery_answer == 'FAILED':
sqa_results['results'][prob_id] = our_pred_idx
if our_pred_idx == prob['answer']:
results['requery_correct'] += 1
else:
sqa_results['results'][prob_id] = requery_pred_idx
if requery_pred_idx == prob['answer']:
results['requery_correct'] += 1
else:
print(f"""
Question ({args.options[prob['answer']]}): {our_predictions[prob_id]['prompt']}
Our ({our_answer}): {our_pred}
GPT-4 ({gpt4_answer}): {gpt4_pred}
Requery ({requery_answer}): {requery_pred}
print("=====================================")
""")
if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']:
results['correct_upperbound'] += 1
total = results['total']
print(f'Total: {total}, Our-Correct: {results["our_correct"]}, Accuracy: {results["our_correct"] / total * 100:.2f}%')
print(f'Total: {total}, GPT-4-Correct: {results["gpt4_correct"]}, Accuracy: {results["gpt4_correct"] / total * 100:.2f}%')
print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%')
print(f'Total: {total}, GPT-4-OursVisual-Correct: {results["gpt4_ourvisual_correct"]}, Accuracy: {results["gpt4_ourvisual_correct"] / total * 100:.2f}%')
print(f'Total: {total}, Requery-Correct: {results["requery_correct"]}, Accuracy: {results["requery_correct"] / total * 100:.2f}%')
print(f'Total: {total}, Correct upper: {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%')
sqa_results['acc'] = results["requery_correct"] / total * 100
sqa_results['correct'] = results["requery_correct"]
sqa_results['count'] = total
with open(args.output_result, 'w') as f:
json.dump(sqa_results, f, indent=2)
import os
import argparse
import json
import re
from mgm.eval.m4c_evaluator import TextVQAAccuracyEvaluator
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--annotation-file', type=str)
parser.add_argument('--result-file', type=str)
parser.add_argument('--result-dir', type=str)
return parser.parse_args()
def prompt_processor(prompt):
if prompt.startswith('OCR tokens: '):
pattern = r"Question: (.*?) Short answer:"
match = re.search(pattern, prompt, re.DOTALL)
question = match.group(1)
elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3:
if prompt.startswith('Reference OCR token:'):
question = prompt.split('\n')[1]
else:
question = prompt.split('\n')[0]
elif len(prompt.split('\n')) == 2:
question = prompt.split('\n')[0]
else:
assert False
return question.lower()
def eval_single(annotation_file, result_file):
experiment_name = os.path.splitext(os.path.basename(result_file))[0]
print(experiment_name)
annotations = json.load(open(annotation_file))['data']
annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations}
results = [json.loads(line) for line in open(result_file)]
pred_list = []
for result in results:
annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))]
pred_list.append({
"pred_answer": result['text'],
"gt_answers": annotation['answers'],
})
evaluator = TextVQAAccuracyEvaluator()
print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))
if __name__ == "__main__":
args = get_args()
if args.result_file is not None:
eval_single(args.annotation_file, args.result_file)
if args.result_dir is not None:
for result_file in sorted(os.listdir(args.result_dir)):
if not result_file.endswith('.jsonl'):
print(f'Skipping {result_file}')
continue
eval_single(args.annotation_file, os.path.join(args.result_dir, result_file))