Commit 07dbc76b authored by dongchy920

MiniGemini_pytorch

"""Generate json file for webpage."""
import json
import os
import re
# models = ['llama', 'alpaca', 'gpt35', 'bard']
models = ['vicuna']
def read_jsonl(path: str, key: str=None):
data = []
with open(os.path.expanduser(path)) as f:
for line in f:
if not line:
continue
data.append(json.loads(line))
if key is not None:
data.sort(key=lambda x: x[key])
data = {item[key]: item for item in data}
return data
def trim_hanging_lines(s: str, n: int) -> str:
s = s.strip()
for _ in range(n):
s = s.split('\n', 1)[1].strip()
return s
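# Worked example (a sketch, using made-up inputs): trim_hanging_lines('1, 2\nAssistant 1 was helpful.', 1)
# drops the leading score line and returns 'Assistant 1 was helpful.', and
# read_jsonl('table/question.jsonl', key='question_id') returns a dict keyed by
# question_id instead of a plain list.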
if __name__ == '__main__':
questions = read_jsonl('table/question.jsonl', key='question_id')
# alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id')
# bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id')
# gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id')
# llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id')
vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id')
ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id')
review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id')
# review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id')
# review_bard = read_jsonl('table/review/review_bard_vicuna-13b.jsonl', key='question_id')
# review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id')
# review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id')
records = []
for qid in questions.keys():
r = {
'id': qid,
'category': questions[qid]['category'],
'question': questions[qid]['text'],
'answers': {
# 'alpaca': alpaca_answers[qid]['text'],
# 'llama': llama_answers[qid]['text'],
# 'bard': bard_answers[qid]['text'],
# 'gpt35': gpt35_answers[qid]['text'],
'vicuna': vicuna_answers[qid]['text'],
'ours': ours_answers[qid]['text'],
},
'evaluations': {
# 'alpaca': review_alpaca[qid]['text'],
# 'llama': review_llama[qid]['text'],
# 'bard': review_bard[qid]['text'],
'vicuna': review_vicuna[qid]['content'],
# 'gpt35': review_gpt35[qid]['text'],
},
'scores': {
'vicuna': review_vicuna[qid]['tuple'],
# 'alpaca': review_alpaca[qid]['score'],
# 'llama': review_llama[qid]['score'],
# 'bard': review_bard[qid]['score'],
# 'gpt35': review_gpt35[qid]['score'],
},
}
# cleanup data
cleaned_evals = {}
for k, v in r['evaluations'].items():
v = v.strip()
lines = v.split('\n')
# trim the first line if it's a pair of numbers
if re.match(r'\d+[, ]+\d+', lines[0]):
lines = lines[1:]
v = '\n'.join(lines)
cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**')
r['evaluations'] = cleaned_evals
records.append(r)
# Reorder the records; this step is optional
for r in records:
if r['id'] <= 20:
r['id'] += 60
else:
r['id'] -= 20
for r in records:
if r['id'] <= 50:
r['id'] += 10
elif 50 < r['id'] <= 60:
r['id'] -= 50
for r in records:
if r['id'] == 7:
r['id'] = 1
elif r['id'] < 7:
r['id'] += 1
records.sort(key=lambda x: x['id'])
# Write to file
with open('webpage/data.json', 'w') as f:
json.dump({'questions': records, 'models': models}, f, indent=2)
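# For reference, a sketch of the output shape written to webpage/data.json:
# {"questions": [{"id": ..., "category": ..., "question": ...,
#                 "answers": {"vicuna": ..., "ours": ...},
#                 "evaluations": {"vicuna": ...},
#                 "scores": {"vicuna": [...]}},
#                ...],
#  "models": ["vicuna"]}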
# Copyright (c) Facebook, Inc. and its affiliates.
import re
from tqdm import tqdm
class EvalAIAnswerProcessor:
"""
Processes an answer similar to Eval AI
copied from
https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897
"""
CONTRACTIONS = {
"aint": "ain't",
"arent": "aren't",
"cant": "can't",
"couldve": "could've",
"couldnt": "couldn't",
"couldn'tve": "couldn't've",
"couldnt've": "couldn't've",
"didnt": "didn't",
"doesnt": "doesn't",
"dont": "don't",
"hadnt": "hadn't",
"hadnt've": "hadn't've",
"hadn'tve": "hadn't've",
"hasnt": "hasn't",
"havent": "haven't",
"hed": "he'd",
"hed've": "he'd've",
"he'dve": "he'd've",
"hes": "he's",
"howd": "how'd",
"howll": "how'll",
"hows": "how's",
"Id've": "I'd've",
"I'dve": "I'd've",
"Im": "I'm",
"Ive": "I've",
"isnt": "isn't",
"itd": "it'd",
"itd've": "it'd've",
"it'dve": "it'd've",
"itll": "it'll",
"let's": "let's",
"maam": "ma'am",
"mightnt": "mightn't",
"mightnt've": "mightn't've",
"mightn'tve": "mightn't've",
"mightve": "might've",
"mustnt": "mustn't",
"mustve": "must've",
"neednt": "needn't",
"notve": "not've",
"oclock": "o'clock",
"oughtnt": "oughtn't",
"ow's'at": "'ow's'at",
"'ows'at": "'ow's'at",
"'ow'sat": "'ow's'at",
"shant": "shan't",
"shed've": "she'd've",
"she'dve": "she'd've",
"she's": "she's",
"shouldve": "should've",
"shouldnt": "shouldn't",
"shouldnt've": "shouldn't've",
"shouldn'tve": "shouldn't've",
"somebody'd": "somebodyd",
"somebodyd've": "somebody'd've",
"somebody'dve": "somebody'd've",
"somebodyll": "somebody'll",
"somebodys": "somebody's",
"someoned": "someone'd",
"someoned've": "someone'd've",
"someone'dve": "someone'd've",
"someonell": "someone'll",
"someones": "someone's",
"somethingd": "something'd",
"somethingd've": "something'd've",
"something'dve": "something'd've",
"somethingll": "something'll",
"thats": "that's",
"thered": "there'd",
"thered've": "there'd've",
"there'dve": "there'd've",
"therere": "there're",
"theres": "there's",
"theyd": "they'd",
"theyd've": "they'd've",
"they'dve": "they'd've",
"theyll": "they'll",
"theyre": "they're",
"theyve": "they've",
"twas": "'twas",
"wasnt": "wasn't",
"wed've": "we'd've",
"we'dve": "we'd've",
"weve": "we've",
"werent": "weren't",
"whatll": "what'll",
"whatre": "what're",
"whats": "what's",
"whatve": "what've",
"whens": "when's",
"whered": "where'd",
"wheres": "where's",
"whereve": "where've",
"whod": "who'd",
"whod've": "who'd've",
"who'dve": "who'd've",
"wholl": "who'll",
"whos": "who's",
"whove": "who've",
"whyll": "why'll",
"whyre": "why're",
"whys": "why's",
"wont": "won't",
"wouldve": "would've",
"wouldnt": "wouldn't",
"wouldnt've": "wouldn't've",
"wouldn'tve": "wouldn't've",
"yall": "y'all",
"yall'll": "y'all'll",
"y'allll": "y'all'll",
"yall'd've": "y'all'd've",
"y'alld've": "y'all'd've",
"y'all'dve": "y'all'd've",
"youd": "you'd",
"youd've": "you'd've",
"you'dve": "you'd've",
"youll": "you'll",
"youre": "you're",
"youve": "you've",
}
NUMBER_MAP = {
"none": "0",
"zero": "0",
"one": "1",
"two": "2",
"three": "3",
"four": "4",
"five": "5",
"six": "6",
"seven": "7",
"eight": "8",
"nine": "9",
"ten": "10",
}
ARTICLES = ["a", "an", "the"]
PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)")
PUNCTUATIONS = [
";",
r"/",
"[",
"]",
'"',
"{",
"}",
"(",
")",
"=",
"+",
"\\",
"_",
"-",
">",
"<",
"@",
"`",
",",
"?",
"!",
]
def __init__(self, *args, **kwargs):
pass
def word_tokenize(self, word):
word = word.lower()
word = word.replace(",", "").replace("?", "").replace("'s", " 's")
return word.strip()
def process_punctuation(self, in_text):
out_text = in_text
for p in self.PUNCTUATIONS:
if (p + " " in in_text or " " + p in in_text) or (
re.search(self.COMMA_STRIP, in_text) is not None
):
out_text = out_text.replace(p, "")
else:
out_text = out_text.replace(p, " ")
out_text = self.PERIOD_STRIP.sub("", out_text, re.UNICODE)
return out_text
def process_digit_article(self, in_text):
out_text = []
temp_text = in_text.lower().split()
for word in temp_text:
word = self.NUMBER_MAP.setdefault(word, word)
if word not in self.ARTICLES:
out_text.append(word)
else:
pass
for word_id, word in enumerate(out_text):
if word in self.CONTRACTIONS:
out_text[word_id] = self.CONTRACTIONS[word]
out_text = " ".join(out_text)
return out_text
def __call__(self, item):
item = self.word_tokenize(item)
item = item.replace("\n", " ").replace("\t", " ").strip()
item = self.process_punctuation(item)
item = self.process_digit_article(item)
return item
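# Usage example (a sketch): EvalAIAnswerProcessor()("Two Dogs!") returns "2 dogs",
# i.e. the answer is lower-cased, punctuation is removed and number words are mapped
# to digits; articles ("a", "an", "the") would also be dropped.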
class TextVQAAccuracyEvaluator:
def __init__(self):
self.answer_processor = EvalAIAnswerProcessor()
def _compute_answer_scores(self, raw_answers):
"""
compute the accuracy (soft score) of human answers
"""
answers = [self.answer_processor(a) for a in raw_answers]
assert len(answers) == 10
gt_answers = list(enumerate(answers))
unique_answers = set(answers)
unique_answer_scores = {}
for unique_answer in unique_answers:
accs = []
for gt_answer in gt_answers:
other_answers = [item for item in gt_answers if item != gt_answer]
matching_answers = [
item for item in other_answers if item[1] == unique_answer
]
acc = min(1, float(len(matching_answers)) / 3)
accs.append(acc)
unique_answer_scores[unique_answer] = sum(accs) / len(accs)
return unique_answer_scores
def eval_pred_list(self, pred_list):
pred_scores = []
for entry in tqdm(pred_list):
pred_answer = self.answer_processor(entry["pred_answer"])
unique_answer_scores = self._compute_answer_scores(entry["gt_answers"])
score = unique_answer_scores.get(pred_answer, 0.0)
pred_scores.append(score)
accuracy = sum(pred_scores) / len(pred_scores)
return accuracy
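# Worked example (a sketch): if "cat" appears in 2 of the 10 human answers, the
# leave-one-out loop sees 1 matching answer (acc = 1/3) when the held-out answer
# is "cat" and 2 matches (acc = 2/3) otherwise, so the soft score for "cat" is
# (2 * 1/3 + 8 * 2/3) / 10 = 0.6.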
class STVQAAccuracyEvaluator:
def __init__(self):
self.answer_processor = EvalAIAnswerProcessor()
def eval_pred_list(self, pred_list):
pred_scores = []
for entry in pred_list:
pred_answer = self.answer_processor(entry["pred_answer"])
gts = [self.answer_processor(a) for a in entry["gt_answers"]]
score = 1.0 if pred_answer in gts else 0.0
pred_scores.append(score)
accuracy = sum(pred_scores) / len(pred_scores)
return accuracy
class STVQAANLSEvaluator:
def __init__(self):
import editdistance # install with `pip install editdistance`
self.get_edit_distance = editdistance.eval
def get_anls(self, s1, s2):
s1 = s1.lower().strip()
s2 = s2.lower().strip()
iou = 1 - self.get_edit_distance(s1, s2) / max(len(s1), len(s2))
anls = iou if iou >= 0.5 else 0.0
return anls
def eval_pred_list(self, pred_list):
pred_scores = []
for entry in pred_list:
anls = max(
self.get_anls(entry["pred_answer"], gt) for gt in entry["gt_answers"]
)
pred_scores.append(anls)
accuracy = sum(pred_scores) / len(pred_scores)
return accuracy
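# Worked example (a sketch): get_anls("hello", "hallo") has edit distance 1 over a
# maximum length of 5, giving a similarity of 0.8; since 0.8 >= 0.5 it is kept,
# whereas any pair with similarity below 0.5 contributes 0.0.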
class TextCapsBleu4Evaluator:
def __init__(self):
# The following script requires Java 1.8.0 and pycocotools installed.
# The pycocoevalcap can be installed with pip as
# pip install git+https://github.com/ronghanghu/coco-caption.git@python23
# Original pycocoevalcap code is at https://github.com/tylin/coco-caption
# but has no python3 support yet.
try:
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
except ModuleNotFoundError:
print(
"Please install pycocoevalcap module using "
"pip install git+https://github.com/ronghanghu/coco-caption.git@python23" # noqa
)
raise
self.tokenizer = PTBTokenizer()
self.scorer = Bleu(4)
def eval_pred_list(self, pred_list):
# Create reference and hypotheses captions.
gts = {}
res = {}
for idx, entry in enumerate(pred_list):
gts[idx] = [{"caption": a} for a in entry["gt_answers"]]
res[idx] = [{"caption": entry["pred_answer"]}]
gts = self.tokenizer.tokenize(gts)
res = self.tokenizer.tokenize(res)
score, _ = self.scorer.compute_score(gts, res)
bleu4 = score[3] # score is (Bleu-1, Bleu-2, Bleu-3, Bleu-4)
return bleu4
import argparse
import torch
import os
import json
from tqdm import tqdm
from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from mgm.conversation import conv_templates, SeparatorStyle
from mgm.model.builder import load_pretrained_model
from mgm.utils import disable_torch_init
from mgm.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from PIL import Image
import math
def split_list(lst, n):
"""Split a list into n (roughly) equal-sized chunks"""
chunk_size = math.ceil(len(lst) / n) # ceiling division, so no items are dropped
return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
def get_chunk(lst, n, k):
chunks = split_list(lst, n)
return chunks[k]
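# Example (a sketch): split_list(list(range(10)), 3) uses chunk_size = ceil(10 / 3) = 4
# and returns [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]], so get_chunk(list(range(10)), 3, 1)
# yields [4, 5, 6, 7].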
def create_one_query(problem, shot_num, shot_type, use_caption):
### [1] Demo prompt
demo_prompt = ""
### [2] Test query
# problem info
question = problem['question']
unit = problem['unit']
choices = problem['choices']
# caption = problem['caption']
precision = problem['precision']
question_type = problem['question_type']
answer_type = problem['answer_type']
# hint
if shot_type == 'solution':
if question_type == "multi_choice":
assert answer_type == "text"
hint_text = f"Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end."
else:
assert answer_type in ["integer", "float", "list"]
if answer_type == "integer":
hint_text = f"Hint: Please answer the question requiring an integer answer and provide the final value, e.g., 1, 2, 3, at the end."
elif answer_type == "float" and precision == 1:
hint_text = f"Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value, e.g., 1.2, 1.3, 1.4, at the end."
elif answer_type == "float" and precision == 2:
hint_text = f"Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value, e.g., 1.23, 1.34, 1.45, at the end."
elif answer_type == "list":
hint_text = f"Hint: Please answer the question requiring a Python list as an answer and provide the final list, e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end."
else:
assert shot_type == 'code'
hint_text = "Hint: Please generate a python code to solve the problem"
# question
question_text = f"Question: {question}"
if unit:
question_text += f" (Unit: {unit})"
# choices
if choices:
# choices: (A) 1.2 (B) 1.3 (C) 1.4 (D) 1.5
texts = ["Choices:"]
for i, choice in enumerate(choices):
texts.append(f"({chr(ord('A')+i)}) {choice}")
choices_text = "\n".join(texts)
else:
choices_text = ""
# prompt
if shot_type == 'solution':
prompt = "Solution: "
else:
assert shot_type == 'code'
prompt = "Python code: "
elements = [hint_text, question_text, choices_text]
test_query = "\n".join([e for e in elements if e != ""])
### [3] Final query
query = demo_prompt + "\n\n" + test_query
query = query.strip()
return query
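# Example output of create_one_query (a sketch, for a hypothetical multi_choice
# problem with unit "cm" and choices ["1.2", "1.3"] under shot_type='solution'):
#   Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.
#   Question: <question text> (Unit: cm)
#   Choices:
#   (A) 1.2
#   (B) 1.3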
def eval_model(args):
# Model
disable_torch_init()
model_path = os.path.expanduser(args.model_path)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name,
load_8bit=args.load_8bit)
questions = json.load(open(os.path.expanduser(args.question_file), "r"))
questions = [dict(pid=pid, info=qs) for pid, qs in questions.items()]
questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
answers_file = os.path.expanduser(args.answers_file)
os.makedirs(os.path.dirname(answers_file), exist_ok=True)
if os.path.exists(answers_file):
file = open(answers_file, "r")
pred_contents = [json.loads(line) for line in file]
done_pid = [sample['pid'] for sample in pred_contents]
else:
done_pid = []
ans_file = open(answers_file, "a")
for i, line in enumerate(tqdm(questions)):
idx = line['pid']
info = line['info']
if idx in done_pid:
continue
qs = create_one_query(
problem = info,
shot_num = 0,
shot_type = 'solution',
use_caption = False,
)
query = qs
if 'image' in info:
image_file = info["image"]
image = Image.open(os.path.join(args.image_folder, image_file))
if hasattr(model.config, 'image_size_aux'):
if not hasattr(image_processor, 'image_size_raw'):
image_processor.image_size_raw = image_processor.crop_size.copy()
image_processor.crop_size['height'] = model.config.image_size_aux
image_processor.crop_size['width'] = model.config.image_size_aux
image_processor.size['shortest_edge'] = model.config.image_size_aux
image_tensor = process_images([image], image_processor, model.config)[0]
image_grid = getattr(model.config, 'image_grid', 1)
if hasattr(model.config, 'image_size_aux'):
raw_shape = [image_processor.image_size_raw['height'] * image_grid,
image_processor.image_size_raw['width'] * image_grid]
image_tensor_aux = image_tensor
image_tensor = torch.nn.functional.interpolate(image_tensor[None],
size=raw_shape,
mode='bilinear',
align_corners=False)[0]
else:
image_tensor_aux = []
if image_grid >= 2:
raw_image = image_tensor.reshape(3,
image_grid,
image_processor.image_size_raw['height'],
image_grid,
image_processor.image_size_raw['width'])
raw_image = raw_image.permute(1, 3, 0, 2, 4)
raw_image = raw_image.reshape(-1, 3,
image_processor.image_size_raw['height'],
image_processor.image_size_raw['width'])
if getattr(model.config, 'image_global', False):
global_image = image_tensor
if len(global_image.shape) == 3:
global_image = global_image[None]
global_image = torch.nn.functional.interpolate(global_image,
size=[image_processor.image_size_raw['height'],
image_processor.image_size_raw['width']],
mode='bilinear',
align_corners=False)
# [image_crops, image_global]
raw_image = torch.cat([raw_image, global_image], dim=0)
image_tensor = raw_image.contiguous()
images = image_tensor[None].to(dtype=model.dtype, device='cuda', non_blocking=True)
images_aux = image_tensor_aux[None].to(dtype=model.dtype, device='cuda', non_blocking=True) if len(image_tensor_aux)>0 else None
if getattr(model.config, 'mm_use_im_start_end', False):
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
else:
images = None
images_aux = None
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
terminators = tokenizer.eos_token_id
if "llama_3" in args.conv_mode:
terminators = [terminators, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=images,
images_aux=images_aux,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
max_new_tokens=1024,
bos_token_id=tokenizer.bos_token_id, # Begin of sequence token
eos_token_id=terminators, # End of sequence token
pad_token_id=tokenizer.pad_token_id, # Pad token
use_cache=True,
)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
info['query'] = query
info['response'] = outputs
ans_file.write(json.dumps(info) + "\n")
ans_file.flush()
ans_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
parser.add_argument("--model-base", type=str, default=None)
parser.add_argument("--image-folder", type=str, default="")
parser.add_argument("--question-file", type=str, default="tables/question.json")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
parser.add_argument("--conv-mode", type=str, default="llava_v0")
parser.add_argument("--num-chunks", type=int, default=1)
parser.add_argument("--chunk-idx", type=int, default=0)
parser.add_argument("--temperature", type=float, default=0.2)
parser.add_argument("--answer-prompter", action="store_true")
parser.add_argument('--load_8bit', type=bool, default=False)
parser.add_argument("--single-pred-prompt", action="store_true")
args = parser.parse_args()
eval_model(args)
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria
import torch
import os
import json
from tqdm import tqdm
import shortuuid
from mgm.conversation import default_conversation
from mgm.utils import disable_torch_init
@torch.inference_mode()
def eval_model(model_name, questions_file, answers_file):
# Model
disable_torch_init()
model_name = os.path.expanduser(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_name,
torch_dtype=torch.float16).cuda()
ques_file = open(os.path.expanduser(questions_file), "r")
ans_file = open(os.path.expanduser(answers_file), "w")
for i, line in enumerate(tqdm(ques_file)):
idx = json.loads(line)["question_id"]
qs = json.loads(line)["text"]
cat = json.loads(line)["category"]
conv = default_conversation.copy()
conv.append_message(conv.roles[0], qs)
prompt = conv.get_prompt()
inputs = tokenizer([prompt])
input_ids = torch.as_tensor(inputs.input_ids).cuda()
output_ids = model.generate(
input_ids,
do_sample=True,
use_cache=True,
temperature=0.7,
max_new_tokens=1024,)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
try:
index = outputs.index(conv.sep, len(prompt))
except ValueError:
outputs += conv.sep
index = outputs.index(conv.sep, len(prompt))
outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip()
ans_id = shortuuid.uuid()
ans_file.write(json.dumps({"question_id": idx,
"text": outputs,
"answer_id": ans_id,
"model_id": model_name,
"metadata": {}}) + "\n")
ans_file.flush()
ans_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-name", type=str, default="facebook/opt-350m")
parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
args = parser.parse_args()
eval_model(args.model_name, args.question_file, args.answers_file)
import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid
from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from mgm.conversation import conv_templates, SeparatorStyle
from mgm.model.builder import load_pretrained_model
from mgm.utils import disable_torch_init
from mgm.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from PIL import Image
import math
def split_list(lst, n):
"""Split a list into n (roughly) equal-sized chunks"""
chunk_size = math.ceil(len(lst) / n) # ceiling division, so no items are dropped
return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
def get_chunk(lst, n, k):
chunks = split_list(lst, n)
return chunks[k]
def eval_model(args):
# Model
disable_torch_init()
model_path = os.path.expanduser(args.model_path)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
answers_file = os.path.expanduser(args.answers_file)
os.makedirs(os.path.dirname(answers_file), exist_ok=True)
ans_file = open(answers_file, "w")
for line in tqdm(questions):
idx = line["question_id"]
image_file = line["image"]
qs = line["text"]
cur_prompt = qs
if hasattr(model, "update_prompt"):
model.update_prompt([[cur_prompt]])
if model.config.mm_use_im_start_end:
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB')
if hasattr(model.config, 'image_size_aux'):
if not hasattr(image_processor, 'image_size_raw'):
image_processor.image_size_raw = image_processor.crop_size.copy()
image_processor.crop_size['height'] = model.config.image_size_aux
image_processor.crop_size['width'] = model.config.image_size_aux
image_processor.size['shortest_edge'] = model.config.image_size_aux
image_tensor = process_images([image], image_processor, model.config)[0]
image_grid = getattr(model.config, 'image_grid', 1)
if hasattr(model.config, 'image_size_aux'):
raw_shape = [image_processor.image_size_raw['height'] * image_grid,
image_processor.image_size_raw['width'] * image_grid]
image_tensor_aux = image_tensor
image_tensor = torch.nn.functional.interpolate(image_tensor[None],
size=raw_shape,
mode='bilinear',
align_corners=False)[0]
else:
image_tensor_aux = []
if image_grid >= 2:
raw_image = image_tensor.reshape(3,
image_grid,
image_processor.image_size_raw['height'],
image_grid,
image_processor.image_size_raw['width'])
raw_image = raw_image.permute(1, 3, 0, 2, 4)
raw_image = raw_image.reshape(-1, 3,
image_processor.image_size_raw['height'],
image_processor.image_size_raw['width'])
if getattr(model.config, 'image_global', False):
global_image = image_tensor
if len(global_image.shape) == 3:
global_image = global_image[None]
global_image = torch.nn.functional.interpolate(global_image,
size=[image_processor.image_size_raw['height'],
image_processor.image_size_raw['width']],
mode='bilinear',
align_corners=False)
# [image_crops, image_global]
raw_image = torch.cat([raw_image, global_image], dim=0)
image_tensor = raw_image.contiguous()
images = image_tensor[None].to(dtype=model.dtype, device='cuda', non_blocking=True)
images_aux = image_tensor_aux[None].to(dtype=model.dtype, device='cuda', non_blocking=True) if len(image_tensor_aux)>0 else None
terminators = tokenizer.eos_token_id
if "llama_3" in args.conv_mode:
terminators = [terminators, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=images,
images_aux=images_aux,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
top_p=args.top_p,
num_beams=args.num_beams,
max_new_tokens=1024,
bos_token_id=tokenizer.bos_token_id, # Begin of sequence token
eos_token_id=terminators, # End of sequence token
pad_token_id=tokenizer.pad_token_id, # Pad token
use_cache=True)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
ans_id = shortuuid.uuid()
ans_file.write(json.dumps({"question_id": idx,
"prompt": cur_prompt,
"text": outputs,
"answer_id": ans_id,
"model_id": model_name,
"metadata": {}}) + "\n")
ans_file.flush()
ans_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
parser.add_argument("--model-base", type=str, default=None)
parser.add_argument("--image-folder", type=str, default="")
parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
parser.add_argument("--conv-mode", type=str, default="llava_v1")
parser.add_argument("--num-chunks", type=int, default=1)
parser.add_argument("--chunk-idx", type=int, default=0)
parser.add_argument("--temperature", type=float, default=0.2)
parser.add_argument("--top_p", type=float, default=None)
parser.add_argument("--num_beams", type=int, default=1)
args = parser.parse_args()
eval_model(args)
import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid
from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from mgm.conversation import conv_templates, SeparatorStyle
from mgm.model.builder import load_pretrained_model
from mgm.utils import disable_torch_init
from mgm.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import math
def split_list(lst, n):
"""Split a list into n (roughly) equal-sized chunks"""
chunk_size = math.ceil(len(lst) / n) # ceiling division, so no items are dropped
return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
def get_chunk(lst, n, k):
chunks = split_list(lst, n)
return chunks[k]
# Custom dataset class
class CustomDataset(Dataset):
def __init__(self, questions, image_folder, tokenizer, image_processor, model_config):
self.questions = questions
self.image_folder = image_folder
self.tokenizer = tokenizer
self.image_processor = image_processor
self.model_config = model_config
def __getitem__(self, index):
line = self.questions[index]
image_file = line["image"]
qs = line["text"]
if self.model_config.mm_use_im_start_end:
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
if hasattr(self.model_config, 'image_size_aux'):
if not hasattr(self.image_processor, 'image_size_raw'):
self.image_processor.image_size_raw = self.image_processor.crop_size.copy()
self.image_processor.crop_size['height'] = self.model_config.image_size_aux
self.image_processor.crop_size['width'] = self.model_config.image_size_aux
self.image_processor.size['shortest_edge'] = self.model_config.image_size_aux
image_tensor = process_images([image], self.image_processor, self.model_config)[0]
input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
image_grid = getattr(self.model_config, 'image_grid', 1)
if hasattr(self.model_config, 'image_size_aux'):
raw_shape = [self.image_processor.image_size_raw['height'] * image_grid,
self.image_processor.image_size_raw['width'] * image_grid]
image_tensor_aux = image_tensor
image_tensor = torch.nn.functional.interpolate(image_tensor[None],
size=raw_shape,
mode='bilinear',
align_corners=False)[0]
else:
image_tensor_aux = []
if image_grid >= 2:
raw_image = image_tensor.reshape(3,
image_grid,
self.image_processor.image_size_raw['height'],
image_grid,
self.image_processor.image_size_raw['width'])
raw_image = raw_image.permute(1, 3, 0, 2, 4)
raw_image = raw_image.reshape(-1, 3,
self.image_processor.image_size_raw['height'],
self.image_processor.image_size_raw['width'])
if getattr(self.model_config, 'image_global', False):
global_image = image_tensor
if len(global_image.shape) == 3:
global_image = global_image[None]
global_image = torch.nn.functional.interpolate(global_image,
size=[self.image_processor.image_size_raw['height'],
self.image_processor.image_size_raw['width']],
mode='bilinear',
align_corners=False)
# [image_crops, image_global]
raw_image = torch.cat([raw_image, global_image], dim=0)
image_tensor = raw_image.contiguous()
return input_ids, image_tensor, image_tensor_aux
def __len__(self):
return len(self.questions)
# DataLoader
def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4):
assert batch_size == 1, "batch_size must be 1"
dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config)
data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
return data_loader
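# Note (a sketch of the intended usage): with batch_size fixed to 1 and shuffle=False,
# the loader yields one (input_ids, image_tensor, image_tensor_aux) triple per question,
# in the same order as `questions`, which is why eval_model below zips the loader with
# the question list.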
def eval_model(args):
# Model
disable_torch_init()
model_path = os.path.expanduser(args.model_path)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name, load_8bit=args.load_8bit)
questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
answers_file = os.path.expanduser(args.answers_file)
os.makedirs(os.path.dirname(answers_file), exist_ok=True)
ans_file = open(answers_file, "w")
if 'plain' in args.conv_mode and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
args.conv_mode = args.conv_mode + '_mmtag'
print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')
data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config)
for (input_ids, image_tensor, image_tensor_aux), line in tqdm(zip(data_loader, questions), total=len(questions)):
idx = line["question_id"]
cur_prompt = line["text"]
input_ids = input_ids.to(device=model.device, non_blocking=True)
if hasattr(model, "update_prompt"):
model.update_prompt([[cur_prompt]])
terminators = tokenizer.eos_token_id
if "llama_3" in args.conv_mode:
terminators = [terminators, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=image_tensor.to(dtype=model.dtype, device=model.device, non_blocking=True),
images_aux=image_tensor_aux.to(dtype=model.dtype, device=model.device, non_blocking=True) if len(image_tensor_aux)>0 else None,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
top_p=args.top_p,
num_beams=args.num_beams,
max_new_tokens=args.max_new_tokens,
bos_token_id=tokenizer.bos_token_id, # Begin of sequence token
eos_token_id=terminators, # End of sequence token
pad_token_id=tokenizer.pad_token_id, # Pad token
use_cache=True)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
ans_id = shortuuid.uuid()
ans_file.write(json.dumps({"question_id": idx,
"prompt": cur_prompt,
"text": outputs,
"answer_id": ans_id,
"model_id": model_name,
"metadata": {}}) + "\n")
# ans_file.flush()
ans_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
parser.add_argument("--model-base", type=str, default=None)
parser.add_argument("--image-folder", type=str, default="")
parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
parser.add_argument("--conv-mode", type=str, default="llava_v1")
parser.add_argument("--num-chunks", type=int, default=1)
parser.add_argument("--chunk-idx", type=int, default=0)
parser.add_argument("--temperature", type=float, default=0.2)
parser.add_argument("--top_p", type=float, default=None)
parser.add_argument("--num_beams", type=int, default=1)
parser.add_argument('--load_8bit', type=bool, default=False)
parser.add_argument("--max_new_tokens", type=int, default=128)
args = parser.parse_args()
eval_model(args)
import argparse
import torch
import os
import json
import pandas as pd
from tqdm import tqdm
import shortuuid
from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from mgm.conversation import conv_templates, SeparatorStyle
from mgm.model.builder import load_pretrained_model
from mgm.utils import disable_torch_init
from mgm.mm_utils import tokenizer_image_token, process_images, load_image_from_base64, get_model_name_from_path
from PIL import Image
import math
all_options = ['A', 'B', 'C', 'D']
def split_list(lst, n):
"""Split a list into n (roughly) equal-sized chunks"""
chunk_size = math.ceil(len(lst) / n) # ceiling division, so no items are dropped
return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
def get_chunk(lst, n, k):
chunks = split_list(lst, n)
return chunks[k]
def is_none(value):
if value is None:
return True
if type(value) is float and math.isnan(value):
return True
if type(value) is str and value.lower() == 'nan':
return True
if type(value) is str and value.lower() == 'none':
return True
return False
def get_options(row, options):
parsed_options = []
for option in options:
option_value = row[option]
if is_none(option_value):
break
parsed_options.append(option_value)
return parsed_options
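# Example (a sketch): for a row with A="cat", B="dog" and C/D empty (NaN),
# get_options(row, ['A', 'B', 'C', 'D']) returns ["cat", "dog"]; parsing stops
# at the first missing option.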
def eval_model(args):
# Model
disable_torch_init()
model_path = os.path.expanduser(args.model_path)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
questions = pd.read_table(os.path.expanduser(args.question_file))
questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
answers_file = os.path.expanduser(args.answers_file)
os.makedirs(os.path.dirname(answers_file), exist_ok=True)
ans_file = open(answers_file, "w")
if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
args.conv_mode = args.conv_mode + '_mmtag'
print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')
for index, row in tqdm(questions.iterrows(), total=len(questions)):
options = get_options(row, all_options)
cur_option_char = all_options[:len(options)]
if args.all_rounds:
num_rounds = len(options)
else:
num_rounds = 1
for round_idx in range(num_rounds):
idx = row['index']
question = row['question']
hint = row['hint']
image = load_image_from_base64(row['image'])
if not is_none(hint):
question = hint + '\n' + question
for option_char, option in zip(all_options[:len(options)], options):
question = question + '\n' + option_char + '. ' + option
qs = cur_prompt = question
if hasattr(model, "update_prompt"):
model.update_prompt([[cur_prompt]])
if model.config.mm_use_im_start_end:
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
if args.single_pred_prompt:
if args.lang == 'cn':
qs = qs + '\n' + "请直接回答选项字母。"
else:
qs = qs + '\n' + "Answer with the option's letter from the given choices directly."
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
if hasattr(model.config, 'image_size_aux'):
if not hasattr(image_processor, 'image_size_raw'):
image_processor.image_size_raw = image_processor.crop_size.copy()
image_processor.crop_size['height'] = model.config.image_size_aux
image_processor.crop_size['width'] = model.config.image_size_aux
image_processor.size['shortest_edge'] = model.config.image_size_aux
image_tensor = process_images([image], image_processor, model.config)[0]
image_grid = getattr(model.config, 'image_grid', 1)
if hasattr(model.config, 'image_size_aux'):
raw_shape = [image_processor.image_size_raw['height'] * image_grid,
image_processor.image_size_raw['width'] * image_grid]
image_tensor_aux = image_tensor
image_tensor = torch.nn.functional.interpolate(image_tensor[None],
size=raw_shape,
mode='bilinear',
align_corners=False)[0]
else:
image_tensor_aux = []
if image_grid >= 2:
raw_image = image_tensor.reshape(3,
image_grid,
image_processor.image_size_raw['height'],
image_grid,
image_processor.image_size_raw['width'])
raw_image = raw_image.permute(1, 3, 0, 2, 4)
raw_image = raw_image.reshape(-1, 3,
image_processor.image_size_raw['height'],
image_processor.image_size_raw['width'])
if getattr(model.config, 'image_global', False):
global_image = image_tensor
if len(global_image.shape) == 3:
global_image = global_image[None]
global_image = torch.nn.functional.interpolate(global_image,
size=[image_processor.image_size_raw['height'],
image_processor.image_size_raw['width']],
mode='bilinear',
align_corners=False)
# [image_crops, image_global]
raw_image = torch.cat([raw_image, global_image], dim=0)
image_tensor = raw_image.contiguous()
images = image_tensor[None].to(dtype=model.dtype, device='cuda', non_blocking=True)
images_aux = image_tensor_aux[None].to(dtype=model.dtype, device='cuda', non_blocking=True) if len(image_tensor_aux)>0 else None
terminators = tokenizer.eos_token_id
if "llama_3" in args.conv_mode:
terminators = [terminators, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=images,
images_aux=images_aux,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
top_p=args.top_p,
num_beams=args.num_beams,
# no_repeat_ngram_size=3,
max_new_tokens=1024,
bos_token_id=tokenizer.bos_token_id, # Begin of sequence token
eos_token_id=terminators, # End of sequence token
pad_token_id=tokenizer.pad_token_id, # Pad token
use_cache=True)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
ans_id = shortuuid.uuid()
ans_file.write(json.dumps({"question_id": idx,
"round_id": round_idx,
"prompt": cur_prompt,
"text": outputs,
"options": options,
"option_char": cur_option_char,
"answer_id": ans_id,
"model_id": model_name,
"metadata": {}}) + "\n")
ans_file.flush()
# rotate options
options = options[1:] + options[:1]
cur_option_char = cur_option_char[1:] + cur_option_char[:1]
ans_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
parser.add_argument("--model-base", type=str, default=None)
parser.add_argument("--image-folder", type=str, default="")
parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
parser.add_argument("--conv-mode", type=str, default="llava_v1")
parser.add_argument("--num-chunks", type=int, default=1)
parser.add_argument("--chunk-idx", type=int, default=0)
parser.add_argument("--temperature", type=float, default=0.2)
parser.add_argument("--top_p", type=float, default=None)
parser.add_argument("--num_beams", type=int, default=1)
parser.add_argument("--all-rounds", action="store_true")
parser.add_argument("--single-pred-prompt", action="store_true")
parser.add_argument("--lang", type=str, default="en")
args = parser.parse_args()
eval_model(args)
import argparse
import torch
from tqdm import tqdm
import json
from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from mgm.conversation import conv_templates, SeparatorStyle
from mgm.model.builder import load_pretrained_model
from mgm.utils import disable_torch_init
from mgm.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
import requests
from PIL import Image
from io import BytesIO
def load_image(image_file):
if image_file.startswith('http') or image_file.startswith('https'):
response = requests.get(image_file)
image = Image.open(BytesIO(response.content)).convert('RGB')
else:
image = Image.open(image_file).convert('RGB')
return image
def eval_model(args):
# Model
disable_torch_init()
model_name = get_model_name_from_path(args.model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, True)
with open(args.questions_file) as f:
llvqa_data = json.load(f)
for i, llddata in enumerate(tqdm(llvqa_data)):
filename = llddata["img_path"]
if args.lang == "en":
message = llddata["question"] + "\nChoose between one of the options as follows:\n"
elif args.lang == "zh":
message = llddata["question"] + "\n在下列选项中选择一个:\n"
else:
raise NotImplementedError("Q-Bench does not support languages other than English (en) and Chinese (zh) yet. Contact us (https://github.com/VQAssessment/Q-Bench/) to convert Q-Bench into more languages.")
for choice, ans in zip(["A.", "B.", "C.", "D."], llddata["candidates"]):
message += f"{choice} {ans}\n"
qs = message
if model.config.mm_use_im_start_end:
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
if 'llama-2' in model_name.lower():
conv_mode = "llava_llama_2"
elif "v1" in model_name.lower():
conv_mode = "llava_v1"
elif "mpt" in model_name.lower():
conv_mode = "mpt"
else:
conv_mode = "llava_v0"
if args.conv_mode is not None and conv_mode != args.conv_mode:
print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode))
else:
args.conv_mode = conv_mode
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
image = load_image(args.image_folder + filename)
image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda()
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=image_tensor,
num_beams=1,
do_sample=False,
temperature=0,
max_new_tokens=1024,
use_cache=True,
stopping_criteria=[stopping_criteria])
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[:-len(stop_str)]
outputs = outputs.strip()
llddata["response"] = outputs
with open(args.answers_file, "a") as wf:
json.dump(llddata, wf)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="llava-v1.5")
parser.add_argument("--model-base", type=str, default=None)
parser.add_argument("--image-folder", type=str, default="./playground/data/qbench/images_llvisionqa")
parser.add_argument("--questions-file", type=str, default="./playground/data/qbench/llvisionqa_dev.json")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
parser.add_argument("--conv-mode", type=str, default="llava_v1")
parser.add_argument("--lang", type=str, default="en")
args = parser.parse_args()
eval_model(args)
import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid
from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from mgm.conversation import conv_templates, SeparatorStyle
from mgm.model.builder import load_pretrained_model
from mgm.utils import disable_torch_init
from mgm.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from PIL import Image
import math
def split_list(lst, n):
"""Split a list into n (roughly) equal-sized chunks"""
chunk_size = math.ceil(len(lst) / n) # ceiling division, so no items are dropped
return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
def get_chunk(lst, n, k):
chunks = split_list(lst, n)
return chunks[k]
def eval_model(args):
# Model
disable_torch_init()
model_path = os.path.expanduser(args.model_path)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
questions = json.load(open(os.path.expanduser(args.question_file), "r"))
questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
answers_file = os.path.expanduser(args.answers_file)
os.makedirs(os.path.dirname(answers_file), exist_ok=True)
ans_file = open(answers_file, "w")
for i, line in enumerate(tqdm(questions)):
idx = line["id"]
question = line['conversations'][0]
qs = question['value'].replace('<image>', '').strip()
cur_prompt = qs
if 'image' in line:
image_file = line["image"]
image = Image.open(os.path.join(args.image_folder, image_file))
if hasattr(model.config, 'image_size_aux'):
if not hasattr(image_processor, 'image_size_raw'):
image_processor.image_size_raw = image_processor.crop_size.copy()
image_processor.crop_size['height'] = model.config.image_size_aux
image_processor.crop_size['width'] = model.config.image_size_aux
image_processor.size['shortest_edge'] = model.config.image_size_aux
image_tensor = process_images([image], image_processor, model.config)[0]
image_grid = getattr(model.config, 'image_grid', 1)
if hasattr(model.config, 'image_size_aux'):
raw_shape = [image_processor.image_size_raw['height'] * image_grid,
image_processor.image_size_raw['width'] * image_grid]
image_tensor_aux = image_tensor
image_tensor = torch.nn.functional.interpolate(image_tensor[None],
size=raw_shape,
mode='bilinear',
align_corners=False)[0]
else:
image_tensor_aux = []
if image_grid >= 2:
raw_image = image_tensor.reshape(3,
image_grid,
image_processor.image_size_raw['height'],
image_grid,
image_processor.image_size_raw['width'])
raw_image = raw_image.permute(1, 3, 0, 2, 4)
raw_image = raw_image.reshape(-1, 3,
image_processor.image_size_raw['height'],
image_processor.image_size_raw['width'])
if getattr(model.config, 'image_global', False):
global_image = image_tensor
if len(global_image.shape) == 3:
global_image = global_image[None]
global_image = torch.nn.functional.interpolate(global_image,
size=[image_processor.image_size_raw['height'],
image_processor.image_size_raw['width']],
mode='bilinear',
align_corners=False)
# [image_crops, image_global]
raw_image = torch.cat([raw_image, global_image], dim=0)
image_tensor = raw_image.contiguous()
images = image_tensor[None].to(dtype=model.dtype, device='cuda', non_blocking=True)
images_aux = image_tensor_aux[None].to(dtype=model.dtype, device='cuda', non_blocking=True) if len(image_tensor_aux)>0 else None
if getattr(model.config, 'mm_use_im_start_end', False):
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
cur_prompt = '<image>' + '\n' + cur_prompt
else:
images = None
images_aux = None
if args.single_pred_prompt:
qs = qs + '\n' + "Answer with the option's letter from the given choices directly."
cur_prompt = cur_prompt + '\n' + "Answer with the option's letter from the given choices directly."
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
if hasattr(model, "update_prompt"):
model.update_prompt([[cur_prompt]])
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=images,
images_aux=images_aux,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
max_new_tokens=1024,
bos_token_id=tokenizer.bos_token_id, # Begin of sequence token
eos_token_id=tokenizer.eos_token_id, # End of sequence token
pad_token_id=tokenizer.pad_token_id, # Pad token
use_cache=True,
)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
ans_id = shortuuid.uuid()
ans_file.write(json.dumps({"question_id": idx,
"prompt": cur_prompt,
"text": outputs,
"answer_id": ans_id,
"model_id": model_name,
"metadata": {}}) + "\n")
ans_file.flush()
ans_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
parser.add_argument("--model-base", type=str, default=None)
parser.add_argument("--image-folder", type=str, default="")
parser.add_argument("--question-file", type=str, default="tables/question.json")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
parser.add_argument("--conv-mode", type=str, default="llava_v0")
parser.add_argument("--num-chunks", type=int, default=1)
parser.add_argument("--chunk-idx", type=int, default=0)
parser.add_argument("--temperature", type=float, default=0.2)
parser.add_argument("--answer-prompter", action="store_true")
parser.add_argument("--single-pred-prompt", action="store_true")
args = parser.parse_args()
eval_model(args)
"""Generate answers with GPT-3.5"""
# Note: you need to be using OpenAI Python v0.27.0 for the code below to work
import argparse
import json
import os
import time
import concurrent.futures
import openai
import tqdm
import shortuuid
MODEL = 'gpt-3.5-turbo'
MODEL_ID = 'gpt-3.5-turbo:20230327'
def get_answer(question_id: int, question: str, max_tokens: int):
ans = {
'answer_id': shortuuid.uuid(),
'question_id': question_id,
'model_id': MODEL_ID,
}
for _ in range(3):
try:
response = openai.ChatCompletion.create(
model=MODEL,
messages=[{
'role': 'system',
'content': 'You are a helpful assistant.'
}, {
'role': 'user',
'content': question,
}],
max_tokens=max_tokens,
)
ans['text'] = response['choices'][0]['message']['content']
return ans
except Exception as e:
print('[ERROR]', e)
ans['text'] = '#ERROR#'
time.sleep(1)
return ans
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ChatGPT answer generation.')
parser.add_argument('-q', '--question')
parser.add_argument('-o', '--output')
parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
args = parser.parse_args()
questions_dict = {}
with open(os.path.expanduser(args.question)) as f:
for line in f:
if not line:
continue
q = json.loads(line)
questions_dict[q['question_id']] = q['text']
answers = []
with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
futures = []
for qid, question in questions_dict.items():
future = executor.submit(get_answer, qid, question, args.max_tokens)
futures.append(future)
for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
answers.append(future.result())
answers.sort(key=lambda x: x['question_id'])
with open(os.path.expanduser(args.output), 'w') as f:
table = [json.dumps(ans) for ans in answers]
f.write('\n'.join(table))
import argparse
import torch
from mgm.constants import (
IMAGE_TOKEN_INDEX,
DEFAULT_IMAGE_TOKEN,
DEFAULT_IM_START_TOKEN,
DEFAULT_IM_END_TOKEN,
IMAGE_PLACEHOLDER,
)
from mgm.conversation import conv_templates, SeparatorStyle
from mgm.model.builder import load_pretrained_model
from mgm.utils import disable_torch_init
from mgm.mm_utils import (
process_images,
tokenizer_image_token,
get_model_name_from_path,
)
import requests
from PIL import Image
from io import BytesIO
import re
def image_parser(args):
out = args.image_file.split(args.sep)
return out
def load_image(image_file):
if image_file.startswith("http") or image_file.startswith("https"):
response = requests.get(image_file)
image = Image.open(BytesIO(response.content)).convert("RGB")
else:
image = Image.open(image_file).convert("RGB")
return image
def load_images(image_files):
out = []
for image_file in image_files:
image = load_image(image_file)
out.append(image)
return out
def eval_model(args):
# Model
disable_torch_init()
model_name = get_model_name_from_path(args.model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(
args.model_path, args.model_base, model_name
)
qs = args.query
image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
if IMAGE_PLACEHOLDER in qs:
if model.config.mm_use_im_start_end:
qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs)
else:
qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs)
else:
if model.config.mm_use_im_start_end:
qs = image_token_se + "\n" + qs
else:
qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
if "llama-2" in model_name.lower():
conv_mode = "llava_llama_2"
elif "mistral" in model_name.lower():
conv_mode = "mistral_instruct"
elif "v1.6-34b" in model_name.lower():
conv_mode = "chatml_direct"
elif "v1" in model_name.lower():
conv_mode = "llava_v1"
elif "mpt" in model_name.lower():
conv_mode = "mpt"
else:
conv_mode = "llava_v0"
if args.conv_mode is not None and conv_mode != args.conv_mode:
print(
"[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
conv_mode, args.conv_mode, args.conv_mode
)
)
else:
args.conv_mode = conv_mode
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
image_files = image_parser(args)
images = load_images(image_files)
images_tensor = process_images(
images,
image_processor,
model.config
).to(model.device, dtype=torch.float16)
input_ids = (
tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
.unsqueeze(0)
.cuda()
)
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=images_tensor,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
top_p=args.top_p,
num_beams=args.num_beams,
max_new_tokens=args.max_new_tokens,
use_cache=True,
)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(outputs)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
parser.add_argument("--model-base", type=str, default=None)
parser.add_argument("--image-file", type=str, required=True)
parser.add_argument("--query", type=str, required=True)
parser.add_argument("--conv-mode", type=str, default=None)
parser.add_argument("--sep", type=str, default=",")
parser.add_argument("--temperature", type=float, default=0.2)
parser.add_argument("--top_p", type=float, default=None)
parser.add_argument("--num_beams", type=int, default=1)
parser.add_argument("--max_new_tokens", type=int, default=512)
args = parser.parse_args()
eval_model(args)
import json
import os
from collections import defaultdict
import numpy as np
import argparse
def parse_args():
parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-d', '--dir', default=None)
parser.add_argument('-v', '--version', default=None)
parser.add_argument('-s', '--select', nargs='*', default=None)
parser.add_argument('-f', '--files', nargs='*', default=[])
parser.add_argument('-i', '--ignore', nargs='*', default=[])
return parser.parse_args()
if __name__ == '__main__':
args = parse_args()
if args.ignore is not None:
args.ignore = [int(x) for x in args.ignore]
if len(args.files) > 0:
review_files = args.files
else:
review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)]
for review_file in sorted(review_files):
config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '')
if args.select is not None and any(x not in config for x in args.select):
continue
if '0613' in config:
version = '0613'
else:
version = '0314'
if args.version is not None and args.version != version:
continue
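        # Accumulate review score tuples (or single scores) per category; 'all' aggregates every question.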
scores = defaultdict(list)
print(config)
with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f:
for review_str in f:
review = json.loads(review_str)
if review['question_id'] in args.ignore:
continue
if 'category' in review:
scores[review['category']].append(review['tuple'])
scores['all'].append(review['tuple'])
else:
if 'tuple' in review:
scores['all'].append(review['tuple'])
else:
scores['all'].append(review['score'])
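        # Print per-category results: relative score (second/first, as a percentage) and both mean scores scaled by 10.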
for k, v in sorted(scores.items()):
stats = np.asarray(v).mean(0).tolist()
stats = [round(x, 3) for x in stats]
# print(k, stats, round(stats[1]/stats[0]*100, 1))
print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1))
print('=================================')
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 2406 2406"><path d="M1 578.4C1 259.5 259.5 1 578.4 1h1249.1c319 0 577.5 258.5 577.5 577.4V2406H578.4C259.5 2406 1 2147.5 1 1828.6V578.4z" fill="#74aa9c"/><path d="M1107.3 299.1c-198 0-373.9 127.3-435.2 315.3C544.8 640.6 434.9 720.2 370.5 833c-99.3 171.4-76.6 386.9 56.4 533.8-41.1 123.1-27 257.7 38.6 369.2 98.7 172 297.3 260.2 491.6 219.2 86.1 97 209.8 152.3 339.6 151.8 198 0 373.9-127.3 435.3-315.3 127.5-26.3 237.2-105.9 301-218.5 99.9-171.4 77.2-386.9-55.8-533.9v-.6c41.1-123.1 27-257.8-38.6-369.8-98.7-171.4-297.3-259.6-491-218.6-86.6-96.8-210.5-151.8-340.3-151.2zm0 117.5-.6.6c79.7 0 156.3 27.5 217.6 78.4-2.5 1.2-7.4 4.3-11 6.1L952.8 709.3c-18.4 10.4-29.4 30-29.4 51.4V1248l-155.1-89.4V755.8c-.1-187.1 151.6-338.9 339-339.2zm434.2 141.9c121.6-.2 234 64.5 294.7 169.8 39.2 68.6 53.9 148.8 40.4 226.5-2.5-1.8-7.3-4.3-10.4-6.1l-360.4-208.2c-18.4-10.4-41-10.4-59.4 0L1024 984.2V805.4L1372.7 604c51.3-29.7 109.5-45.4 168.8-45.5zM650 743.5v427.9c0 21.4 11 40.4 29.4 51.4l421.7 243-155.7 90L597.2 1355c-162-93.8-217.4-300.9-123.8-462.8C513.1 823.6 575.5 771 650 743.5zm807.9 106 348.8 200.8c162.5 93.7 217.6 300.6 123.8 462.8l.6.6c-39.8 68.6-102.4 121.2-176.5 148.2v-428c0-21.4-11-41-29.4-51.4l-422.3-243.7 155-89.3zM1201.7 997l177.8 102.8v205.1l-177.8 102.8-177.8-102.8v-205.1L1201.7 997zm279.5 161.6 155.1 89.4v402.2c0 187.3-152 339.2-339 339.2v-.6c-79.1 0-156.3-27.6-217-78.4 2.5-1.2 8-4.3 11-6.1l360.4-207.5c18.4-10.4 30-30 29.4-51.4l.1-486.8zM1380 1421.9v178.8l-348.8 200.8c-162.5 93.1-369.6 38-463.4-123.7h.6c-39.8-68-54-148.8-40.5-226.5 2.5 1.8 7.4 4.3 10.4 6.1l360.4 208.2c18.4 10.4 41 10.4 59.4 0l421.9-243.7z" fill="white"/></svg>
<svg xmlns="http://www.w3.org/2000/svg" height="48" viewBox="0 96 960 960" width="48"><path d="m762.846 947.614-124.77-124.769-88 88-30.306-30.692q-16.616-16.231-16.616-40.077 0-23.846 16.616-40.461L708 611.385q16.23-16.231 40.076-16.231t40.462 16.231l30.307 30.691-88 88 124.154 124.77q8.615 8.615 8.615 20.23 0 11.616-8.615 20.231l-51.692 52.307q-8.615 9-20.231 9-11.615 0-20.23-9Zm97.153-624.076L412.768 771.153l27.847 28.077q16.231 16.616 16.231 40.462 0 23.846-16.231 40.077l-30.691 30.691-88-88-124.77 124.769q-8.615 9-20.23 9-11.616 0-20.231-9l-52.307-52.307q-9-8.615-9-20.23 0-11.616 9-20.231l124.769-124.769-88-88L171.847 611q16.231-16.23 40.077-16.23 23.846 0 40.461 16.23l28.462 28.232 447.615-447.231h131.537v131.537ZM323.846 483.769l33.769-34.154 34.154-34.153-34.154 34.153-33.769 34.154Zm-31.999 31.999-191.846-192.23V192.001h131.537l191.461 191.846-31.23 31.615-179.077-178.077h-67.307v67.307l178.461 179.077-31.999 31.999Zm87.691 222.77 435.077-433.846v-67.307h-67.307L312.231 670.846l67.307 67.692Zm0 0L346.385 704l-34.154-33.154L346.385 704l33.153 34.538Z"/></svg>
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Who's GPT-4's favorite? Battles between State-of-the-Art Chatbots</title>
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css">
<link rel="stylesheet" href="https://fonts.googleapis.com/icon?family=Material+Icons">
<link rel="stylesheet" href="styles.css">
</head>
<body>
<nav class="navbar navbar-expand-lg navbar-dark bg-dark">
<a class="navbar-brand" href="#">🏔️ Vicuna Evaluation Examples</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarNav">
<ul class="navbar-nav mr-auto">
<li class="nav-item">
<a class="nav-link" href="https://chat.lmsys.org/">Demo</a>
</li>
<li class="nav-item">
<a class="nav-link" href="https://vicuna.lmsys.org">Blog</a>
</li>
<li class="nav-item">
<a class="nav-link" href="https://github.com/lm-sys/FastChat">Github</a>
</li>
</ul>
</div>
</nav>
<div class="container mt-5">
<h2 class="text-center mb-5">Who's GPT-4's favorite? Battles between State-of-the-Art Chatbots</h2>
<!-- Selection -->
<div class="form-row">
<div class="form-group col-md-2">
<label for="category-select">Category</label>
<select class="form-control" id="category-select"></select>
</div>
<div class="form-group col-md-8">
<label for="question-select">Question</label>
<select class="form-control" id="question-select"></select>
</div>
<div class="form-group col-md-2">
<div class="col-md-2"><label>&nbsp;</label></div>
<div class="btn-group" role="group" aria-label="Left and Right Controller">
<button type="button" class="form-control btn btn-primary" id="prev-question"><i class="material-icons">keyboard_arrow_left</i></button>
<button type="button" class="form-control btn btn-primary" id="next-question"><i class="material-icons">keyboard_arrow_right</i></button>
</div>
</div>
</div>
<!-- "Battle" -->
<div class="row mb-4" style="justify-content: center;">
<div class="col" style="display: flex; justify-content: center; align-items: center;">
<label class="adjustable-font-size" id="other-score-label">*/10</label>
</div>
<div class="col">
<div class="vertical-flex-layout">
<img class="shadow figure-img img-fluid" src="" alt="other logo" width="150" id="other-model-figure">
</div>
</div>
<div class="col">
<div class="vertical-flex-layout">
<!-- from: https://fonts.google.com/icons?icon.query=battle&selected=Material+Symbols+Outlined:swords:FILL@0;wght@300;GRAD@0;opsz@48&icon.style=Outlined -->
<img class="figure-img img-fluid" src="figures/swords_FILL0_wght300_GRAD0_opsz48.svg" width="60" height="60">
</div>
</div>
<div class="col">
<div class="vertical-flex-layout">
<img class="shadow figure-img img-fluid" src="figures/vicuna.jpeg" alt="vicuna logo" width="150" id="our-model-figure">
</div>
</div>
<div class="col" style="display: flex; justify-content: center; align-items: center;">
<label class="adjustable-font-size" id="our-score-label">*/10</label>
</div>
</div>
<!-- Question Card -->
<div class="card mb-4">
<div class="card-body" id="selected-question"></div>
</div>
<!-- Answer Cards -->
<div class="row">
<div class="col-md-6">
<div class="card mb-4 expandable-card">
<div class="card-header" style="padding-bottom: 0.2rem" id="other-model-header-bg">
<div class="row">
<div class="col-md-5" style="align-items: center; display: flex;">
<label id="other-model-header">Assistant #1</label>
</div>
<div class="col-md-7">
<select class="form-control" id="model-select" style="height: fit-content; margin-top: -0.3rem;"></select>
</div>
</div>
</div>
<div class="card-body">
<div class="card-text-container">
<div class="card-text" id="other-model-answer"></div>
</div>
<div class="btn btn-primary expand-btn" style="display:flex;"></div>
</div>
</div>
</div>
<div class="col-md-6">
<div class="card mb-4 expandable-card">
<div class="card-header" id="our-model-header">
Assistant #2 (Vicuna, our model)
</div>
<div class="card-body">
<div class="card-text-container">
<div class="card-text" id="our-model-answer"></div>
</div>
<div class="btn btn-primary expand-btn" style="display:flex;"></div>
</div>
</div>
</div>
</div>
<!-- Evaluation -->
<div class="card expandable-card">
<div class="card-header" style="background-color: #c9c9f2;" id="evaluation-header">GPT-4 Evaluation</div>
<div class="card-body">
<div class="card-text-container">
<div class="card-text" id="evaluation-result"></div>
</div>
<div class="btn btn-primary expand-btn" style="display:flex;"></div>
</div>
</div>
</div>
<div class="container-fluid bg-light py-2">
<div class="text-center">
<small class="text-muted">This website is co-authored with <a href="https://openai.com" target="_blank">GPT-4</a>.</small>
</div>
</div>
<!-- Marked.js -->
<script src="https://cdn.jsdelivr.net/npm/marked@4.3.0/lib/marked.umd.min.js"></script>
<!-- Bootstrap and Popper.js JavaScript dependencies -->
<script src="https://code.jquery.com/jquery-3.5.1.slim.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.11.6/dist/umd/popper.min.js"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.5.2/js/bootstrap.min.js"></script>
<script src="script.js"></script>
<script>
// Fetch the JSON file
fetch('data.json')
.then(response => response.json())
.then(json_data => {
// Populate the models and questions.
populateModels(json_data.models);
populateQuestions(json_data.questions);
displayQuestion(currentQuestionIndex);
}).catch(error => console.error(error));
</script>
</body>
</html>
// Description: Script for the evaluation webpage.
let currentQuestionIndex = 1;
// Store the model name mapping for later use.
const modelNameMapping = {
"gpt35": "ChatGPT-3.5",
"gpt4": "GPT-4",
"alpaca": "Alpaca-13b",
"vicuna": "Vicuna-13b",
"llama": "LLaMA-13b",
"bard": "Bard",
};
const modelFigureMapping = {
"vicuna": "figures/vicuna.jpeg",
// Image from: https://commons.wikimedia.org/wiki/File:ChatGPT_logo.svg
"gpt35": "figures/chatgpt.svg",
// Image from: https://www.reddit.com/r/logodesign/comments/1128aat/google_ai_bard_logo_design/
"bard": "figures/bard.jpg",
// Image from: https://crfm.stanford.edu/2023/03/13/alpaca.html
"alpaca": "figures/alpaca.png",
// Image adapted from https://commons.wikimedia.org/wiki/File:Llama_on_Machu_Picchu.jpg
"llama": "figures/llama.jpg",
}
// Store the question data in a mapping for later use.
const questionMapping = {};
// Store the question ids in a mapping for later use.
const categoryMapping = {};
// Store the number of questions for later use.
let questionsCount = 0;
function text2Markdown(text) {
// Normalize the text for markdown rendering.
text = text.trim().replaceAll('\n\n', '\n').replaceAll('\n', '\n\n');
return marked.parse(text);
}
function capitalizeFirstChar(str) {
if (!str || str.length === 0) {
return str;
}
return str.charAt(0).toUpperCase() + str.slice(1);
}
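// Rebuild the question dropdown with every question in the selected question's category.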
function updateQuestionSelect(question_id) {
const select = document.getElementById('question-select');
// Clear the question select.
select.innerHTML = '';
// Populate the question select.
    const category = questionMapping[question_id].category;
categoryMapping[category].forEach(question_id => {
const question = questionMapping[question_id];
const option = document.createElement('option');
option.value = question_id;
option.textContent = 'Q' + question_id.toString() + ': ' + question.question;
select.appendChild(option);
});
select.value = question_id;
}
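// Swap the displayed logo to match the currently selected comparison model.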
function updateModelSelect() {
const select = document.getElementById('model-select');
    const img_path = modelFigureMapping[select.value];
document.getElementById('other-model-figure').src = img_path;
}
function populateModels(models) {
const select = document.getElementById('model-select');
models.forEach(model => {
const option = document.createElement('option');
option.value = model;
option.textContent = modelNameMapping[model];
select.appendChild(option);
});
updateModelSelect();
}
function populateQuestions(questions) {
const category_select = document.getElementById('category-select');
questionsCount = questions.length;
questions.forEach(question => {
const option = document.createElement('option');
// Store the question data in a mapping for later use.
questionMapping[question.id] = {
category: question.category,
question: question.question,
answers: question.answers,
evaluations: question.evaluations,
scores: question.scores,
};
// Store the question id in the category mapping.
if (question.category in categoryMapping) {
categoryMapping[question.category].push(question.id);
} else {
categoryMapping[question.category] = [question.id];
const category_option = document.createElement('option');
category_option.value = question.category;
category_option.textContent = capitalizeFirstChar(question.category);
category_select.appendChild(category_option);
}
});
// Set the default category.
updateQuestionSelect(currentQuestionIndex);
}
function displayQuestion(index) {
const question = questionMapping[index].question;
document.getElementById('selected-question').innerHTML = text2Markdown('**Question:** ' + question);
displayAnswers(index);
}
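// Render both answers and the GPT-4 review, then update headers, scores, and winner highlighting.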
function displayAnswers(index) {
const question = questionMapping[index];
const otherModel = document.getElementById('model-select').value;
// render the answers with markdown
document.getElementById('other-model-answer').innerHTML = text2Markdown(question.answers[otherModel]);
document.getElementById('our-model-answer').innerHTML = text2Markdown(question.answers.vicuna);
// Display evaluation
    const score = question.scores[otherModel];
    const score_text = modelNameMapping[otherModel] + " " + score[0] + "/10, Vicuna-13b " + score[1] + "/10";
document.getElementById('evaluation-header').textContent = "GPT-4 Evaluation" + " (Score: " + score_text + ")";
document.getElementById('evaluation-result').innerHTML = text2Markdown(question.evaluations[otherModel]);
// Update model names
let assistant1_title = "Assistant #1"; // (" + modelNameMapping[otherModel] + ")";
let assistant2_title = "Assistant #2 (Vicuna-13b, our model)";
// Update scores/labels.
let assistant1_score_label = score[0].toString() + '/10';
let assistant2_score_label = score[1].toString() + '/10';
const colorRed ='#fa9'; // '#eb978d';
// const colorGreen = '#c9f2c9';
const colorBlue = '#8ef'; // '#71dbf9';
const colorYellow = '#fe7'; // '#fada57';
let otherModelHeaderColor = '';
let ourModelHeaderColor = '';
// Update the winner.
if (score[0] == score[1]) {
assistant1_title = '🏆 ' + assistant1_title;
assistant1_score_label = '🏆 ' + assistant1_score_label;
assistant2_title = '🏆 ' + assistant2_title;
assistant2_score_label = '🏆 ' + assistant2_score_label;
otherModelHeaderColor = colorYellow;
ourModelHeaderColor = colorYellow;
} else if (score[0] > score[1]) {
assistant1_title = '🏆 ' + assistant1_title;
assistant1_score_label = '🏆 ' + assistant1_score_label;
otherModelHeaderColor = colorBlue;
ourModelHeaderColor = colorRed;
} else if (score[0] < score[1]) {
assistant2_title = '🏆 ' + assistant2_title;
assistant2_score_label = '🏆 ' + assistant2_score_label;
otherModelHeaderColor = colorRed;
ourModelHeaderColor = colorBlue;
}
document.getElementById('other-model-header-bg').style.backgroundColor = otherModelHeaderColor;
document.getElementById('our-model-header').style.backgroundColor = ourModelHeaderColor;
document.getElementById('other-model-header').textContent = assistant1_title;
document.getElementById('our-model-header').textContent = assistant2_title;
document.getElementById('other-score-label').textContent = assistant1_score_label;
document.getElementById('our-score-label').textContent = assistant2_score_label;
    // Reset the expanded state and update expand-button visibility for both cards after displaying answers.
document.querySelectorAll('.expandable-card').forEach(card => {
card.classList.remove('expanded');
updateExpandButtonVisibility(card);
const expandBtn = card.querySelector('.expand-btn');
expandBtn.innerHTML = '<i class="material-icons" style="pointer-events: none">keyboard_arrow_down</i> Show more'; // .textContent = 'Show more';
});
}
document.getElementById('question-select').addEventListener('change', e => {
currentQuestionIndex = parseInt(e.target.value);
displayQuestion(currentQuestionIndex);
});
document.getElementById('category-select').addEventListener('change', e => {
let currentCategory = e.target.value;
const questionIds = categoryMapping[currentCategory];
currentQuestionIndex = questionIds[0];
updateQuestionSelect(currentQuestionIndex);
displayQuestion(currentQuestionIndex);
});
// Update expand buttons whenever the model is changed
document.getElementById('model-select').addEventListener('change', () => {
displayAnswers(currentQuestionIndex);
document.querySelectorAll('.expandable-card').forEach(card => {
updateExpandButtonVisibility(card);
});
updateModelSelect();
});
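// Keep the category and question dropdowns in sync when navigating with the prev/next buttons.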
function switchQuestionAndCategory() {
document.getElementById('question-select').value = currentQuestionIndex;
    const old_category = document.getElementById('category-select').value;
    const new_category = questionMapping[currentQuestionIndex].category;
if (old_category != new_category) {
document.getElementById('category-select').value = new_category;
updateQuestionSelect(currentQuestionIndex);
}
displayQuestion(currentQuestionIndex);
}
document.getElementById('prev-question').addEventListener('click', () => {
// Question index starts from 1.
currentQuestionIndex = Math.max(1, currentQuestionIndex - 1);
switchQuestionAndCategory();
});
document.getElementById('next-question').addEventListener('click', () => {
// Question index starts from 1.
currentQuestionIndex = Math.min(questionsCount, currentQuestionIndex + 1);
switchQuestionAndCategory();
});
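// Show the "Show more" button only when the card text overflows its container.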
function updateExpandButtonVisibility(card) {
const cardTextContainer = card.querySelector('.card-text-container');
const expandBtn = card.querySelector('.expand-btn');
if (cardTextContainer.scrollHeight > cardTextContainer.offsetHeight) {
expandBtn.style.display = 'flex';
} else {
expandBtn.style.display = 'none';
card.classList.add('expanded');
}
}
document.querySelectorAll('.expand-btn').forEach(btn => {
btn.addEventListener('click', e => {
const card = e.target.closest('.expandable-card');
card.classList.toggle('expanded');
const more = '<i class="material-icons" style="pointer-events: none">keyboard_arrow_down</i> Show more';
const less = '<i class="material-icons" style="pointer-events: none">keyboard_arrow_up</i> Show less';
e.target.innerHTML = card.classList.contains('expanded') ? less : more;
});
});