ModelZoo / Qwen2-VL_pytorch / Commits

Commit bc5ebf0f, authored Dec 27, 2024 by luopl
Initial commit
Pipeline #2167 canceled with stages. Changes: 260. Pipelines: 1.

Showing 20 changed files with 3128 additions and 0 deletions (+3128 -0)
Changed files shown below:

VLMEvalKit/vlmeval/vlm/misc/minigpt4_7b_eval.yaml    +38   -0
VLMEvalKit/vlmeval/vlm/misc/minigptv2_eval.yaml      +36   -0
VLMEvalKit/vlmeval/vlm/mixsense.py                   +46   -0
VLMEvalKit/vlmeval/vlm/mmalaya.py                    +379  -0
VLMEvalKit/vlmeval/vlm/molmo.py                      +69   -0
VLMEvalKit/vlmeval/vlm/monkey.py                     +165  -0
VLMEvalKit/vlmeval/vlm/moondream.py                  +173  -0
VLMEvalKit/vlmeval/vlm/mplug_owl2.py                 +126  -0
VLMEvalKit/vlmeval/vlm/mplug_owl3.py                 +336  -0
VLMEvalKit/vlmeval/vlm/nvlm.py                       +148  -0
VLMEvalKit/vlmeval/vlm/omchat.py                     +159  -0
VLMEvalKit/vlmeval/vlm/omnilmm.py                    +183  -0
VLMEvalKit/vlmeval/vlm/open_flamingo.py              +100  -0
VLMEvalKit/vlmeval/vlm/ovis.py                       +307  -0
VLMEvalKit/vlmeval/vlm/paligemma.py                  +44   -0
VLMEvalKit/vlmeval/vlm/pandagpt.py                   +63   -0
VLMEvalKit/vlmeval/vlm/parrot.py                     +216  -0
VLMEvalKit/vlmeval/vlm/phi3_vision.py                +162  -0
VLMEvalKit/vlmeval/vlm/pixtral.py                    +70   -0
VLMEvalKit/vlmeval/vlm/points.py                     +308  -0
VLMEvalKit/vlmeval/vlm/misc/minigpt4_7b_eval.yaml  (new file, 0 → 100644)

model:
  arch: minigpt4
  model_type: pretrain_vicuna_7b
  max_txt_len: 160
  end_sym: "###"
  low_resource: True
  prompt_template: '###Human: {} ###Assistant: '
  ckpt: "please set this value to the path of pretrained checkpoint"

  # vit encoder
  image_size: 224
  drop_path_rate: 0
  use_grad_checkpoint: False
  vit_precision: "fp16"
  freeze_vit: True
  freeze_qformer: True

  # Q-Former
  num_query_token: 32

  # generation configs
  prompt: ""

  llama_model: "please set this value to the path of vicuna-7b-v0"

datasets:
  cc_sbu_align:
    vis_processor:
      train:
        name: "blip2_image_eval"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"

run:
  task: image_text_pretrain
VLMEvalKit/vlmeval/vlm/misc/minigptv2_eval.yaml  (new file, 0 → 100644)

model:
  arch: minigpt_v2
  model_type: pretrain
  max_txt_len: 160
  end_sym: "</s>"
  low_resource: True
  prompt_template: '[INST] {} [/INST]'
  ckpt: "please set this value to the path of pretrained checkpoint"
  lora_r: 64
  lora_alpha: 16

  # vit encoder
  image_size: 448
  drop_path_rate: 0
  use_grad_checkpoint: False
  vit_precision: "fp16"
  freeze_vit: True

  # generation configs
  prompt: ""

  # LLM
  llama_model: "please set this value to the path of llama2-chat-7b"

datasets:
  cc_sbu_align:
    vis_processor:
      train:
        name: "blip2_image_eval"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"

run:
  task: image_text_pretrain
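The two eval configs above are plain MiniGPT-4-style YAML files; the ckpt and llama_model fields are placeholders that must point at local checkpoints before evaluation. A minimal sketch of loading and sanity-checking such a config, assuming PyYAML is installed and the path below matches your checkout (both are assumptions, not part of the commit):

import yaml

# Hypothetical path; adjust to where the repository is checked out.
CFG_PATH = 'VLMEvalKit/vlmeval/vlm/misc/minigpt4_7b_eval.yaml'

with open(CFG_PATH) as f:
    cfg = yaml.safe_load(f)

# The commit ships placeholder strings for the checkpoint paths,
# so fail early if they were never filled in.
for key in ('ckpt', 'llama_model'):
    value = cfg['model'][key]
    if value.startswith('please set'):
        raise ValueError(f'model.{key} still holds the placeholder: {value!r}')

print(cfg['model']['arch'], cfg['model']['image_size'])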
VLMEvalKit/vlmeval/vlm/mixsense.py  (new file, 0 → 100644)

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings
from .base import BaseModel
from ..smp import *


class LLama3Mixsense(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='Zero-Vision/Llama-3-MixSenseV1_1', **kwargs):
        assert model_path is not None
        transformers.logging.set_verbosity_error()
        transformers.logging.disable_progress_bar()
        warnings.filterwarnings('ignore')
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path, trust_remote_code=True
        ).to('cuda').eval()
        self.kwargs = kwargs

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message)
        input_ids = self.model.text_process(prompt, self.tokenizer).to(device='cuda')
        image = Image.open(image_path).convert('RGB')
        image_tensor = self.model.image_process([image]).to(dtype=self.model.dtype, device='cuda')
        # generate
        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids,
                images=image_tensor,
                max_new_tokens=2048,
                use_cache=True,
                eos_token_id=[
                    self.tokenizer.eos_token_id,
                    self.tokenizer.convert_tokens_to_ids(['<|eot_id|>'])[0],
                ],
            )
        return self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
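LLama3Mixsense follows the BaseModel interface used by every wrapper in this commit: generate_inner receives a list of {'type': ..., 'value': ...} dicts, the same format the __main__ example at the end of mmalaya.py below uses. A minimal usage sketch, assuming the VLMEvalKit package root is on sys.path, a CUDA device is available, and the image path exists (all assumptions):

from vlmeval.vlm.mixsense import LLama3Mixsense

# Downloads 'Zero-Vision/Llama-3-MixSenseV1_1' unless a local path is given.
model = LLama3Mixsense()

message = [
    {'type': 'image', 'value': './assets/apple.jpg'},   # hypothetical local image
    {'type': 'text', 'value': 'Describe this image in one sentence.'},
]
print(model.generate_inner(message))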
VLMEvalKit/vlmeval/vlm/mmalaya.py  (new file, 0 → 100644)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
import warnings
from .base import BaseModel
from PIL import Image
from ..smp import *
from ..dataset import DATASET_TYPE
import pandas as pd
import string
import torchvision.transforms as T
import transformers
from torchvision.transforms.functional import InterpolationMode


class MMAlaya(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='DataCanvas/MMAlaya', **kwargs):
        assert model_path is not None
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_path, device_map='cpu', trust_remote_code=True
        ).eval()
        # need initialize tokenizer
        model.initialize_tokenizer(self.tokenizer)
        self.model = model.cuda()
        self.kwargs = kwargs
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        torch.cuda.empty_cache()

    def generate_inner(self, message, dataset=None):
        # read image
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        image = Image.open(image_path).convert('RGB')
        # tokenize prompt, and preprocess image
        input_ids, image_tensor, stopping_criteria = self.model.prepare_for_inference(
            prompt, self.tokenizer, image, return_tensors='pt'
        )
        with torch.inference_mode():
            output_ids = self.model.generate(
                inputs=input_ids.cuda(),
                images=image_tensor.cuda(),
                do_sample=False,
                max_new_tokens=512,
                num_beams=1,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
            )
        # truncate input_ids in generate_ids and then decode to text
        input_token_len = input_ids.shape[1]
        response = self.tokenizer.batch_decode(
            output_ids[:, input_token_len:].cpu(),
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0].strip()
        return response


IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose(
        [
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=MEAN, std=STD),
        ]
    )
    return transform


def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio


def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num
    )
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size
    )

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size,
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images


def load_image(image_file, input_size=448, max_num=6, upscale=False):
    image = Image.open(image_file).convert('RGB')
    if upscale:
        image = image.resize((image.width * 2, image.height * 2), Image.BILINEAR)
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


# This function is used to split InternVL2-Llama3-76B
def split_model(model_name):
    import math

    device_map = {}
    num_gpus = torch.cuda.device_count()
    rank, world_size = get_rank_and_world_size()
    num_gpus = num_gpus // world_size
    assert num_gpus >= 1
    if num_gpus == 1:
        return device_map

    num_layers = {
        'InternVL2-8B': 32, 'InternVL2-26B': 48,
        'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80
    }[model_name]
    # Since the first GPU will be used for ViT, treat it as 0.5 GPU.
    num_layers_per_gpu = math.ceil(num_layers / (num_gpus - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for j in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = rank + world_size * i
            layer_cnt += 1
    device_map['vision_model'] = rank
    device_map['mlp1'] = rank
    device_map['language_model.model.tok_embeddings'] = rank
    device_map['language_model.model.embed_tokens'] = rank
    device_map['language_model.output'] = rank
    device_map['language_model.model.norm'] = rank
    device_map['language_model.lm_head'] = rank
    device_map[f'language_model.model.layers.{num_layers - 1}'] = rank
    return device_map


class MMAlaya2(BaseModel):
    """
    This implementation fine-tunes 20 LoRA modules based on the InternVL-Chat-V1-5 model.
    The fine-tuned LoRA modules are then merged with the InternVL-Chat-V1-5 model
    using the PEFT model merging method, TIES.
    The code is based on the implementation in `vlmeval/vlm/internvl_chat.py`.
    """

    INSTALL_REQ = False
    INTERLEAVE = True

    def __init__(
        self,
        model_path='DataCanvas/MMAlaya2',
        load_in_8bit=False,
        **kwargs,
    ):
        assert model_path is not None
        assert version_cmp(transformers.__version__, '4.36.2', 'ge')
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)

        # Regular expression to match the pattern "Image" followed by a number, e.g. Image1
        self.pattern = r'Image(\d+)'
        # Replacement pattern to insert a hyphen between "Image" and the number, e.g. Image-1
        self.replacement = r'Image-\1'

        # Convert InternVL2 response to dataset format
        # e.g. Image1 -> Image-1
        # Regular expression to match the pattern "Image-" followed by a number
        self.reverse_pattern = r'Image-(\d+)'
        # Replacement pattern to remove the hyphen (Image-1 -> Image1)
        self.reverse_replacement = r'Image\1'

        device_map = split_model('InternVL2-26B')
        if len(device_map) == 0:
            device_map = {'': 'cuda'}
        self.model = AutoModel.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            load_in_8bit=load_in_8bit,
            device_map=device_map
        ).eval()

        self.image_size = self.model.config.vision_config.image_size
        kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=1)
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
            # For Multi-Turn we don't have custom prompt
            return False
        else:
            return True

    def build_multi_choice_prompt(self, line, dataset=None):
        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        if len(options):
            prompt += (
                '\n请直接回答选项字母。' if cn_string(prompt) else
                "\nAnswer with the option's letter from the given choices directly."
            )
        else:
            prompt += (
                '\n请直接回答问题。' if cn_string(prompt) else
                '\nAnswer the question directly.'
            )

        return prompt

    def build_prompt(self, line, dataset=None):
        assert self.use_custom_prompt(dataset)
        assert dataset is None or isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        if dataset is not None and listinstr(['MME'], dataset):
            question = line['question']
            prompt = question + ' Answer the question using a single word or phrase.'
        elif dataset is not None and listinstr(['HallusionBench'], dataset):
            question = line['question']
            prompt = (
                question + ' Please answer yes or no. Answer the question using a single word or phrase.'
            )
        elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
            prompt = self.build_multi_choice_prompt(line, dataset)
        elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
            if listinstr(['MathVista', 'MathVision', 'MathVerse'], dataset):
                prompt = line['question']
            elif listinstr(['LLaVABench'], dataset):
                question = line['question']
                prompt = question + '\nAnswer this question in detail.'
            elif listinstr(['MMVet'], dataset):
                prompt = line['question']
            else:
                question = line['question']
                prompt = question + '\nAnswer the question using a single word or phrase.'
        else:
            prompt = line['question']

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])

        return message

    def set_max_num(self, dataset):
        if dataset is not None and listinstr(['ChartQA_TEST', 'MMMU_DEV_VAL'], dataset):
            self.max_num = 12
        elif dataset is not None and listinstr(['DocVQA_VAL', 'DocVQA_TEST'], dataset):
            self.max_num = 18
        elif dataset is not None and listinstr(
                ['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench'], dataset):
            self.max_num = 24
        elif dataset is not None and listinstr(
                ['MMBench-Video', 'Video-MME', 'Video'], dataset):
            self.max_num = 1
        else:
            self.max_num = 6

    def generate_inner(self, message, dataset=None):
        self.set_max_num(dataset)
        image_num = len([x for x in message if x['type'] == 'image'])
        prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])

        if image_num > 1:
            image_path = [x['value'] for x in message if x['type'] == 'image']
            pixel_values_list = []
            max_num = max(1, self.max_num // image_num)
            for file_name in image_path:
                pixel_values_list.append(load_image(file_name, max_num=max_num).cuda().to(torch.bfloat16))
            pixel_values = torch.cat(pixel_values_list, dim=0)
        elif image_num == 1:
            image_path = [x['value'] for x in message if x['type'] == 'image'][0]
            pixel_values = (
                load_image(image_path, max_num=self.max_num).cuda().to(torch.bfloat16)
            )
        else:
            pixel_values = None

        with torch.no_grad():
            response = self.model.chat(
                self.tokenizer,
                pixel_values=pixel_values,
                question=prompt,
                generation_config=self.kwargs,
                # verbose=False,
            )

        return response


if __name__ == '__main__':
    model = MMAlaya2(max_new_tokens=1024, do_sample=False)
    response = model.generate_inner(
        [
            {'type': 'image', 'value': './assets/apple.jpg'},
            {'type': 'text', 'value': '请详细描述一下这张图片。'},
        ]
    )
    print(response)
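dynamic_preprocess above decides how many 448x448 tiles an image is cut into by picking the grid (i, j) with i*j <= max_num whose aspect ratio is closest to the input image. A standalone sketch that restates just that grid selection (omitting the area tie-break) for a hypothetical 1600x800 input; the numbers are illustrative, not from the commit:

# Standalone restatement of the grid selection in dynamic_preprocess above.
image_size, max_num = 448, 6
orig_width, orig_height = 1600, 800          # hypothetical input
aspect_ratio = orig_width / orig_height      # 2.0

target_ratios = sorted(
    {(i, j) for n in range(1, max_num + 1)
     for i in range(1, n + 1) for j in range(1, n + 1)
     if 1 <= i * j <= max_num},
    key=lambda x: x[0] * x[1],
)

best, best_diff = (1, 1), float('inf')
for ratio in target_ratios:
    diff = abs(aspect_ratio - ratio[0] / ratio[1])
    if diff < best_diff:
        best, best_diff = ratio, diff

blocks = best[0] * best[1]
print(best, blocks)   # (2, 1) -> 2 tiles, plus 1 thumbnail when use_thumbnail=True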
VLMEvalKit/vlmeval/vlm/molmo.py  (new file, 0 → 100644)

import torch
from PIL import Image
import os.path as osp
import sys
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE


class molmo(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='allenai/Molmo-7B-D-0924', **kwargs):
        try:
            from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
            import einops
        except Exception as e:
            logging.critical('Please install transformer and einops before using molmo.')
            raise e

        if '72b' not in model_path.lower():
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
                device_map='cuda'
            )
        else:
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
                device_map='auto'
            )
        self.processor = AutoProcessor.from_pretrained(
            model_path,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16
        )
        self.kwargs = kwargs
        self.model_name = model_path

    def generate_inner(self, message, dataset=None):
        from transformers import GenerationConfig
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        image = Image.open(image_path)
        if image.mode != "RGB":
            image = image.convert("RGB")
        # process the image and text
        inputs = self.processor.process(
            images=[image],
            text=prompt
        )

        # move inputs to the correct device and make a batch of size 1
        inputs = {k: v.to(self.model.device).unsqueeze(0) for k, v in inputs.items()}

        # generate output; maximum 200 new tokens; stop generation when <|endoftext|> is generated
        with torch.autocast(device_type="cuda", enabled=True, dtype=torch.bfloat16):
            output = self.model.generate_from_batch(
                inputs,
                GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
                tokenizer=self.processor.tokenizer
            )

        # only get generated tokens; decode them to text
        generated_tokens = output[0, inputs['input_ids'].size(1):]
        generated_text = self.processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

        # print the generated text
        return generated_text
VLMEvalKit/vlmeval/vlm/monkey.py  (new file, 0 → 100644)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import warnings
from .base import BaseModel
from ..dataset import DATASET_TYPE


class Monkey(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='echo840/Monkey', **kwargs):
        assert model_path is not None
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_path, device_map='cpu', trust_remote_code=True).eval()
        self.model = model.cuda()
        self.kwargs = kwargs
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        torch.cuda.empty_cache()

    def generate_vanilla(self, image_path, prompt):
        cur_prompt = f'<img>{image_path}</img> {prompt} Answer: '
        input_ids = self.tokenizer(cur_prompt, return_tensors='pt', padding='longest')
        attention_mask = input_ids.attention_mask
        input_ids = input_ids.input_ids

        output_ids = self.model.generate(
            input_ids=input_ids.cuda(),
            attention_mask=attention_mask.cuda(),
            do_sample=False,
            num_beams=1,
            max_new_tokens=512,
            min_new_tokens=1,
            length_penalty=1,
            num_return_sequences=1,
            output_hidden_states=True,
            use_cache=True,
            pad_token_id=self.tokenizer.eod_id,
            eos_token_id=self.tokenizer.eod_id,
        )
        response = self.tokenizer.decode(
            output_ids[0][input_ids.size(1):].cpu(),
            skip_special_tokens=True
        ).strip()
        return response

    def generate_multichoice(self, image_path, prompt):
        cur_prompt = f'<img>{image_path}</img>\n{prompt} Answer: '
        input_ids = self.tokenizer(cur_prompt, return_tensors='pt', padding='longest')
        attention_mask = input_ids.attention_mask
        input_ids = input_ids.input_ids

        output_ids = self.model.generate(
            input_ids=input_ids.cuda(),
            attention_mask=attention_mask.cuda(),
            do_sample=False,
            num_beams=1,
            max_new_tokens=10,
            min_new_tokens=1,
            length_penalty=1,
            num_return_sequences=1,
            output_hidden_states=True,
            use_cache=True,
            pad_token_id=self.tokenizer.eod_id,
            eos_token_id=self.tokenizer.eod_id,
        )
        response = self.tokenizer.decode(
            output_ids[0][input_ids.size(1):].cpu(),
            skip_special_tokens=True
        ).strip()
        return response

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        if dataset is None:
            return self.generate_vanilla(image_path, prompt)
        assert isinstance(dataset, str)
        if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'Y/N' or dataset == 'HallusionBench':
            return self.generate_multichoice(image_path, prompt)
        else:
            return self.generate_vanilla(image_path, prompt)


class MonkeyChat(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='echo840/Monkey-Chat', **kwargs):
        assert model_path is not None
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_path, device_map='cpu', trust_remote_code=True).eval()
        self.model = model.cuda()
        self.kwargs = kwargs
        self.tokenizer.padding_side = 'left'
        self.tokenizer.pad_token_id = self.tokenizer.eod_id
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        torch.cuda.empty_cache()

    def generate_vanilla(self, image_path, prompt):
        cur_prompt = f'<img>{image_path}</img> {prompt} Answer: '
        input_ids = self.tokenizer(cur_prompt, return_tensors='pt', padding='longest')
        attention_mask = input_ids.attention_mask
        input_ids = input_ids.input_ids

        output_ids = self.model.generate(
            input_ids=input_ids.cuda(),
            attention_mask=attention_mask.cuda(),
            do_sample=False,
            num_beams=1,
            max_new_tokens=512,
            min_new_tokens=1,
            length_penalty=1,
            num_return_sequences=1,
            output_hidden_states=True,
            use_cache=True,
            pad_token_id=self.tokenizer.eod_id,
            eos_token_id=self.tokenizer.eod_id,
        )
        response = self.tokenizer.decode(
            output_ids[0][input_ids.size(1):].cpu(),
            skip_special_tokens=True
        ).strip()
        return response

    def generate_multichoice(self, image_path, prompt):
        cur_prompt = f'<img>{image_path}</img>\n{prompt} Answer: '
        input_ids = self.tokenizer(cur_prompt, return_tensors='pt', padding='longest')
        attention_mask = input_ids.attention_mask
        input_ids = input_ids.input_ids

        output_ids = self.model.generate(
            input_ids=input_ids.cuda(),
            attention_mask=attention_mask.cuda(),
            do_sample=False,
            num_beams=1,
            max_new_tokens=10,
            min_new_tokens=1,
            length_penalty=1,
            num_return_sequences=1,
            output_hidden_states=True,
            use_cache=True,
            pad_token_id=self.tokenizer.eod_id,
            eos_token_id=self.tokenizer.eod_id,
        )
        response = self.tokenizer.decode(
            output_ids[0][input_ids.size(1):].cpu(),
            skip_special_tokens=True
        ).strip()
        return response

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        if dataset is None:
            return self.generate_vanilla(image_path, prompt)
        assert isinstance(dataset, str)
        if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'Y/N' or dataset == 'HallusionBench':
            return self.generate_multichoice(image_path, prompt)
        else:
            return self.generate_vanilla(image_path, prompt)
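Both Monkey classes embed the image by its file path directly in the text prompt using Qwen-VL-style <img> tags rather than passing pixel tensors. A short sketch of the string generate_vanilla builds, with a hypothetical path and question:

image_path = '/data/samples/cat.jpg'          # hypothetical
prompt = 'What animal is in the picture?'     # hypothetical

cur_prompt = f'<img>{image_path}</img> {prompt} Answer: '
print(cur_prompt)
# <img>/data/samples/cat.jpg</img> What animal is in the picture? Answer: 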
VLMEvalKit/vlmeval/vlm/moondream.py  (new file, 0 → 100644)

import torch
import re
from PIL import Image
from abc import abstractproperty
import sys
import os.path as osp
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import copy


class Moondream1(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='vikhyatk/moondream1', **kwargs):
        try:
            from transformers import AutoModelForCausalLM, CodeGenTokenizerFast as Tokenizer
        except Exception as e:
            logging.critical(
                "Please install Transformers version 4.36.2 by running: 'pip install transformers==4.36.2', "
                "please install torchvision>=0.16."
            )
            raise e

        assert osp.exists(model_path) or splitlen(model_path) == 2
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map='cuda'
        )
        self.tokenizer = Tokenizer.from_pretrained(model_path)

        default_kwargs = dict(max_new_tokens=512)
        default_kwargs.update(kwargs)
        self.kwargs = default_kwargs

        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        torch.cuda.empty_cache()

    def generate_inner(self, message, dataset=None):
        prompt, img = self.message_to_promptimg(message)
        enc_image = self.model.encode_image(Image.open(img))
        prompt_wtmpl = f'<image>\n\nQuestion: {prompt}\n\nAnswer:'
        answer = self.model.generate(
            enc_image, prompt_wtmpl, eos_text='<END>', tokenizer=self.tokenizer, **self.kwargs
        )[0]
        cleaned_answer = re.sub('<$', '', re.sub('END$', '', answer)).strip()
        return cleaned_answer

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if listinstr(['MMMU'], dataset):
            return False
        if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        if dataset == 'MMVet':
            prompt = question + '\nAnswer the question directly. '
        elif DATASET_TYPE(dataset) == 'MCQ':
            options = {
                cand: line[cand]
                for cand in string.ascii_uppercase
                if cand in line and not pd.isna(line[cand])
            }
            options_prompt = ''
            for key, item in options.items():
                options_prompt += f'{key}. {item}\n'

            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
            prompt = f'Hint: {hint}\n' if hint is not None else ''
            prompt += f'{question}\n'
            prompt += (
                f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
                if len(options) else 'Answer the question directly. '
            )
        else:
            raise NotImplementedError

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])
        return message


class Moondream2(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path="vikhyatk/moondream2", revision="2024-08-26", **kwargs):
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer
        except Exception as e:
            logging.critical(
                '''Please install Transformers version 4.44 by running: "pip install transformers==4.44.0",
                please install torchvision>=0.16.'''
            )
            raise e

        assert osp.exists(model_path) or splitlen(model_path) == 2
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map='cuda',
            revision=revision
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        default_kwargs = dict(max_new_tokens=512)
        default_kwargs.update(kwargs)
        self.kwargs = default_kwargs

        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        torch.cuda.empty_cache()

    def generate_inner(self, message, dataset=None):
        prompt, img = self.message_to_promptimg(message)
        enc_image = self.model.encode_image(Image.open(img))
        prompt_wtmpl = f'<image>\n\nQuestion: {prompt}\n\nAnswer:'
        answer = self.model.generate(enc_image, prompt_wtmpl, tokenizer=self.tokenizer, **self.kwargs)[0]
        cleaned_answer = answer.strip()
        return cleaned_answer

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if listinstr(['MMMU'], dataset):
            return False
        if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        if dataset == 'MMVet':
            prompt = question + '\nAnswer the question directly. '
        elif DATASET_TYPE(dataset) == 'MCQ':
            options = {
                cand: line[cand]
                for cand in string.ascii_uppercase
                if cand in line and not pd.isna(line[cand])
            }
            options_prompt = ''
            for key, item in options.items():
                options_prompt += f'{key}. {item}\n'

            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
            prompt = f'Hint: {hint}\n' if hint is not None else ''
            prompt += f'{question}\n'
            prompt += (
                f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
                if len(options) else 'Answer the question directly. '
            )
        else:
            raise NotImplementedError

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])
        return message
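The build_prompt methods above (and the near-identical ones in mplug_owl2.py and mplug_owl3.py below) assemble multiple-choice questions into a fixed Hint / question / options template. A small sketch of the resulting text for a hypothetical MCQ row; the row contents are illustrative, real lines come from the VLMEvalKit dataset TSVs:

import string
import pandas as pd

# Hypothetical dataset row.
line = {'question': 'What color is the sky?', 'hint': 'Look at the top of the image.',
        'A': 'Blue', 'B': 'Green'}

options = {c: line[c] for c in string.ascii_uppercase if c in line and not pd.isna(line[c])}
options_prompt = ''.join(f'{k}. {v}\n' for k, v in options.items())

hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = f'Hint: {hint}\n' if hint is not None else ''
prompt += f"{line['question']}\n"
prompt += (f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
           if len(options) else 'Answer the question directly. ')
print(prompt)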
VLMEvalKit/vlmeval/vlm/mplug_owl2.py  (new file, 0 → 100644)

import sys
import torch
from PIL import Image
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE


class mPLUG_Owl2(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = False

    def __init__(self, model_path='MAGAer13/mplug-owl2-llama2-7b', **kwargs):
        try:
            from mplug_owl2.model.builder import load_pretrained_model
            from mplug_owl2.mm_utils import get_model_name_from_path
        except Exception as e:
            logging.critical('Please install mPLUG_Owl2 before using mPLUG_Owl2. ')
            raise e

        model_name = get_model_name_from_path(model_path)
        tokenizer, model, image_processor, context_len = load_pretrained_model(
            model_path, None, model_name, load_8bit=False, load_4bit=False, device='cpu')

        self.model = model.cuda()
        self.device = self.model.device
        self.image_processor = image_processor
        tokenizer.padding_side = 'left'
        tokenizer.pad_token_id = tokenizer.eos_token_id
        self.tokenizer = tokenizer
        self.context_len = context_len

        kwargs_default = dict(
            max_new_tokens=512, do_sample=False, num_beams=1,
            min_new_tokens=1, length_penalty=1, num_return_sequences=1)
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if listinstr(['MMMU'], dataset):
            return False
        if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        if dataset == 'MMVet':
            prompt = question + '\nAnswer the question directly. '
        elif DATASET_TYPE(dataset) == 'MCQ':
            options = {
                cand: line[cand]
                for cand in string.ascii_uppercase
                if cand in line and not pd.isna(line[cand])
            }
            options_prompt = ''
            for key, item in options.items():
                options_prompt += f'{key}. {item}\n'

            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
            prompt = f'Hint: {hint}\n' if hint is not None else ''
            prompt += f'{question}\n'
            prompt += (
                f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
                if len(options) else 'Answer the question directly. '
            )
        else:
            raise NotImplementedError

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])
        return message

    def generate_inner(self, message, dataset=None):
        from mplug_owl2.constants import IMAGE_TOKEN_INDEX
        from mplug_owl2.mm_utils import process_images, tokenizer_image_token

        kwargs = cp.deepcopy(self.kwargs)
        if dataset in ['MMVet', 'LLaVABench']:
            kwargs['length_penalty'] = 0
        elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
            kwargs['length_penalty'] = 0
        elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
            kwargs['max_new_tokens'] = 10
        num_images = len([x for x in message if x['type'] == 'image'])
        assert num_images >= 0
        prompt_full = 'USER: '
        images = []
        if num_images == 1:
            prompt, image = self.message_to_promptimg(message, dataset=dataset)
            prompt_full += f'<|image|>{prompt}\nASSISTANT: '
            images.append(image)
        else:
            for msg in message:
                if msg['type'] == 'image':
                    images.append(msg['value'])
                    prompt_full += '<|image|>'
                elif msg['type'] == 'text':
                    prompt_full += msg['value']
            prompt_full += '\nASSISTANT: '

        def preproc_image(fname):
            image = Image.open(fname).convert('RGB')
            max_edge = max(image.size)
            image = image.resize((max_edge, max_edge))
            return image

        images = [preproc_image(fname) for fname in images]
        image_tensor = process_images(images, self.image_processor)
        image_tensor = image_tensor.to(self.device, dtype=torch.float16)

        input_ids = tokenizer_image_token(
            prompt_full, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
        ).unsqueeze(0).to(self.device)

        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids=input_ids,
                images=image_tensor,
                output_hidden_states=True,
                use_cache=True,
                **kwargs)
        answer = self.tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
        return answer.split('</s>')[0]
VLMEvalKit/vlmeval/vlm/mplug_owl3.py  (new file, 0 → 100644)

import torch
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
from torchvision import transforms
from transformers import AutoTokenizer, AutoModel
import io
import random
import numpy as np
import math


def get_frame_indices(num_frames, vlen, sample='rand', fix_start=None, input_fps=1, max_num_frames=-1):
    if sample in ['rand', 'middle']:
        acc_samples = min(num_frames, vlen)
        # split the video into `acc_samples` intervals, and sample from each interval.
        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = []
        for idx, interv in enumerate(intervals[:-1]):
            ranges.append((interv, intervals[idx + 1] - 1))
        if sample == 'rand':
            try:
                frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
            except:
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [x[0] + fix_start for x in ranges]
        elif sample == 'middle':
            frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
        else:
            raise NotImplementedError

        if len(frame_indices) < num_frames:
            # padded with last frame
            padded_frame_indices = [frame_indices[-1]] * num_frames
            padded_frame_indices[:len(frame_indices)] = frame_indices
            frame_indices = padded_frame_indices
    elif 'fps' in sample:
        # fps0.5, sequentially sample frames at 0.5 fps
        output_fps = float(sample[3:])
        duration = float(vlen) / input_fps
        delta = 1 / output_fps
        # gap between frames, this is also the clip length each frame represents
        frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        frame_indices = [e for e in frame_indices if e < vlen]
        if max_num_frames > 0 and len(frame_indices) > max_num_frames:
            frame_indices = frame_indices[:max_num_frames]
            # frame_indices = np.linspace(0 + delta / 2, duration + delta / 2, endpoint=False, num=max_num_frames)
    elif 'interval' in sample:
        if num_frames == 1:
            frame_indices = [random.randint(0, vlen - 1)]
        else:
            # transform FPS
            interval = 8
            clip_length = num_frames * interval * input_fps / 30
            max_idx = max(vlen - clip_length, 0)
            start_idx = random.uniform(0, max_idx)
            end_idx = start_idx + clip_length - 1
            frame_indices = torch.linspace(start_idx, end_idx, num_frames)
            frame_indices = torch.clamp(frame_indices, 0, vlen - 1).long().tolist()
    else:
        raise ValueError
    return frame_indices


def get_frame_indices_start_end(num_frames, vlen, fps, start_time, end_time):
    start_idx = max(int(fps * start_time), 0) if start_time is not None and not math.isnan(start_time) else 0
    end_idx = min(int(fps * end_time), vlen) if end_time is not None and not math.isnan(end_time) else vlen
    clip_len = end_idx - start_idx
    acc_samples = min(num_frames, clip_len)
    # split the video into `acc_samples` intervals, and sample from each interval.
    intervals = np.linspace(start=start_idx, stop=end_idx, num=acc_samples + 1).astype(int)
    ranges = []
    for idx, interv in enumerate(intervals[:-1]):
        ranges.append((interv, intervals[idx + 1] - 1))
    try:
        frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
    except:
        frame_indices = np.random.permutation(list(range(start_idx, end_idx)))[:acc_samples]
        frame_indices.sort()
        frame_indices = list(frame_indices)
    if len(frame_indices) < num_frames:
        # padded with last frame
        padded_frame_indices = [frame_indices[-1]] * num_frames
        padded_frame_indices[:len(frame_indices)] = frame_indices
        frame_indices = padded_frame_indices
    return frame_indices


def read_frames_decord(
    video_path, width=None, height=None, num_frames=8, sample='rand',
    fix_start=None, max_num_frames=-1, start_time=None, end_time=None
):
    import decord
    decord.bridge.set_bridge('torch')
    if video_path.lower().endswith('.webm'):
        # a workaround for webm, large/auto num_threads will cause error.
        num_threads = 2
    else:
        num_threads = 0
    if width is not None and height is not None:
        video_reader = decord.VideoReader(video_path, width=width, height=height, num_threads=num_threads)
    else:
        video_reader = decord.VideoReader(video_path, num_threads=num_threads)
    vlen = len(video_reader)
    fps = video_reader.get_avg_fps()
    if start_time and end_time:
        frame_indices = get_frame_indices_start_end(num_frames, vlen, fps, start_time, end_time)
    else:
        frame_indices = get_frame_indices(
            num_frames, vlen, sample=sample, fix_start=fix_start,
            input_fps=fps, max_num_frames=max_num_frames
        )
    frames = video_reader.get_batch(frame_indices)
    if isinstance(frames, torch.Tensor):
        frames = frames.numpy()  # (T, H, W, C), torch.uint8
    else:
        print(frames.shape)
        frames = frames.asnumpy()
    timestamp = {
        'num_frames': len(frame_indices),
        'timestamp': ', '.join([str(round(f / fps, 1)) for f in frame_indices])
    }
    return frames, timestamp


class mPLUG_Owl3(BaseModel):
    # No separate model module is required, but the dependencies must be met.
    # https://github.com/X-PLUG/mPLUG-Owl/blob/main/mPLUG-Owl3/requirements.txt
    INSTALL_REQ = True
    INTERLEAVE = True
    INSTALL_REQ_TXT = 'https://github.com/X-PLUG/mPLUG-Owl/blob/main/mPLUG-Owl3/requirements.txt'

    def __init__(self, model_path=None, **kwargs):
        assert model_path is not None
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModel.from_pretrained(
            model_path, attn_implementation='sdpa', torch_dtype=torch.half, trust_remote_code=True)
        self.model.eval().cuda()
        self.processor = self.model.init_processor(self.tokenizer)
        self.logger = get_logger('mPLUG_Owl3')
        if self.INSTALL_REQ:
            self.logger.info(
                f'Please remember to meet the requirements first\n'
                f'Here: {self.INSTALL_REQ_TXT}'
            )

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if listinstr(['MMMU'], dataset):
            return False
        if listinstr(['MVBench', 'MMVet'], dataset):
            return True
        return False

    def save_video_into_images(self, line, num_frames=16, dataset_class=None):
        video_url = {
            'video': osp.join(line['prefix'], line['video']),
            'num_frames': num_frames,
            'bound': line.get('bound', None)
        }
        if osp.isdir(video_url['video']):
            frame_paths = []
            max_frame = len(os.listdir(video_url['video']))
            fps = 3
            if video_url['bound']:
                start, end = line['start'], line['end']
            else:
                start, end = -100000, 100000
            start_idx = max(1, round(start * fps))
            end_idx = min(round(end * fps), max_frame)
            seg_size = float(end_idx - start_idx) / num_frames
            frame_indices = np.array([
                int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
                for idx in range(num_frames)
            ])
            for frame_index in frame_indices:
                img = os.path.join(video_url['video'], f'{frame_index:05d}.jpg')
                frame_paths.append(img)
            return frame_paths

        if isinstance(video_url, dict):
            if video_url['bound']:
                start_time = line['start']
                end_time = line['end']
            else:
                start_time = None
                end_time = None
            num_frames = video_url.get('num_frames', num_frames)
            video_url = video_url['video']
        else:
            start_time = None
            end_time = None
            video_url = str(video_url)
        if not osp.exists(video_url):
            # for MVBench_MP4
            video_url = osp.join(dataset_class.data_root, video_url)
        video, timestamp = read_frames_decord(
            video_url, num_frames=num_frames, sample='middle', start_time=start_time, end_time=end_time)
        to_pil = transforms.ToPILImage()
        frames = [to_pil(video[ti]) for ti in range(video.shape[0])]
        lmu_root = LMUDataRoot()
        frame_root = osp.join(lmu_root, 'images', dataset_class.dataset_name, 'mplug_owl3')
        frame_root = osp.join(frame_root, video_url.split('/')[-1].split('.')[0])
        os.makedirs(frame_root, exist_ok=True)
        frame_tmpl = 'frame-{}-of-{}.jpg'
        frame_paths = [
            osp.join(frame_root, frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)]
        for im, pth in zip(frames, frame_paths):
            if not osp.exists(pth):
                im.save(pth)
        return frame_paths

    # Currently same to mPLUG_Owl2
    def build_prompt(self, line, dataset=None, num_frames=16, video_llm=False):
        if not isinstance(dataset, str):
            dataset_class = dataset
            dataset = dataset_class.dataset_name
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        if dataset_class.MODALITY == 'VIDEO':
            if listinstr(['MVBench'], dataset):
                tgt_path = self.save_video_into_images(line, num_frames, dataset_class)
            else:
                tgt_path = dataset_class.save_video_into_images(line, num_frames)
            if type(line['candidates']) is not list:
                line['candidates'] = eval(line['candidates'])
            for idx, c in enumerate(line['candidates']):
                line[chr(ord('A') + idx)] = c
        else:
            tgt_path = self.dump_image(line, dataset)

        question = line['question']
        if dataset == 'MMVet':
            prompt = question + '\nAnswer the question directly. '
        elif listinstr(['MCQ', 'Video-MCQ'], DATASET_TYPE(dataset)):
            options = {
                cand: line[cand]
                for cand in string.ascii_uppercase
                if cand in line and not pd.isna(line[cand])
            }
            options_prompt = ''
            for key, item in options.items():
                options_prompt += f'{key}. {item}\n'

            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
            prompt = f'Hint: {hint}\n' if hint is not None else ''
            prompt += f'{question}\n'
            prompt += (
                f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
                if len(options) else 'Answer the question directly. '
            )
        else:
            raise NotImplementedError

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])
        return message

    def preproc_image(self, fname, dataset=None):
        from PIL import Image
        image = Image.open(fname).convert('RGB')
        # resize to max_size
        max_size = 448 * 16
        if max(image.size) > max_size and not listinstr(['MVBench'], dataset):
            w, h = image.size
            if w > h:
                new_w = max_size
                new_h = int(h * max_size / w)
            else:
                new_h = max_size
                new_w = int(w * max_size / h)
            image = image.resize((new_w, new_h), resample=Image.BICUBIC)
        return image

    def generate_inner(self, message, dataset=None):
        num_images = len([x for x in message if x['type'] == 'image'])
        assert num_images >= 0
        images = []
        prompt_full = ''
        for msg in message:
            if msg['type'] == 'image':
                images.append(msg['value'])
                prompt_full += '<|image|>'
            elif msg['type'] == 'text':
                prompt_full += msg['value']

        needed_messages = [
            {'role': 'user', 'content': prompt_full},
            {'role': 'assistant', 'content': ''}
        ]

        images = [self.preproc_image(fname, dataset) for fname in images]

        inputs = self.processor(needed_messages, images=images, videos=None, cut_enable=False)

        inputs.to('cuda')
        if listinstr(['MVBench'], dataset):
            inputs.update({
                'tokenizer': self.tokenizer,
                'max_new_tokens': 100,
                'decode_text': True,
                'do_sample': True,
                'top_k': 1,
            })
        else:
            inputs.update({
                'tokenizer': self.tokenizer,
                'max_new_tokens': 1024,
                'decode_text': True,
            })

        g = self.model.generate(**inputs)
        return g[0]
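get_frame_indices above turns a frame count and video length into concrete frame indices; with sample='middle' it takes the midpoint of each of num_frames equal intervals. A standalone sketch restating just that branch for a hypothetical 90-frame clip sampled at 8 frames (illustrative numbers, not from the commit):

import numpy as np

num_frames, vlen = 8, 90                     # hypothetical clip
acc_samples = min(num_frames, vlen)

intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
ranges = [(intervals[i], intervals[i + 1] - 1) for i in range(len(intervals) - 1)]
frame_indices = [(int(lo) + int(hi)) // 2 for lo, hi in ranges]
print(frame_indices)   # [5, 16, 27, 38, 50, 61, 72, 83]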
VLMEvalKit/vlmeval/vlm/nvlm.py  (new file, 0 → 100644)

import torch
from transformers import AutoTokenizer, AutoModel
import math
from PIL import Image
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def split_model():
    device_map = {}
    num_gpus = torch.cuda.device_count()
    rank, world_size = get_rank_and_world_size()
    num_gpus = num_gpus // world_size

    num_layers = 80
    # Since the first GPU will be used for ViT, treat it as half a GPU.
    num_layers_per_gpu = math.ceil(num_layers / (num_gpus - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for j in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = rank + i * world_size
            layer_cnt += 1
    device_map['vision_model'] = rank
    device_map['mlp1'] = rank
    device_map['language_model.model.embed_tokens'] = rank
    device_map['language_model.model.norm'] = rank
    device_map['language_model.model.rotary_emb'] = rank
    device_map['language_model.lm_head'] = rank
    device_map[f'language_model.model.layers.{num_layers - 1}'] = rank
    return device_map


def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform


def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio


def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num
    )
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size
    )

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images


def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


class NVLM(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='nvidia/NVLM-D-72B', **kwargs):
        assert model_path is not None
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
        kwargs_default = dict(max_new_tokens=1024, do_sample=False)
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default
        self.model = AutoModel.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            use_flash_attn=False,
            trust_remote_code=True,
            device_map=split_model()).eval()
        logging.info(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        torch.cuda.empty_cache()

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        pixel_values = load_image(image_path, max_num=6).to(torch.bfloat16).cuda()
        response = self.model.chat(self.tokenizer, pixel_values, prompt, self.kwargs)
        return response.strip()
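split_model above spreads the 80 transformer layers of NVLM-D-72B across the visible GPUs and gives the first GPU roughly half a share because it also hosts the vision tower. A standalone sketch of the per-GPU layer budget for a hypothetical single-process machine with 8 visible GPUs (the GPU count is an assumption for illustration):

import math

num_gpus, num_layers = 8, 80                          # hypothetical: world_size=1, 8 GPUs
per_gpu = math.ceil(num_layers / (num_gpus - 0.5))    # 11
budget = [per_gpu] * num_gpus
budget[0] = math.ceil(budget[0] * 0.5)                # first GPU: 6
print(budget)   # [6, 11, 11, 11, 11, 11, 11, 11]
# The assignment loop stops after layer 79, so the last GPU actually receives 8 layers.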
VLMEvalKit/vlmeval/vlm/omchat.py
0 → 100644
View file @
bc5ebf0f
import
torch
from
PIL
import
Image
import
re
from
transformers
import
AutoModel
,
AutoProcessor
from
.base
import
BaseModel
from
..smp
import
*
from
..dataset
import
DATASET_TYPE
class
OmChat
(
BaseModel
):
INSTALL_REQ
=
True
INTERLEAVE
=
True
def
__init__
(
self
,
model_path
=
'omlab/omchat-v2.0-13B-single-beta_hf'
,
**
kwargs
):
# Recommend to install `transformers==4.44.0`
assert
model_path
is
not
None
self
.
model_path
= model_path
        print(f'load from {self.model_path}')
        model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True, torch_dtype=torch.float16)
        self.model = model.cuda().eval()
        self.kwargs = kwargs
        self.processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=True)
        torch.cuda.empty_cache()
        # system prompt
        self.default_system_prompt = 'You are a helpful assistant. Focus on accuracy and reliability in your response.'
        self.new1_system_prompt = 'You are a helpful assistant.'
        self.new2_system_prompt = (
            'Read the following question carefully, '
            'solve it step by step, '
            'and then output the final answer in the format of '
            "'Answer: single number or single word or phrase'.\n\n"
        )
        # suffix_prompt for MCQ
        self.mcq_suffix_prompt_en = 'Please select the correct answer from the options above.\n'
        self.mcq_suffix_prompt_cn = '请直接回答选项字母。\n'
        # suffix_prompt for Y/N
        self.yorn_suffix_prompt = ' Please answer yes or no. Answer the question using a single word or phrase.'

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'Y/N':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)

        if isinstance(line, int):
            line = self.data.iloc[line]

        question = line['question']
        if DATASET_TYPE(dataset) == 'MCQ':
            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
            options = {
                cand: line[cand]
                for cand in string.ascii_uppercase
                if cand in line and not pd.isna(line[cand])
            }
            options_prompt = 'Options:\n'
            for key, item in options.items():
                options_prompt += f'{key}. {item}\n'
            prompt = ''
            if hint is not None:
                prompt += f'Hint: {hint}\n'
            prompt += f'Question: {question}\n'
            if len(options):
                prompt += options_prompt
                if not dataset.startswith('MMMU_'):
                    if not cn_string(prompt):
                        prompt += self.mcq_suffix_prompt_en
                    else:
                        prompt += self.mcq_suffix_prompt_cn
        elif DATASET_TYPE(dataset) == 'Y/N':
            prompt = question + self.yorn_suffix_prompt
        print(DATASET_TYPE(dataset))

        message = []
        if isinstance(tgt_path, list):
            message.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            message = [dict(type='image', value=tgt_path)]
        message.append(dict(type='text', value=prompt))
        return message

    def message_to_promptimg(self, message, dataset=None):
        if dataset is None or listinstr(['MMMU'], dataset):
            prompt = '\n'.join(
                [re.sub(r'<image\s*\d+>', '<image>', x['value']) for x in message if x['type'] == 'text']
            )
            image = [x['value'] for x in message if x['type'] == 'image']
        else:
            prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
            image = [x['value'] for x in message if x['type'] == 'image']
        return prompt, image

    def generate_inner(self, message, dataset=None):

        def replace_last_dot(input_string):
            if input_string.endswith('.'):
                return input_string[:-1]
            else:
                return input_string

        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        image = [Image.open(img_path).convert('RGB') for img_path in image_path]
        default_kwargs = dict(max_new_tokens=1024, do_sample=False, temperature=0.0, top_p=1)

        if dataset is not None and listinstr(['MathVista_MINI'], dataset):
            system_prompt = self.new2_system_prompt
        elif dataset is not None and listinstr(['MMMU_DEV_VAL', 'MMStar'], dataset):
            system_prompt = self.new1_system_prompt
        else:
            system_prompt = self.default_system_prompt

        inputs = self.processor(
            text=prompt, system_prompt=system_prompt, images=image, return_tensors='pt'
        ).to('cuda')
        default_kwargs.update(self.kwargs)
        with torch.inference_mode():
            output_ids = self.model.generate(
                **inputs,
                eos_token_id=self.model.generation_config.eos_token_id,
                **default_kwargs
            )
        res = self.processor.tokenizer.decode(output_ids[0, inputs.input_ids.shape[1]:]).strip()
        if '<|im_end|>' in res:
            res = res.split('<|im_end|>')[0].strip()
        if dataset != 'MMMU_DEV_VAL':
            if res.startswith('Answer: '):
                res = res[len('Answer: '):]
            match = re.search(r'\nThe answer is:(.+)', res)
            if match:
                res = match.group(1).strip()
        # for OCRBench
        doc_match = re.search(r'<doc>(.*?)<\/doc>', res)
        if doc_match:
            res = doc_match.group(1).strip()
        res = replace_last_dot(res)
        return res
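The MMMU branch of message_to_promptimg above collapses numbered placeholders such as `<image 1>` into the generic `<image>` token before joining the text chunks. A minimal standalone sketch of that normalization, not part of the file above (the sample message list is hypothetical):

import re

msgs = [
    dict(type='text', value='Compare <image 1> with <image 2>.'),
    dict(type='image', value='img1.jpg'),
    dict(type='image', value='img2.jpg'),
]
# keep only text chunks, rewrite '<image N>' -> '<image>', join with newlines
prompt = '\n'.join(
    re.sub(r'<image\s*\d+>', '<image>', x['value']) for x in msgs if x['type'] == 'text'
)
print(prompt)  # Compare <image> with <image>.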
VLMEvalKit/vlmeval/vlm/omnilmm.py
0 → 100644
import torch
from PIL import Image
from transformers import AutoTokenizer

from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE

DEFAULT_IMAGE_TOKEN = '<image>'
DEFAULT_IMAGE_PATCH_TOKEN = '<im_patch>'
DEFAULT_IM_START_TOKEN = '<im_start>'
DEFAULT_IM_END_TOKEN = '<im_end>'


def init_omni_lmm(model_path):
    from omnilmm.model.omnilmm import OmniLMMForCausalLM
    from omnilmm.utils import disable_torch_init
    from omnilmm.model.utils import build_transform

    torch.backends.cuda.matmul.allow_tf32 = True
    disable_torch_init()
    tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=2048)
    model = OmniLMMForCausalLM.from_pretrained(
        model_path, tune_clip=True, torch_dtype=torch.bfloat16, device_map='cpu'
    )
    model = model.to(device='cuda', dtype=torch.bfloat16)
    image_processor = build_transform(
        is_train=False, input_size=model.model.config.image_size, std_mode='OPENAI_CLIP'
    )
    mm_use_im_start_end = getattr(model.config, 'mm_use_im_start_end', False)
    assert mm_use_im_start_end
    tokenizer.add_tokens(
        [DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN],
        special_tokens=True,
    )
    vision_config = model.model.vision_config
    vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0]
    vision_config.use_im_start_end = mm_use_im_start_end
    vision_config.im_start_token, vision_config.im_end_token = (
        tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
    )
    image_token_len = model.model.config.num_query
    return model, image_processor, image_token_len, tokenizer


def expand_question_into_multimodal(question_text, image_token_len, im_st_token, im_ed_token, im_patch_token):
    if '<image>' in question_text[0]['content']:
        question_text[0]['content'] = question_text[0]['content'].replace(
            '<image>', im_st_token + im_patch_token * image_token_len + im_ed_token
        )
    else:
        question_text[0]['content'] = (
            im_st_token + im_patch_token * image_token_len + im_ed_token + '\n' + question_text[0]['content']
        )
    return question_text


def wrap_question_for_omni_lmm(question, image_token_len, tokenizer):
    from omnilmm.train.train_utils import omni_preprocess

    question = expand_question_into_multimodal(
        question, image_token_len, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN,
    )
    conversation = question
    data_dict = omni_preprocess(sources=[conversation], tokenizer=tokenizer, generation=True)
    data_dict = dict(input_ids=data_dict['input_ids'][0], labels=data_dict['labels'][0])
    return data_dict


class OmniLMM12B(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = False

    def __init__(self, model_path, root, **kwargs) -> None:
        sys.path.append(root)
        model, img_processor, image_token_len, tokenizer = init_omni_lmm(model_path)
        self.model = model
        self.image_token_len = image_token_len
        self.image_transform = img_processor
        self.tokenizer = tokenizer
        self.model.eval()
        default_kwargs = dict(
            max_new_tokens=512,
            do_sample=False,
            output_scores=True,
            return_dict_in_generate=True,
            repetition_penalty=1.1,
        )
        default_kwargs.update(kwargs)
        self.kwargs = default_kwargs
        torch.cuda.empty_cache()

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        try:
            image = Image.open(image_path).convert('RGB')
        except:
            logger = get_logger('OmniLMM Inference')
            logger.error('Image Decode Error')
            return 'Image Decode Error'

        msgs = [dict(role='user', content=prompt)]
        input_ids = wrap_question_for_omni_lmm(msgs, self.image_token_len, self.tokenizer)['input_ids']
        input_ids = torch.as_tensor(input_ids)
        image = self.image_transform(image)

        with torch.inference_mode():
            output = self.model.generate_vllm(
                input_ids=input_ids.unsqueeze(0).cuda(),
                images=image.unsqueeze(0).half().cuda(),
                **self.kwargs,
            )
            response = self.tokenizer.decode(output.sequences[0], skip_special_tokens=True)
            response = response.strip()
            return response

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if DATASET_TYPE(dataset) == 'MCQ':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        options_prompt = 'Options:\n'
        for key, item in options.items():
            options_prompt += f'{key}. {item}\n'

        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        prompt = ''
        if hint is not None:
            prompt += f'Hint: {hint}\n'
        prompt += f'{question}\n'
        if len(options):
            prompt += options_prompt
        prompt = (
            """
Study the image carefully and pick the option associated with the correct answer.
Focus solely on selecting the option and avoid including any other content.\n
""" + prompt
        )

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])
        return message
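The helper expand_question_into_multimodal above replaces the `<image>` placeholder with a run of patch tokens bracketed by start/end markers. A self-contained sketch of the same string construction, not part of the file above and using a toy image_token_len instead of the model's real num_query:

# '<image>' becomes <im_start> + <im_patch> * image_token_len + <im_end>
image_token_len = 3  # toy value for illustration
content = '<image>\nWhat is shown in the picture?'
expanded = content.replace('<image>', '<im_start>' + '<im_patch>' * image_token_len + '<im_end>')
print(expanded)
# <im_start><im_patch><im_patch><im_patch><im_end>
# What is shown in the picture?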
VLMEvalKit/vlmeval/vlm/open_flamingo.py
0 → 100644
import sys
import torch
from PIL import Image
import os.path as osp
import warnings
from .base import BaseModel
from ..smp import *
from huggingface_hub import snapshot_download


class OpenFlamingo(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = True

    def __init__(self, name, mpt_pth=None, ckpt_pth=None, **kwargs):
        if mpt_pth is None:
            raise ValueError(
                'Please set `mpt_pth` to the directory of MPT-7B, which is cloned from here: '
                'https://huggingface.co/mosaicml/mpt-7b. '
            )
        if ckpt_pth is None:
            raise ValueError(
                'Please set `ckpt_pth` to the openflamingo ckpt, which is the `checkpoint.pt` file downloaded '
                'from: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b/tree/main. '
            )
        else:
            if osp.exists(ckpt_pth):
                if ckpt_pth.endswith('checkpoint.pt'):
                    pass
                elif osp.isdir(ckpt_pth):
                    ckpt_pth = osp.join(ckpt_pth, 'checkpoint.pt')
                    if not osp.exists(ckpt_pth):
                        raise ValueError(f'File {ckpt_pth} does not exist. ')
            elif splitlen(ckpt_pth, '/') == 2:
                cache_path = get_cache_path(ckpt_pth)
                if cache_path is None:
                    snapshot_download(ckpt_pth)
                    cache_path = get_cache_path(ckpt_pth)
                if cache_path is None:
                    raise ValueError(f'Directory {cache_path} does not exist. ')
                else:
                    ckpt_pth = osp.join(cache_path, 'checkpoint.pt')

        self.name = name
        assert name in ['v2']
        self.mpt_pth = mpt_pth
        try:
            from open_flamingo import create_model_and_transforms
        except Exception as e:
            logging.critical('Please first install open_flamingo to use OpenFlamingo')
            raise e
        model, image_processor, tokenizer = create_model_and_transforms(
            clip_vision_encoder_path='ViT-L-14',
            clip_vision_encoder_pretrained='openai',
            lang_encoder_path=mpt_pth,
            tokenizer_path=mpt_pth,
            cross_attn_every_n_layers=4)
        ckpt = torch.load(ckpt_pth)
        model.load_state_dict(ckpt, strict=False)
        torch.cuda.empty_cache()
        self.model = model.eval().cuda()
        self.tokenizer = tokenizer
        self.tokenizer.padding_side = 'left'
        self.image_proc = image_processor
        kwargs_default = dict(max_new_tokens=512, num_beams=3)
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def generate_inner(self, message, dataset=None):
        vision_x = []
        prompt = ''
        for msg in message:
            if msg['type'] == 'image':
                img = Image.open(msg['value'])
                vision_x.append(self.image_proc(img).unsqueeze(0))
                prompt += '<image>'
            elif msg['type'] == 'text':
                prompt += msg['value']
        prompt += 'Answer: '
        vision_x = torch.cat(vision_x, dim=0) if len(vision_x) > 1 else vision_x[0]
        vision_x = vision_x.unsqueeze(1).unsqueeze(0)
        lang_x = self.tokenizer([prompt], return_tensors='pt')
        generated_text = self.model.generate(
            vision_x=vision_x.cuda(),
            lang_x=lang_x['input_ids'].cuda(),
            attention_mask=lang_x['attention_mask'].cuda(),
            **self.kwargs)
        generated_text = self.tokenizer.decode(generated_text[0])
        text = generated_text[len(prompt):].split('<|endofchunk|>')[0]
        return text
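generate_inner above stacks the per-image tensors and reshapes them into the layout open_flamingo expects, (batch, num_images, frames, channels, H, W). A self-contained sketch of that shaping, not part of the file above (the zero tensors stand in for preprocessed images):

import torch

frames = [torch.zeros(1, 3, 224, 224) for _ in range(2)]  # hypothetical preprocessed images
vision_x = torch.cat(frames, dim=0) if len(frames) > 1 else frames[0]  # (2, 3, 224, 224)
vision_x = vision_x.unsqueeze(1).unsqueeze(0)                          # (1, 2, 1, 3, 224, 224)
print(vision_x.shape)  # torch.Size([1, 2, 1, 3, 224, 224])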
VLMEvalKit/vlmeval/vlm/ovis.py
0 → 100644
import torch
from transformers import AutoModelForCausalLM

from .base import BaseModel
from ..dataset import DATASET_TYPE
from ..smp import *


class Ovis(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = True

    def __init__(self, model_path='AIDC-AI/Ovis1.5-Llama3-8B', **kwargs):
        assert model_path is not None
        # Recommend to install `transformers==4.43.2` and `torch==2.1.2`.
        self.model_path = model_path
        self.device = torch.cuda.current_device()
        self.dtype = torch.bfloat16
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            torch_dtype=self.dtype,
            multimodal_max_length=8192,
            trust_remote_code=True
        )
        self.model = self.model.eval().to(device=self.device)
        self.eos_token_id = self.model.generation_config.eos_token_id
        self.text_tokenizer = self.model.get_text_tokenizer()
        self.pad_token_id = self.text_tokenizer.pad_token_id
        self.visual_tokenizer = self.model.get_visual_tokenizer()
        self.conversation_formatter = self.model.get_conversation_formatter()
        self.image_placeholder = '<image>'
        self.gen_kwargs = dict(
            max_new_tokens=1024,
            do_sample=False,
            top_p=None,
            top_k=None,
            temperature=None,
            repetition_penalty=None,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
            use_cache=True
        )
        self.gen_kwargs.update(kwargs)

    def use_custom_prompt(self, dataset):
        if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert self.use_custom_prompt(dataset)
        assert isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        if DATASET_TYPE(dataset) == 'Y/N':
            prompt = self.build_yorn_prompt(line, dataset)
        elif DATASET_TYPE(dataset) == 'MCQ':
            prompt = self.build_multi_choice_prompt(line, dataset)
        else:
            raise RuntimeError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')
        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])

        # interleave dataset
        if dataset.startswith('MMMU_'):
            from .. import MMMUDataset
            message = MMMUDataset.split_MMMU(message)

        return message

    def build_yorn_prompt(self, line, dataset=None):
        prompt = line['question']
        if listinstr(['HallusionBench'], dataset):
            prompt += ' Please answer yes or no.'
        prompt += '\n请用单个词或短语回答问题。' if cn_string(prompt) else '\nAnswer the question using a single word or phrase.'
        return prompt

    def build_multi_choice_prompt(self, line, dataset=None):
        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        if len(options):
            prompt += '\n请直接回答选项字母。' if cn_string(prompt) else "\nAnswer with the option's letter from the given choices directly."
        else:
            prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'

        return prompt

    def generate_inner(self, message, dataset=None):
        prompt, input_ids, attention_mask, pixel_values = self.prepare_inputs(message)
        output_ids = self.model.generate(
            input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            **self.gen_kwargs
        )
        response = self.text_tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
        return response

    def prepare_inputs(self, message):
        # build query
        images = [x['value'] for x in message if x['type'] == 'image']
        texts = [x['value'] for x in message if x['type'] == 'text']
        if len(images) == 0:
            query = '\n'.join(texts)
        elif len(images) == 1 and len(texts) == 1:
            query = self.image_placeholder + '\n' + texts[0]
        else:
            # interleave sample
            chunks = [
                x['value'] if x['type'] == 'text' else self.image_placeholder for x in message
            ]
            query = '\n'.join(chunks)

        # format conversation
        prompt, input_ids = self.conversation_formatter.format_query(query)
        attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id)
        input_ids = input_ids.unsqueeze(0).to(device=self.device)
        attention_mask = attention_mask.unsqueeze(0).to(device=self.device)

        # preprocess images
        if len(images) == 0:
            pixel_values = [None]
        else:
            preprocessed_images = [
                self.visual_tokenizer.preprocess_image(Image.open(image)) for image in images
            ]
            pixel_values = [
                torch.cat(preprocessed_images, dim=0).to(device=self.device, dtype=self.dtype)
            ]

        return prompt, input_ids, attention_mask, pixel_values


class Ovis1_6(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = True

    def __init__(self, model_path='AIDC-AI/Ovis1.6-Gemma2-9B', **kwargs):
        assert model_path is not None
        # Recommend to install `python=3.10`, `transformers==4.44.2`, `torch==2.2.0`, and `numpy==1.24.3`
        self.model_path = model_path
        self.device = torch.cuda.current_device()
        self.dtype = torch.bfloat16
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            torch_dtype=self.dtype,
            multimodal_max_length=8192,
            trust_remote_code=True
        )
        self.model = self.model.eval().to(device=self.device)
        self.eos_token_id = self.model.generation_config.eos_token_id
        self.text_tokenizer = self.model.get_text_tokenizer()
        self.pad_token_id = self.text_tokenizer.pad_token_id
        self.visual_tokenizer = self.model.get_visual_tokenizer()
        self.max_partition = 9
        self.image_placeholder = '<image>'
        self.gen_kwargs = dict(
            max_new_tokens=1024,
            do_sample=False,
            top_p=None,
            top_k=None,
            temperature=None,
            repetition_penalty=None,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
            use_cache=True
        )
        self.gen_kwargs.update(kwargs)

    def use_custom_prompt(self, dataset):
        if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ':
            return True
        return False

    def build_yorn_prompt(self, line, dataset=None):
        prompt = line['question'] + '\nAnswer the question using a single word or phrase.'
        return prompt

    def build_multi_choice_prompt(self, line, dataset=None):
        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        if len(options):
            prompt += "\nAnswer with the option's letter from the given choices directly."

        return prompt

    def build_prompt(self, line, dataset=None):
        assert self.use_custom_prompt(dataset)
        assert isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        if DATASET_TYPE(dataset) == 'Y/N':
            prompt = self.build_yorn_prompt(line, dataset)
        elif DATASET_TYPE(dataset) == 'MCQ':
            prompt = self.build_multi_choice_prompt(line, dataset)
        else:
            raise RuntimeError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')
        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])

        # interleave dataset
        if dataset.startswith('MMMU_'):
            from .. import MMMUDataset
            message = MMMUDataset.split_MMMU(message)

        return message

    def generate_inner(self, message, dataset=None):
        prompt, input_ids, attention_mask, pixel_values = self.prepare_inputs(message)
        output_ids = self.model.generate(
            input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            **self.gen_kwargs
        )
        response = self.text_tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return response

    def prepare_inputs(self, message):
        # build query
        images = [x['value'] for x in message if x['type'] == 'image']
        texts = [x['value'] for x in message if x['type'] == 'text']
        if len(images) == 0:
            query = '\n'.join(texts)
        elif len(images) == 1 and len(texts) == 1:
            query = self.image_placeholder + '\n' + texts[0]
        else:
            # interleaved sample
            chunks = [
                x['value'] if x['type'] == 'text' else self.image_placeholder for x in message
            ]
            query = '\n'.join(chunks)

        # preprocess inputs
        prompt, input_ids, pixel_values = self.model.preprocess_inputs(
            query, [Image.open(image) for image in images], max_partition=self.max_partition
        )

        # move to self.device
        attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id)
        input_ids = input_ids.unsqueeze(0).to(device=self.device)
        attention_mask = attention_mask.unsqueeze(0).to(device=self.device)
        pixel_values = [
            pixel_values.to(device=self.device, dtype=self.dtype) if pixel_values is not None else None
        ]

        return prompt, input_ids, attention_mask, pixel_values


class Ovis1_6_Plus(Ovis1_6):
    # Recommend to install `python=3.10`, `transformers==4.46.2`, `torch==2.4.0`, and `numpy==1.25.0`

    def build_mmmu_prompt(self, line, dataset: str) -> list[dict[str, str]]:
        import string
        import pandas as pd
        question = line['question']
        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        options_prompt = 'Options:\n'
        for key, item in options.items():
            options_prompt += f'{key}. {item}\n'
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        prompt = ''
        if hint is not None:
            prompt += f'Hint: {hint}\n'
        prompt += f'Question: {question}\n'
        if len(options):
            prompt += options_prompt
            prompt += 'Please select the correct answer from the options above.'
        prompt = prompt.rstrip()
        return prompt

    def build_prompt(self, line, dataset=None):
        assert self.use_custom_prompt(dataset)
        assert isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        if dataset.startswith('MMMU_'):
            prompt = self.build_mmmu_prompt(line, dataset)
        elif DATASET_TYPE(dataset) == 'Y/N':
            prompt = self.build_yorn_prompt(line, dataset)
        elif DATASET_TYPE(dataset) == 'MCQ':
            prompt = self.build_multi_choice_prompt(line, dataset)
        else:
            raise RuntimeError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')

        message = [dict(type='image', value=s) for s in tgt_path] + [dict(type='text', value=prompt)]
        return message
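prepare_inputs above builds a single query string from an interleaved message: text chunks are kept, image chunks are replaced by the '<image>' placeholder, and everything is joined with newlines. A self-contained sketch of that construction, not part of the file above (the message list is hypothetical):

message = [
    dict(type='image', value='page1.png'),
    dict(type='text', value='What does the chart show?'),
    dict(type='image', value='page2.png'),
]
image_placeholder = '<image>'
chunks = [x['value'] if x['type'] == 'text' else image_placeholder for x in message]
query = '\n'.join(chunks)
print(query)
# <image>
# What does the chart show?
# <image>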
VLMEvalKit/vlmeval/vlm/paligemma.py
0 → 100644
from PIL import Image
import torch

from .base import BaseModel
from ..smp import *


class PaliGemma(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='google/paligemma-3b-mix-448', **kwargs):
        try:
            from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
        except Exception as e:
            logging.critical('Please install the latest version transformers.')
            raise e
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            device_map='cpu',
            revision='bfloat16',
        ).eval()
        self.model = model.cuda()
        self.processor = AutoProcessor.from_pretrained(model_path)
        self.kwargs = kwargs

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        image = Image.open(image_path).convert('RGB')

        model_inputs = self.processor(text=prompt, images=image, return_tensors='pt').to('cuda')
        input_len = model_inputs['input_ids'].shape[-1]

        with torch.inference_mode():
            generation = self.model.generate(**model_inputs, max_new_tokens=512, do_sample=False)
            generation = generation[0][input_len:]
            res = self.processor.decode(generation, skip_special_tokens=True)
        return res
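generate_inner above decodes only the newly generated tokens: since the generated sequence echoes the prompt ids, it is sliced from input_len onward. A self-contained sketch of that pattern, not part of the file above (the tensors are hypothetical token ids):

import torch

input_ids = torch.tensor([[101, 7, 8, 9]])            # hypothetical prompt ids
generation = torch.tensor([[101, 7, 8, 9, 42, 43]])   # prompt ids followed by 2 new tokens
input_len = input_ids.shape[-1]
new_tokens = generation[0][input_len:]
print(new_tokens.tolist())  # [42, 43]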
VLMEvalKit/vlmeval/vlm/pandagpt.py
0 → 100644
import sys
import torch
import os.path as osp
import warnings
from .base import BaseModel
from ..smp import *


class PandaGPT(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = False

    def __init__(self, name, root=None, **kwargs):
        if root is None:
            raise ValueError('Please set `root` to PandaGPT code directory, which is cloned from here: ')

        assert name == 'PandaGPT_13B'
        self.name = name
        sys.path.append(osp.join(root, 'code'))
        try:
            from model.openllama import OpenLLAMAPEFTModel
        except Exception as e:
            logging.critical(
                'Please first install PandaGPT and set the root path to use PandaGPT, '
                'which is cloned from here: https://github.com/yxuansu/PandaGPT. '
            )
            raise e
        self.args = {
            'model': 'openllama_peft',
            'imagebind_ckpt_path': osp.join(root, 'pretrained_ckpt/imagebind_ckpt'),
            'vicuna_ckpt_path': osp.join(root, 'pretrained_ckpt/vicuna_ckpt/13b_v0'),
            'delta_ckpt_path': osp.join(root, 'pretrained_ckpt/pandagpt_ckpt/13b/pytorch_model.pt'),
            'stage': 2,
            'max_tgt_len': 512,
            'lora_r': 32,
            'lora_alpha': 32,
            'lora_dropout': 0.1,
        }
        model = OpenLLAMAPEFTModel(**self.args)
        delta_ckpt = torch.load(self.args['delta_ckpt_path'], map_location=torch.device('cpu'))
        model.load_state_dict(delta_ckpt, strict=False)
        torch.cuda.empty_cache()
        self.model = model.eval().half().cuda()
        kwargs_default = {'top_p': 0.9, 'do_sample': False, 'max_tgt_len': 128, 'temperature': 0.001}
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        struct = {
            'prompt': prompt,
            'image_paths': [image_path],
            'audio_paths': [],
            'video_paths': [],
            'thermal_paths': [],
            'modality_embeds': []
        }
        struct.update(self.kwargs)
        resp = self.model.generate(struct)
        return resp
VLMEvalKit/vlmeval/vlm/parrot.py
0 → 100644
import os
import torch
from PIL import Image
from abc import abstractproperty
from .base import BaseModel
from ..dataset import DATASET_TYPE
from ..smp import *


class Parrot(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='AIDC-AI/Parrot-7B', **kwargs):
        try:
            from parrot.model.parrot_arch import ParrotMetaForCausalLM
            from parrot.utils.constants import DEFAULT_IMAGE_TOKEN, BEGIN_LINE, END_LINE
            from parrot.model.conversation_formatter import ConversationFormatter
            from parrot.utils.mm_utils import process_images
        except Exception as e:
            logging.critical('Please install Parrot before using Parrot')
            logging.critical('Please install Parrot from https://github.com/AIDC-AI/Parrot')
            logging.critical('Using `pip install -e . --no-deps` in the Parrot directory')
            logging.critical('Recommend to install transformers==4.39.0')
            raise e

        self.process_images = process_images
        self.ConversationFormatter = ConversationFormatter
        self.DEFAULT_IMAGE_TOKEN = DEFAULT_IMAGE_TOKEN
        self.BEGIN_LINE = BEGIN_LINE
        self.END_LINE = END_LINE

        try:
            model_name = 'parrot_qwen2'
            model, tokenizer, conversation_formatter = ParrotMetaForCausalLM.build(
                model_name, model_path, mm_vision_tower='openai/clip-vit-large-patch14-336'
            )
            self.model = model.cuda()
            self.vision_tower = self.model.get_vision_tower()
            self.tokenizer = tokenizer
            self.conversation_formatter = conversation_formatter
            self.image_processor = self.model.get_vision_tower().image_processor
        except Exception as e:
            logging.critical('Error when loading Parrot model:')
            raise e

        self.kwargs = dict(
            do_sample=False,
            num_beams=1,
            max_new_tokens=512,
            repetition_penalty=None,
            use_cache=True,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id
        )
        if int(os.environ.get('LOCAL_RANK', '0')) == 0:
            print(f'Following kwargs {self.kwargs} will be used as generation config.')
        self.count = 0

    def use_custom_prompt(self, dataset):
        if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert self.use_custom_prompt(dataset)
        assert isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        if DATASET_TYPE(dataset) == 'Y/N':
            prompt = self.built_yorn_prompt(line, dataset)
        elif DATASET_TYPE(dataset) == 'MCQ':
            prompt = self.build_multi_choice_prompt(line, dataset)
        else:
            raise ValueError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=p) for p in tgt_path])
        return message

    def built_yorn_prompt(self, line, dataset=None):
        prompt = line['question']
        previous_suffixs = [' Please answer yes or no.', ' Yes or No', ' Answer in one sentence.']
        for previous_suffix in previous_suffixs:
            if prompt.endswith(previous_suffix):
                prompt = prompt[:-len(previous_suffix)]
                break
        prompt += '\n请直接回答Yes或No。请用单个词或短语回答问题。' if cn_string(prompt) \
            else '\nPlease strictly answer Yes or No. Answer the question using a single word or phrase.'
        return prompt

    def build_multi_choice_prompt(self, line, dataset=None):
        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        if len(options):
            default_prompt = "\nAnswer with the option's letter from the given choices directly."
            if dataset[-3:] == '_cn' or cn_string(prompt):
                default_prompt = '\n请直接用给定选项中的选项字母回答。'
            elif dataset[-3:] == '_pt':
                default_prompt = '\nResponda diretamente com a letra da opção das escolhas dadas.'
            elif dataset[-3:] == '_ar':
                default_prompt = '\nأجب مباشرةً بحرف الخيار من الاختيارات المعطاة.'
            elif dataset[-3:] == '_ru':
                default_prompt = '\nОтветьте буквой варианта из предложенных вариантов напрямую.'
            elif dataset[-3:] == '_tr':
                default_prompt = '\nVerilen seçeneklerden doğrudan seçeneğin harfi ile cevap verin.'
            prompt += default_prompt
            # prompt += (
            #     '\n请直接回答选项字母。' if cn_string(prompt) else
            #     "\nAnswer with the option's letter from the given choices directly."
            # )
        else:
            prompt += '\n请用单个词或短语回答问题。' if cn_string(prompt) else '\nAnswer the question using a single word or phrase.'

        return prompt

    def process_answer_prefix(self, answer, prefixes):
        for prefix in prefixes:
            if prefix in answer.lower():
                return answer[answer.lower().find(prefix) + len(prefix):]
        return answer

    def generate_inner(self, message, dataset=None):
        query, image_paths = self.prepare_inputs(message)
        images_list = [Image.open(image_path).convert('RGB') for image_path in image_paths]
        args = abstractproperty()
        args.image_aspect_ratio = 'pad'
        image_tensors = self.process_images(images_list, self.image_processor, args).cuda()
        prompt, input_ids = self.conversation_formatter.format_query(query)
        input_ids = input_ids.unsqueeze(0).cuda()

        with torch.inference_mode():
            kwargs = dict(images=image_tensors)
            kwargs.update(self.kwargs)
            output_ids = self.model.generate(input_ids, **kwargs)

        input_token_len = input_ids.shape[1]
        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
        if n_diff_input_output > 0:
            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
        response = self.tokenizer.batch_decode(
            output_ids[:, input_token_len:], skip_special_tokens=True
        )[0].strip(string.whitespace)

        answer = response
        if query.endswith("Answer with the option's letter from the given choices directly.") \
                or query.endswith('请直接回答选项字母。'):
            qtype = 'multiple-choice'
            while True:
                answer = answer.strip(string.punctuation + string.whitespace)
                if len(answer) > 1:
                    if answer[0] in string.ascii_uppercase and answer[1] in string.whitespace + string.punctuation:
                        answer = answer[0]
                        break
                    elif answer[-1] in string.ascii_uppercase and answer[-2] in string.whitespace + string.punctuation:
                        answer = answer[-1]
                        break
                    elif listinstr(['answer is', 'answer:'], answer.lower()):
                        answer = self.process_answer_prefix(answer, ['answer is', 'answer:'])
                        answer = self.process_answer_prefix(answer, ['option'])
                    else:
                        break
                else:
                    break
        else:
            qtype = 'open'

        if self.count % 50 == 0 and int(os.environ.get('LOCAL_RANK', '0')) == 0:
            print(f'\n{self.BEGIN_LINE}')
            print(f'image_paths: {image_paths}\n')
            print(f'prompt: {prompt}\n')
            print(f'qtype: {qtype}\n')
            print(f'output: {response}\n')
            print(f'answer: {answer}\n')
            print(f'{self.END_LINE}\n', flush=True)
        self.count += 1
        return answer

    def prepare_inputs(self, message):
        prompt = ''
        image_paths = []
        image_count = 0
        text_count = 0
        pure_text = ''
        for msg in message:
            if msg['type'] == 'text':
                text_count += 1
                prompt += msg['value']
                pure_text += msg['value']
            elif msg['type'] == 'image':
                image_count += 1
                prompt += self.DEFAULT_IMAGE_TOKEN
                image_paths.append(msg['value'])
        if image_count == 1 and text_count == 1:
            prompt = self.DEFAULT_IMAGE_TOKEN + '\n' + pure_text
        return prompt, image_paths
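The multiple-choice branch of generate_inner above strips punctuation and peels common prefixes such as "answer is" until a bare option letter remains. A simplified, self-contained sketch of that extraction, not part of the file above (the raw response string is hypothetical):

import string

answer = 'The answer is: B.'
answer = answer.strip(string.punctuation + string.whitespace)
if 'answer is' in answer.lower():
    # drop everything up to and including the 'answer is' prefix
    answer = answer[answer.lower().find('answer is') + len('answer is'):]
answer = answer.strip(string.punctuation + string.whitespace)
print(answer)  # B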
VLMEvalKit/vlmeval/vlm/phi3_vision.py
0 → 100644
from PIL import Image
import torch
from .base import BaseModel
from ..smp import *


class Phi3Vision(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='microsoft/Phi-3-vision-128k-instruct', **kwargs):
        try:
            from transformers import AutoProcessor, AutoModelForCausalLM
        except Exception as e:
            logging.critical('Please install the latest version transformers.')
            raise e
        model = AutoModelForCausalLM.from_pretrained(
            model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto'
        ).eval()
        processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
        self.model = model
        self.processor = processor
        self.kwargs = kwargs

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        image = Image.open(image_path).convert('RGB')
        messages = [
            {'role': 'user', 'content': f'<|image_1|>\n{prompt}'}
        ]
        prompt = self.processor.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.processor(prompt, [image], return_tensors='pt').to('cuda')

        generation_args = {
            'max_new_tokens': 500,
            'temperature': 0.0,
            'do_sample': False,
        }
        generation_args.update(self.kwargs)

        generate_ids = self.model.generate(
            **inputs,
            eos_token_id=self.processor.tokenizer.eos_token_id,
            **generation_args
        )
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = self.processor.batch_decode(
            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
        return response

    def chat_inner(self, message, dataset=None):
        messages = []
        image_cnt = 1
        image_list = []
        for msg in message:
            content = ''
            # If message is just text in the conversation
            if len(msg['content']) == 1 and msg['content'][0]['type'] == 'text':
                msg_new = {'role': msg['role'], 'content': msg['content'][0]['value']}
                messages.append(msg_new)
                continue
            # If both image & text is present
            for x in msg['content']:
                if x['type'] == 'text':
                    content += x['value']
                elif x['type'] == 'image':
                    image = Image.open(x['value']).convert('RGB')
                    content += f'<|image_{image_cnt}|>\n'
                    image_list.append(image)
                    image_cnt += 1
            msg_new = {'role': msg['role'], 'content': content}
            messages.append(msg_new)

        prompt = self.processor.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.processor(prompt, image_list, return_tensors='pt').to('cuda')

        generation_args = {
            'max_new_tokens': 500,
            'temperature': 0.0,
            'do_sample': False,
        }
        generation_args.update(self.kwargs)

        generate_ids = self.model.generate(
            **inputs,
            eos_token_id=self.processor.tokenizer.eos_token_id,
            **generation_args
        )
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = self.processor.batch_decode(
            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
        return response


class Phi3_5Vision(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='microsoft/Phi-3.5-vision-instruct', **kwargs):
        try:
            from transformers import AutoProcessor, AutoModelForCausalLM
        except Exception as e:
            logging.critical('Please install the latest version transformers.')
            raise e
        model = AutoModelForCausalLM.from_pretrained(
            model_path, device_map='cuda', trust_remote_code=True,
            torch_dtype='auto', _attn_implementation='flash_attention_2'
        ).eval()
        # for best performance, use num_crops=4 for multi-frame, num_crops=16 for single-frame.
        processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True, num_crops=4)
        self.model = model
        self.processor = processor
        self.kwargs = kwargs

    def generate_inner(self, message, dataset=None):
        prompt = '\n'.join([msg['value'] for msg in message if msg['type'] == 'text'])
        images = [Image.open(msg['value']).convert('RGB') for msg in message if msg['type'] == 'image']
        num_images = len(images)
        placeholder = ''
        for i in range(1, num_images + 1):
            placeholder += f'<|image_{i}|>\n'
        messages = [
            {'role': 'user', 'content': placeholder + prompt}
        ]
        prompt = self.processor.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.processor(prompt, images, return_tensors='pt').to('cuda')

        generation_args = {
            'max_new_tokens': 1000,
            'temperature': 0.0,
            'do_sample': False,
        }
        generation_args.update(self.kwargs)

        generate_ids = self.model.generate(
            **inputs,
            eos_token_id=self.processor.tokenizer.eos_token_id,
            **generation_args
        )
        # remove input tokens
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = self.processor.batch_decode(
            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
        return response
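Phi3_5Vision.generate_inner above prefixes the user prompt with one numbered placeholder per image, `<|image_1|>` through `<|image_N|>`. A self-contained sketch of that loop, not part of the file above (the image count is hypothetical):

num_images = 3  # hypothetical number of images in the message
placeholder = ''
for i in range(1, num_images + 1):
    placeholder += f'<|image_{i}|>\n'
print(placeholder)
# <|image_1|>
# <|image_2|>
# <|image_3|>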
VLMEvalKit/vlmeval/vlm/pixtral.py
0 → 100644
import torch
from PIL import Image
from .base import BaseModel
from ..smp import *
import warnings
from huggingface_hub import snapshot_download


class Pixtral(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = True

    def __init__(self, model_path='mistralai/Pixtral-12B-2409', **kwargs):
        self.model_path = model_path
        try:
            from mistral_inference.transformer import Transformer
            from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
        except ImportError as err:
            logging.critical('Please install `mistral-inference` and `mistral_common`')
            raise err

        if os.path.exists(model_path):
            cache_path = model_path
        else:
            if get_cache_path(model_path) is None:
                snapshot_download(repo_id=model_path)
            cache_path = get_cache_path(self.model_path)

        self.tokenizer = MistralTokenizer.from_file(f'{cache_path}/tekken.json')
        model = Transformer.from_folder(cache_path, device='cpu')
        model.cuda()
        self.model = model
        self.max_tokens = 512

    def generate_inner(self, message, dataset=None):
        try:
            from mistral_inference.generate import generate
            from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageURLChunk
            from mistral_common.protocol.instruct.request import ChatCompletionRequest
        except ImportError as err:
            logging.critical('Please install `mistral-inference` and `mistral_common`')
            raise err

        msg_new = []
        for msg in message:
            tp, val = msg['type'], msg['value']
            if tp == 'text':
                msg_new.append(TextChunk(text=val))
            elif tp == 'image':
                b64 = encode_image_file_to_base64(val)
                image_url = f'data:image/jpeg;base64,{b64}'
                msg_new.append(ImageURLChunk(image_url=image_url))

        completion_request = ChatCompletionRequest(messages=[UserMessage(content=msg_new)])
        encoded = self.tokenizer.encode_chat_completion(completion_request)
        images = encoded.images
        tokens = encoded.tokens

        out_tokens, _ = generate(
            [tokens],
            self.model,
            images=[images],
            max_tokens=self.max_tokens,
            temperature=0,
            eos_id=self.tokenizer.instruct_tokenizer.tokenizer.eos_id
        )
        result = self.tokenizer.decode(out_tokens[0])
        return result
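Image chunks above are passed to the tokenizer as base64 data URLs. A self-contained sketch of that URL construction, not part of the file above; encode_image_file_to_base64 comes from this repo's ..smp module, so plain base64 is used here instead, and the image path is hypothetical:

import base64

with open('example.jpg', 'rb') as f:  # hypothetical local image
    b64 = base64.b64encode(f.read()).decode('utf-8')
image_url = f'data:image/jpeg;base64,{b64}'
print(image_url[:40] + '...')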
VLMEvalKit/vlmeval/vlm/points.py
0 → 100644
import transformers
from PIL import Image
import torch
import re
from .base import BaseModel
from ..dataset import DATASET_TYPE
from ..smp import cn_string, listinstr
import pandas as pd
import string
from typing import List


class POINTS(BaseModel):
    """Official implementation of POINTS: Improving Your Vision-language Model with Affordable Strategies  # noqa

    Paper link: https://arxiv.org/abs/2409.04828
    POINTS is a vision-language model developed by researchers at WeChat AI. This model represents the
    inaugural version in our series of multimodal models, known as WePOINTS.

    Args:
        model_path (str): The path or the name (the unique huggingface id) of the model.
    """

    def __init__(self, model_path: str, **kwargs) -> None:
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from transformers import CLIPImageProcessor
        version = transformers.__version__
        use_fast = True
        if 'yi' in model_path.lower():
            assert version == '4.38.2', f'The version of transformers for Yi-1.5 should be 4.38.2, but got {version}.'  # noqa
            use_fast = False
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=use_fast)
        self.model = AutoModelForCausalLM.from_pretrained(model_path,
                                                          trust_remote_code=True,  # noqa
                                                          device_map='cuda').to(torch.bfloat16)
        self.image_processor = CLIPImageProcessor.from_pretrained(model_path)

    def use_custom_prompt(self, dataset: str) -> bool:
        """Whether to use custom prompt for the dataset.

        Args:
            dataset (str): The name of the dataset.

        Returns:
            bool: Whether to use custom prompt for the dataset.
        """
        if DATASET_TYPE(dataset) == 'MCQ':
            return True
        return False

    def build_prompt(self, line: str, dataset: str) -> List[dict]:
        """Build prompt for multi-choice dataset.

        Args:
            line (str): one line of the dataset.
            dataset (str): The name of the dataset.

        Returns:
            List[dict]: A list of elements constructed for current line.
        """
        assert self.use_custom_prompt(dataset)
        assert isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        if len(options):
            prompt += (
                '\n请直接回答选项字母。' if cn_string(prompt) else  # noqa
                "\nAnswer with the option\'s letter from the given choices directly."  # noqa
            )
        else:
            prompt += '\n请直接回答问题。' if cn_string(  # noqa
                prompt) else '\nAnswer the question directly.'
        message = [dict(type='image', value=s) for s in tgt_path]
        message.append(dict(type='text', value=prompt))
        return message

    def generate_inner(self, message: List[dict], dataset: str = None) -> str:
        """Generate response for the given message.

        Args:
            message (List[dict]): A list of elements constructed for
                current line.
            dataset (str): The name of the dataset.

        Returns:
            str: The generated response.
        """
        prompt, image_path = self.message_to_promptimg(message)
        catty = True  # whether to use catty
        if dataset == 'HallusionBench':
            prompt = prompt + \
                ' Please answer yes or no. Answer the question using a single word or phrase.'  # noqa
        elif dataset == 'MMVet':
            prompt = prompt + ' Answer this question in detail.'
            catty = False
        else:
            # use default setting
            pass

        if dataset is None:
            max_splits = 8
        elif listinstr(['MMBench', 'OCRBench'], dataset):
            max_splits = 12
        else:
            max_splits = 8

        image = Image.open(image_path).convert('RGB')
        generation_config = {
            'max_new_tokens': 1024,
            'temperature': 0.0,
            'top_p': 0.0,
            'num_beams': 1,
        }
        response = self.model.chat(
            image, prompt, self.tokenizer, self.image_processor, catty, generation_config, max_splits
        )
        return response


class POINTSV15(BaseModel):
    """Official implementation of POINTSv1.5

    This implementation is based on the official implementation of POINTSv1.5
    (https://github.com/WePOINTS/WePOINTS)

    Args:
        model_path (str): The path or the name (the unique huggingface id)
            of the model.
    """

    def __init__(self, model_path: str, **kwargs) -> None:
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from transformers import QuantoConfig
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        quant_config = QuantoConfig(modules_to_not_convert=['vision_encoder'])
        self.model = AutoModelForCausalLM.from_pretrained(model_path,
                                                          trust_remote_code=True,  # noqa
                                                          device_map='cuda',
                                                          torch_dtype=torch.bfloat16,
                                                          quantization_config=quant_config)
        try:
            from wepoints.utils.images import Qwen2ImageProcessorForPOINTSV15
        except ImportError:
            print('Please install WePOINTS, and refer to https://github.com/WePOINTS/WePOINTS')
        self.image_processor = Qwen2ImageProcessorForPOINTSV15.from_pretrained(model_path)  # noqa

    def use_custom_prompt(self, dataset: str) -> bool:
        """Whether to use custom prompt for the dataset.

        Args:
            dataset (str): The name of the dataset.

        Returns:
            bool: Whether to use custom prompt for the dataset.
        """
        if DATASET_TYPE(dataset) == 'MCQ':
            return True
        return False

    def build_prompt(self, line: str, dataset: str) -> List[dict]:
        """Build prompt for multi-choice dataset.

        Args:
            line (str): one line of the dataset.
            dataset (str): The name of the dataset.

        Returns:
            List[dict]: A list of elements constructed for current line.
        """
        assert self.use_custom_prompt(dataset)
        assert isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        if len(options):
            prompt += (
                '\n请直接回答选项字母。' if cn_string(prompt) else  # noqa
                "\nAnswer with the option\'s letter from the given choices directly."  # noqa
            )
        else:
            prompt += '\n请直接回答问题。' if cn_string(  # noqa
                prompt) else '\nAnswer the question directly.'
        message = [dict(type='image', value=s) for s in tgt_path]
        message.append(dict(type='text', value=prompt))
        return message

    def set_image_processor(self, dataset: str) -> None:
        """Set the image processor for the dataset.

        Args:
            dataset (str): The name of the dataset.
        """
        if dataset in ['OCRBench']:
            self.image_processor.min_pixels = 280 * 280
        elif dataset in ['MMMU_DEV_VAL']:
            self.image_processor.min_pixels = 1280 * 28 * 28
            self.image_processor.max_pixels = 16384 * 28 * 28
        elif dataset in ['MathVista_MINI']:
            self.image_processor.min_pixels = 56 * 56
        elif dataset in ['MMVet', 'HallusionBench', 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11']:
            self.image_processor.min_pixels = 1280 * 28 * 28
        else:
            self.image_processor.min_pixels = 840 * 840

    def construct_messages(self, prompt: str, image_paths: List[str]) -> List[dict]:
        """Construct messages for the given prompt and image paths.

        Args:
            prompt (str): The prompt for the generation.
            image_paths (List[str]): A list of image paths.

        Returns:
            List[dict]: A list of elements constructed for current line.
        """
        content = []
        for image_path in image_paths:
            content.append(
                dict(type='image', image=image_path)
            )
        content.append(
            dict(type='text', text=prompt)
        )
        messages = [
            {
                'role': 'user',
                'content': content
            }
        ]
        return messages

    def generate_inner(self, message: List[dict], dataset: str = None) -> str:
        """Generate response for the given message.

        Args:
            message (List[dict]): A list of elements constructed for
                current line.
            dataset (str): The name of the dataset.

        Returns:
            str: The generated response.
        """
        self.set_image_processor(dataset)
        prompt, image_paths = self.message_to_promptimg(message)
        image_paths = [image_paths]
        if dataset == 'HallusionBench':
            prompt = prompt + \
                ' Please answer yes or no. Answer the question using a single word or phrase.'  # noqa
        elif dataset == 'MMVet':
            prompt = prompt + ' Answer this question in detail.'
        else:
            # use default setting
            pass
        pattern = r'<image \d+>'
        prompt = re.sub(pattern, '\n', prompt)
        messages = self.construct_messages(prompt, image_paths)
        generation_config = {
            'max_new_tokens': 1024,
            'temperature': 0.0,
            'top_p': 0.0,
            'num_beams': 1,
        }
        response = self.model.chat(
            messages, self.tokenizer, self.image_processor, generation_config
        )
        return response
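POINTSV15.set_image_processor above picks per-dataset pixel budgets for the image processor. A self-contained sketch of the same branching, not part of the file above; SimpleNamespace stands in for the real WePOINTS image processor and the dataset name is hypothetical:

from types import SimpleNamespace

image_processor = SimpleNamespace(min_pixels=None, max_pixels=None)  # stand-in processor
dataset = 'MMMU_DEV_VAL'  # hypothetical dataset name
if dataset in ['OCRBench']:
    image_processor.min_pixels = 280 * 280
elif dataset in ['MMMU_DEV_VAL']:
    image_processor.min_pixels = 1280 * 28 * 28
    image_processor.max_pixels = 16384 * 28 * 28
else:
    image_processor.min_pixels = 840 * 840
print(image_processor.min_pixels)  # 1003520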