"vscode:/vscode.git/clone" did not exist on "186f92042b5d16771ea416b442e842888037142f"
Commit 81028572 authored by luopl's avatar luopl
Browse files

init

parents
Pipeline #1722 canceled with stages
import torch
import sys
import os.path as osp
import warnings
from transformers import StoppingCriteriaList
from .base import BaseModel
class MiniGPT4(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self,
mode='v2',
root='/mnt/petrelfs/share_data/duanhaodong/MiniGPT-4/',
temperature=1,
max_out_len=512):
if root is None:
warnings.warn(
'Please set `root` to the local directory of MiniGPT-4, '
'cloned from https://github.com/Vision-CAIR/MiniGPT-4. '
)
if mode == 'v2':
cfg = 'minigptv2_eval.yaml'
elif mode == 'v1_7b':
cfg = 'minigpt4_7b_eval.yaml'
elif mode == 'v1_13b':
cfg = 'minigpt4_13b_eval.yaml'
else:
raise NotImplementedError
self.mode = mode
self.temperature = temperature
self.max_out_len = max_out_len
self.root = root
this_dir = osp.dirname(__file__)
self.cfg = osp.join(this_dir, 'misc', cfg)
sys.path.append(self.root)
from omegaconf import OmegaConf
from minigpt4.common.registry import registry
from minigpt4.conversation.conversation import StoppingCriteriaSub, CONV_VISION_Vicuna0, CONV_VISION_minigptv2
device = torch.cuda.current_device()
self.device = device
cfg_path = self.cfg
cfg = OmegaConf.load(cfg_path)
model_cfg = cfg.model
model_cfg.device_8bit = device
model_cls = registry.get_model_class(model_cfg.arch)
model = model_cls.from_config(model_cfg)
model = model.to(device)
model.eval()
vis_processor_cfg = cfg.datasets.cc_sbu_align.vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
self.model = model
self.vis_processor = vis_processor
self.CONV_VISION = CONV_VISION_minigptv2 if self.mode == 'v2' else CONV_VISION_Vicuna0
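# Stop-generation word ids for the v1 conversation template; in the LLaMA/Vicuna
# tokenizer these ids are assumed to decode to '###' ([835]) and '##' + '#'
# ([2277, 29937]), the turn separators used by CONV_VISION_Vicuna0.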
stop_words_ids = [[835], [2277, 29937]]
stop_words_ids = [torch.tensor(ids).to(device) for ids in stop_words_ids]
self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
def generate_inner(self, message, dataset=None):
from minigpt4.conversation.conversation import Chat
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if self.mode == 'v2':
chat = Chat(self.model, self.vis_processor, device=self.device)
else:
chat = Chat(self.model, self.vis_processor, device=self.device, stopping_criteria=self.stopping_criteria)
chat_state = self.CONV_VISION.copy()
img_list = []
_ = chat.upload_img(image_path, chat_state, img_list)
chat.encode_img(img_list)
chat.ask(prompt, chat_state)
with torch.inference_mode():
msg = chat.answer(conv=chat_state, img_list=img_list)[0]
return msg
import pprint
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
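# Illustrative note (not part of the upstream module): build_transform(448) maps any PIL
# image to a float tensor of shape (3, 448, 448), resized with bicubic interpolation and
# normalized with the ImageNet mean/std defined above.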
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
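# Worked example (illustrative): for a 1000x750 image (aspect ratio ~1.33) and the
# candidate grids built by dynamic_preprocess below with max_num=12, the closest grid is
# (4, 3), i.e. 4 x 3 tiles of image_size each; the area check only breaks ties between
# grids whose aspect ratios are equally close.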
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# enumerate candidate tile grids (i x j) within the min/max block budget
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images, target_aspect_ratio
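# Minimal usage sketch (hypothetical input, continuing the example above): a 1000x750 PIL
# image with image_size=448 and use_thumbnail=True is resized to 1792x1344, cut into
# 4 x 3 = 12 tiles of 448x448, and a 13th thumbnail of the whole image is appended:
#   tiles, grid = dynamic_preprocess(img, max_num=12, image_size=448, use_thumbnail=True)
#   # len(tiles) == 13, grid == (4, 3)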
def dynamic_preprocess2(image, min_num=1, max_num=12, prior_aspect_ratio=None, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# enumerate candidate tile grids (i x j) within the min/max block budget
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
new_target_ratios = []
for i in target_ratios:
if prior_aspect_ratio[0] % i[0] or prior_aspect_ratio[1] % i[1]:
new_target_ratios.append(i)
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, new_target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
def load_image(image, input_size=448, min_num=1, max_num=12):
image = image.convert('RGB')
transform = build_transform(input_size=input_size)
images, target_aspect_ratio = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True,
min_num=min_num, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values, target_aspect_ratio
def load_image2(image, input_size=448, min_num=1, max_num=12, target_aspect_ratio=None):
image = image.convert('RGB')
transform = build_transform(input_size=input_size)
images = dynamic_preprocess2(image, image_size=input_size, use_thumbnail=True, min_num=min_num,
max_num=max_num, prior_aspect_ratio=target_aspect_ratio)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
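# Together these loaders implement MiniMonkey's two-grid scheme: load_image picks a tile
# grid for the first pass and returns it as target_aspect_ratio, while load_image2 passes
# that grid as prior_aspect_ratio so dynamic_preprocess2 keeps only grids whose dimensions
# do not both divide the first grid; generate_vanilla below concatenates both tile sets.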
import warnings
from .base import BaseModel
from ..dataset import DATASET_TYPE
class MiniMonkey(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='mx262/MiniMonkey', **kwargs):
assert model_path is not None
self.model_path = model_path
self.model_type = torch.bfloat16
self.model = AutoModel.from_pretrained(
self.model_path,
low_cpu_mem_usage=True,
trust_remote_code=True).eval().to(self.model_type).cuda()
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True, use_fast=False)
self.kwargs = kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
torch.cuda.empty_cache()
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if dataset is None:
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if DATASET_TYPE(dataset) in ('MCQ', 'Y/N') or dataset == 'HallusionBench':
return self.generate_multichoice(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
def generate_vanilla(self, image_path, prompt):
image = Image.open(image_path).convert('RGB')
pixel_values, target_aspect_ratio = load_image(image, min_num=4, max_num=12)
pixel_values = pixel_values.cuda().to(self.model_type)
pixel_values2 = load_image2(image, min_num=3, max_num=7, target_aspect_ratio=target_aspect_ratio)
pixel_values2 = pixel_values2.cuda().to(self.model_type)
pixel_values = torch.cat([pixel_values2[:-1], pixel_values[:-1], pixel_values2[-1:]], 0)
generation_config = dict(do_sample=False, max_new_tokens=512)
response, history = self.model.chat(self.tokenizer, pixel_values,
target_aspect_ratio, prompt, generation_config,
history=None, return_history=True)
return response
def generate_multichoice(self, image_path, prompt):
return self.generate_vanilla(image_path, prompt)
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: instruct_vicuna13b
load_finetuned: False
load_pretrained: True
pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth"
finetuned: ""
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
# Q-Former
num_query_token: 32
# path to Vicuna checkpoint
llm_model: "Please set the path to your vicuna-13b-v1.1"
# generation configs
prompt: ""
preprocess:
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: instruct_vicuna7b
load_finetuned: False
load_pretrained: True
pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth"
finetuned: ""
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
# Q-Former
num_query_token: 32
# path to Vicuna checkpoint
llm_model: "Please set the path to your vicuna-7b-v1.1"
# generation configs
prompt: ""
preprocess:
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
model:
arch: minigpt4
model_type: pretrain_vicuna_7b
max_txt_len: 160
end_sym: "###"
low_resource: True
prompt_template: '###Human: {} ###Assistant: '
ckpt: "please set this value to the path of pretrained checkpoint"
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
freeze_qformer: True
# Q-Former
num_query_token: 32
# generation configs
prompt: ""
llama_model: "please set this value to the path of vicuna-13b-v0"
datasets:
cc_sbu_align:
vis_processor:
train:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
run:
task: image_text_pretrain
model:
arch: minigpt4
model_type: pretrain_vicuna_7b
max_txt_len: 160
end_sym: "###"
low_resource: True
prompt_template: '###Human: {} ###Assistant: '
ckpt: "please set this value to the path of pretrained checkpoint"
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
freeze_qformer: True
# Q-Former
num_query_token: 32
# generation configs
prompt: ""
llama_model: "please set this value to the path of vicuna-7b-v0"
datasets:
cc_sbu_align:
vis_processor:
train:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
run:
task: image_text_pretrain
model:
arch: minigpt_v2
model_type: pretrain
max_txt_len: 160
end_sym: "</s>"
low_resource: True
prompt_template: '[INST] {} [/INST]'
ckpt: "please set this value to the path of pretrained checkpoint"
lora_r: 64
lora_alpha: 16
# vit encoder
image_size: 448
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
# generation configs
prompt: ""
# LLM
llama_model: "please set this value to the path of llama2-chat-7b"
datasets:
cc_sbu_align:
vis_processor:
train:
name: "blip2_image_eval"
image_size: 448
text_processor:
train:
name: "blip_caption"
run:
task: image_text_pretrain
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings
from .base import BaseModel
from ..smp import *
class LLama3Mixsense(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='Zero-Vision/Llama-3-MixSenseV1_1', **kwargs):
assert model_path is not None
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings('ignore')
self.tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True
)
self.model = AutoModelForCausalLM.from_pretrained(
model_path, trust_remote_code=True
).to('cuda').eval()
self.kwargs = kwargs
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message)
input_ids = self.model.text_process(prompt, self.tokenizer).to(device='cuda')
image = Image.open(image_path).convert('RGB')
image_tensor = self.model.image_process([image]).to(dtype=self.model.dtype, device='cuda')
# generate
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
images=image_tensor,
max_new_tokens=2048,
use_cache=True,
eos_token_id=[
self.tokenizer.eos_token_id,
self.tokenizer.convert_tokens_to_ids(['<|eot_id|>'])[0],
],
)
return self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
import warnings
from .base import BaseModel
from PIL import Image
from ..smp import *
from ..dataset import DATASET_TYPE
import pandas as pd
import string
import torchvision.transforms as T
import transformers
from torchvision.transforms.functional import InterpolationMode
class MMAlaya(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='DataCanvas/MMAlaya', **kwargs):
assert model_path is not None
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
model_path, device_map='cpu', trust_remote_code=True
).eval()
# the model needs its tokenizer initialized before use
model.initialize_tokenizer(self.tokenizer)
self.model = model.cuda()
self.kwargs = kwargs
warnings.warn(
f'Following kwargs received: {self.kwargs}, will use as generation config. '
)
torch.cuda.empty_cache()
def generate_inner(self, message, dataset=None):
# read image
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
# tokenize the prompt and preprocess the image
input_ids, image_tensor, stopping_criteria = self.model.prepare_for_inference(
prompt, self.tokenizer, image, return_tensors='pt'
)
with torch.inference_mode():
output_ids = self.model.generate(
inputs=input_ids.cuda(),
images=image_tensor.cuda(),
do_sample=False,
max_new_tokens=512,
num_beams=1,
use_cache=True,
stopping_criteria=[stopping_criteria],
)
# drop the prompt tokens from the generated ids, then decode to text
input_token_len = input_ids.shape[1]
response = self.tokenizer.batch_decode(
output_ids[:, input_token_len:].cpu(),
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)[0].strip()
return response
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose(
[
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD),
]
)
return transform
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def dynamic_preprocess(
image, min_num=1, max_num=6, image_size=448, use_thumbnail=False
):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# enumerate candidate tile grids (i x j) within the min/max block budget
target_ratios = set(
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if i * j <= max_num and i * j >= min_num
)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size
)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size,
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
def load_image(image_file, input_size=448, max_num=6, upscale=False):
image = Image.open(image_file).convert('RGB')
if upscale:
image = image.resize((image.width * 2, image.height * 2), Image.BILINEAR)
transform = build_transform(input_size=input_size)
images = dynamic_preprocess(
image, image_size=input_size, use_thumbnail=True, max_num=max_num
)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
# Split the LLM layers of large InternVL2 variants (8B / 26B / 40B / Llama3-76B) across GPUs
def split_model(model_name):
import math
device_map = {}
num_gpus = torch.cuda.device_count()
rank, world_size = get_rank_and_world_size()
num_gpus = num_gpus // world_size
assert num_gpus >= 1
if num_gpus == 1:
return device_map
num_layers = {'InternVL2-8B': 32, 'InternVL2-26B': 48,
'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]
# The first GPU also hosts the ViT, so count it as only 0.2 of a GPU for LLM layers.
num_layers_per_gpu = math.ceil(num_layers / (num_gpus - 0.8))
num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.2)
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'language_model.model.layers.{layer_cnt}'] = rank + world_size * i
layer_cnt += 1
device_map['vision_model'] = rank
device_map['mlp1'] = rank
device_map['language_model.model.tok_embeddings'] = rank
device_map['language_model.model.embed_tokens'] = rank
device_map['language_model.output'] = rank
device_map['language_model.model.norm'] = rank
device_map['language_model.lm_head'] = rank
device_map[f'language_model.model.layers.{num_layers - 1}'] = rank
return device_map
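# Worked example (hypothetical single-process run on 2 GPUs with 'InternVL2-26B',
# 48 layers): num_layers_per_gpu becomes [8, 40], so layers 0-7 land on GPU 0 and
# layers 8-47 on GPU 1, after which the vision tower, mlp1, embeddings, norm, output
# head and the last LLM layer are pinned back to GPU 0.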
class MMAlaya2(BaseModel):
"""
This implementation fine-tunes 20 LoRA modules based on the InternVL-Chat-V1-5 model.
The fine-tuned LoRA modules are then merged with the InternVL-Chat-V1-5 model
using the PEFT model merging method, TIES.
The code is based on the implementation in `vlmeval/vlm/internvl_chat.py`.
"""
INSTALL_REQ = False
INTERLEAVE = True
def __init__(
self,
model_path='DataCanvas/MMAlaya2',
load_in_8bit=False,
**kwargs,
):
assert model_path is not None
assert version_cmp(transformers.__version__, '4.36.2', 'ge')
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True, use_fast=False
)
# Regular expression to match the pattern "Image" followed by a number, e.g. Image1
self.pattern = r'Image(\d+)'
# Replacement pattern to insert a hyphen between "Image" and the number, e.g. Image-1
self.replacement = r'Image-\1'
# Convert InternVL2 response to dataset format
# e.g. Image1 -> Image-1
# Regular expression to match the pattern "Image-" followed by a number
self.reverse_pattern = r'Image-(\d+)'
# Replacement pattern to remove the hyphen (Image-1 -> Image1)
self.reverse_replacement = r'Image\1'
device_map = split_model('InternVL2-26B')
if len(device_map) == 0:
device_map = {'': 'cuda'}
self.model = AutoModel.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
trust_remote_code=True,
load_in_8bit=load_in_8bit,
device_map=device_map
).eval()
self.image_size = self.model.config.vision_config.image_size
kwargs_default = dict(
do_sample=False, max_new_tokens=1024, top_p=None, num_beams=1
)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(
f'Following kwargs received: {self.kwargs}, will use as generation config. '
)
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
# No custom prompt for these multi-turn / real-world datasets
return False
else:
return True
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += (
'\n请直接回答选项字母。'
if cn_string(prompt)
else "\nAnswer with the option's letter from the given choices directly."
)
else:
prompt += (
'\n请直接回答问题。'
if cn_string(prompt)
else '\nAnswer the question directly.'
)
return prompt
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if dataset is not None and listinstr(['MME'], dataset):
question = line['question']
prompt = question + ' Answer the question using a single word or phrase.'
elif dataset is not None and listinstr(['HallusionBench'], dataset):
question = line['question']
prompt = (
question
+ ' Please answer yes or no. Answer the question using a single word or phrase.'
)
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
if listinstr(['MathVista', 'MathVision', 'MathVerse'], dataset):
prompt = line['question']
elif listinstr(['LLaVABench'], dataset):
question = line['question']
prompt = question + '\nAnswer this question in detail.'
elif listinstr(['MMVet'], dataset):
prompt = line['question']
else:
question = line['question']
prompt = question + '\nAnswer the question using a single word or phrase.'
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def set_max_num(self, dataset):
if dataset is not None and listinstr(['ChartQA_TEST', 'MMMU_DEV_VAL'], dataset):
self.max_num = 12
elif dataset is not None and listinstr(['DocVQA_VAL', 'DocVQA_TEST'], dataset):
self.max_num = 18
elif dataset is not None and listinstr(
['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench'], dataset
):
self.max_num = 24
elif dataset is not None and listinstr(
['MMBench-Video', 'Video-MME', 'Video'], dataset
):
self.max_num = 1
else:
self.max_num = 6
def generate_inner(self, message, dataset=None):
self.set_max_num(dataset)
image_num = len([x for x in message if x['type'] == 'image'])
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
if image_num > 1:
image_path = [x['value'] for x in message if x['type'] == 'image']
pixel_values_list = []
max_num = max(1, self.max_num // image_num)
for file_name in image_path:
pixel_values_list.append(load_image(file_name, max_num=max_num).cuda().to(torch.bfloat16))
pixel_values = torch.cat(pixel_values_list, dim=0)
elif image_num == 1:
image_path = [x['value'] for x in message if x['type'] == 'image'][0]
pixel_values = (
load_image(image_path, max_num=self.max_num).cuda().to(torch.bfloat16)
)
else:
pixel_values = None
with torch.no_grad():
response = self.model.chat(
self.tokenizer,
pixel_values=pixel_values,
question=prompt,
generation_config=self.kwargs,
# verbose=False,
)
return response
if __name__ == '__main__':
model = MMAlaya2(max_new_tokens=1024, do_sample=False)
response = model.generate_inner(
[
{'type': 'image', 'value': './assets/apple.jpg'},
{'type': 'text', 'value': '请详细描述一下这张图片。'},
]
)
print(response)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import warnings
from .base import BaseModel
from ..dataset import DATASET_TYPE
class Monkey(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='echo840/Monkey', **kwargs):
assert model_path is not None
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', trust_remote_code=True).eval()
self.model = model.cuda()
self.kwargs = kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
torch.cuda.empty_cache()
def generate_vanilla(self, image_path, prompt):
cur_prompt = f'<img>{image_path}</img> {prompt} Answer: '
input_ids = self.tokenizer(cur_prompt, return_tensors='pt', padding='longest')
attention_mask = input_ids.attention_mask
input_ids = input_ids.input_ids
output_ids = self.model.generate(
input_ids=input_ids.cuda(),
attention_mask=attention_mask.cuda(),
do_sample=False,
num_beams=1,
max_new_tokens=512,
min_new_tokens=1,
length_penalty=1,
num_return_sequences=1,
output_hidden_states=True,
use_cache=True,
pad_token_id=self.tokenizer.eod_id,
eos_token_id=self.tokenizer.eod_id,
)
response = self.tokenizer.decode(
output_ids[0][input_ids.size(1):].cpu(),
skip_special_tokens=True
).strip()
return response
def generate_multichoice(self, image_path, prompt):
cur_prompt = f'<img>{image_path}</img> \n {prompt} Answer: '
input_ids = self.tokenizer(cur_prompt, return_tensors='pt', padding='longest')
attention_mask = input_ids.attention_mask
input_ids = input_ids.input_ids
output_ids = self.model.generate(
input_ids=input_ids.cuda(),
attention_mask=attention_mask.cuda(),
do_sample=False,
num_beams=1,
max_new_tokens=10,
min_new_tokens=1,
length_penalty=1,
num_return_sequences=1,
output_hidden_states=True,
use_cache=True,
pad_token_id=self.tokenizer.eod_id,
eos_token_id=self.tokenizer.eod_id,
)
response = self.tokenizer.decode(
output_ids[0][input_ids.size(1):].cpu(),
skip_special_tokens=True
).strip()
return response
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if dataset is None:
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if DATASET_TYPE(dataset) in ('MCQ', 'Y/N') or dataset == 'HallusionBench':
return self.generate_multichoice(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
class MonkeyChat(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='echo840/Monkey-Chat', **kwargs):
assert model_path is not None
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', trust_remote_code=True).eval()
self.model = model.cuda()
self.kwargs = kwargs
self.tokenizer.padding_side = 'left'
self.tokenizer.pad_token_id = self.tokenizer.eod_id
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
torch.cuda.empty_cache()
def generate_vanilla(self, image_path, prompt):
cur_prompt = f'<img>{image_path}</img> {prompt} Answer: '
input_ids = self.tokenizer(cur_prompt, return_tensors='pt', padding='longest')
attention_mask = input_ids.attention_mask
input_ids = input_ids.input_ids
output_ids = self.model.generate(
input_ids=input_ids.cuda(),
attention_mask=attention_mask.cuda(),
do_sample=False,
num_beams=1,
max_new_tokens=512,
min_new_tokens=1,
length_penalty=1,
num_return_sequences=1,
output_hidden_states=True,
use_cache=True,
pad_token_id=self.tokenizer.eod_id,
eos_token_id=self.tokenizer.eod_id,
)
response = self.tokenizer.decode(
output_ids[0][input_ids.size(1):].cpu(),
skip_special_tokens=True
).strip()
return response
def generate_multichoice(self, image_path, prompt):
cur_prompt = f'<img>{image_path}</img> \n {prompt} Answer: '
input_ids = self.tokenizer(cur_prompt, return_tensors='pt', padding='longest')
attention_mask = input_ids.attention_mask
input_ids = input_ids.input_ids
output_ids = self.model.generate(
input_ids=input_ids.cuda(),
attention_mask=attention_mask.cuda(),
do_sample=False,
num_beams=1,
max_new_tokens=10,
min_new_tokens=1,
length_penalty=1,
num_return_sequences=1,
output_hidden_states=True,
use_cache=True,
pad_token_id=self.tokenizer.eod_id,
eos_token_id=self.tokenizer.eod_id,
)
response = self.tokenizer.decode(
output_ids[0][input_ids.size(1):].cpu(),
skip_special_tokens=True
).strip()
return response
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if dataset is None:
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if DATASET_TYPE(dataset) in ('MCQ', 'Y/N') or dataset == 'HallusionBench':
return self.generate_multichoice(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
import torch
import re
from PIL import Image
from abc import abstractproperty
import sys
import os.path as osp
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import copy
class Moondream1(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self,
model_path='vikhyatk/moondream1',
**kwargs):
try:
from transformers import AutoModelForCausalLM, CodeGenTokenizerFast as Tokenizer
except ImportError:
warnings.warn('Please install Transformers 4.36.2 ("pip install transformers==4.36.2") '
'and torchvision>=0.16 before using Moondream1.')
assert osp.exists(model_path) or splitlen(model_path) == 2
self.model = (
AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.float16)
.to(torch.device('cuda'))
)
self.tokenizer = Tokenizer.from_pretrained(model_path)
default_kwargs = dict(
max_new_tokens=512,
)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
torch.cuda.empty_cache()
def generate_inner(self, message, dataset=None):
images = []
prompt = ''
for s in message:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
prompt += s['value']
images = [Image.open(s) for s in images]
enc_image = self.model.encode_image(images[0])
prompt_wtmpl = f'<image>\n\nQuestion: {prompt}\n\nAnswer: '
answer = self.model.generate(
enc_image, prompt_wtmpl, eos_text='<END>', tokenizer=self.tokenizer, **self.kwargs)[0]
cleaned_answer = re.sub('<$', '', re.sub('END$', '', answer)).strip()
return cleaned_answer
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMMU'], dataset):
return False
if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
if dataset == 'MMVet':
prompt = question + '\nAnswer the question directly. '
elif DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = f'Hint: {hint}\n' if hint is not None else ''
prompt += f'{question}\n'
prompt += (
f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
if len(options) else 'Answer the question directly. '
)
else:
raise NotImplementedError
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
class Moondream2(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self,
model_path='vikhyatk/moondream2',
**kwargs):
try:
from transformers import AutoModelForCausalLM, AutoTokenizer
except ImportError:
warnings.warn('Please install Transformers 4.44.0 ("pip install transformers==4.44.0") '
'and torchvision>=0.16 before using Moondream2.')
assert osp.exists(model_path) or splitlen(model_path) == 2
flash_attn_flag = False
try:
import flash_attn
flash_attn_flag = True
except ImportError:
pass
if flash_attn_flag:
self.model = (
AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.float16,
attn_implementation='flash_attention_2')
.to(torch.device('cuda'))
)
else:
self.model = (
AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.float16)
.to(torch.device('cuda'))
)
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
default_kwargs = dict(
max_new_tokens=512,
)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
torch.cuda.empty_cache()
def generate_inner(self, message, dataset=None):
images = []
prompt = ''
for s in message:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
prompt += s['value']
images = [Image.open(s) for s in images]
enc_image = self.model.encode_image(images[0])
prompt_wtmpl = f'<image>\n\nQuestion: {prompt}\n\nAnswer: '
answer = self.model.generate(
enc_image, prompt_wtmpl, eos_text='<END>', tokenizer=self.tokenizer, **self.kwargs)[0]
cleaned_answer = re.sub('<$', '', re.sub('END$', '', answer)).strip()
return cleaned_answer
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMMU'], dataset):
return False
if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
if dataset == 'MMVet':
prompt = question + '\nAnswer the question directly. '
elif DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = f'Hint: {hint}\n' if hint is not None else ''
prompt += f'{question}\n'
prompt += (
f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
if len(options) else 'Answer the question directly. '
)
else:
raise NotImplementedError
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
import sys
import torch
from PIL import Image
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
class mPLUG_Owl2(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, model_path='MAGAer13/mplug-owl2-llama2-7b', **kwargs):
try:
from mplug_owl2.model.builder import load_pretrained_model
from mplug_owl2.mm_utils import get_model_name_from_path
except ImportError:
warnings.warn('Please install mPLUG-Owl2 (the mplug_owl2 package) before using this model. ')
sys.exit(-1)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(
model_path, None, model_name, load_8bit=False, load_4bit=False, device='cpu')
self.model = model.cuda()
self.device = self.model.device
self.image_processor = image_processor
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eos_token_id
self.tokenizer = tokenizer
self.context_len = context_len
kwargs_default = dict(
max_new_tokens=512, do_sample=False, num_beams=1,
min_new_tokens=1, length_penalty=1, num_return_sequences=1)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMMU'], dataset):
return False
if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
if dataset == 'MMVet':
prompt = question + '\nAnswer the question directly. '
elif DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = f'Hint: {hint}\n' if hint is not None else ''
prompt += f'{question}\n'
prompt += (
f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
if len(options) else 'Answer the question directly. '
)
else:
raise NotImplementedError
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def generate_inner(self, message, dataset=None):
from mplug_owl2.constants import IMAGE_TOKEN_INDEX
from mplug_owl2.mm_utils import process_images, tokenizer_image_token
kwargs = cp.deepcopy(self.kwargs)
if dataset in ['MMVet', 'LLaVABench']:
kwargs['length_penalty'] = 0
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
kwargs['length_penalty'] = 0
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
kwargs['max_new_tokens'] = 10
num_images = len([x for x in message if x['type'] == 'image'])
assert num_images >= 0
prompt_full = 'USER: '
images = []
if num_images == 1:
prompt, image = self.message_to_promptimg(message, dataset=dataset)
prompt_full += f'<|image|>{prompt} \nASSISTANT: '
images.append(image)
else:
for msg in message:
if msg['type'] == 'image':
images.append(msg['value'])
prompt_full += '<|image|>'
elif msg['type'] == 'text':
prompt_full += msg['value']
prompt_full += '\nASSISTANT: '
def preproc_image(fname):
image = Image.open(fname).convert('RGB')
max_edge = max(image.size)
image = image.resize((max_edge, max_edge))
return image
images = [preproc_image(fname) for fname in images]
image_tensor = process_images(images, self.image_processor)
image_tensor = image_tensor.to(self.device, dtype=torch.float16)
input_ids = tokenizer_image_token(
prompt_full, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids=input_ids,
images=image_tensor,
output_hidden_states=True,
use_cache=True,
**kwargs)
answer = self.tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
return answer.split('</s>')[0]
import torch
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
from torchvision import transforms
from transformers import AutoTokenizer, AutoModel
import io
import random
import numpy as np
import math
def get_frame_indices(num_frames, vlen, sample='rand', fix_start=None, input_fps=1, max_num_frames=-1):
if sample in ['rand', 'middle']:
acc_samples = min(num_frames, vlen)
# split the video into `acc_samples` intervals, and sample from each interval.
intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
ranges = []
for idx, interv in enumerate(intervals[:-1]):
ranges.append((interv, intervals[idx + 1] - 1))
if sample == 'rand':
try:
frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
except:
frame_indices = np.random.permutation(vlen)[:acc_samples]
frame_indices.sort()
frame_indices = list(frame_indices)
elif fix_start is not None:
frame_indices = [x[0] + fix_start for x in ranges]
elif sample == 'middle':
frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
else:
raise NotImplementedError
if len(frame_indices) < num_frames: # padded with last frame
padded_frame_indices = [frame_indices[-1]] * num_frames
padded_frame_indices[:len(frame_indices)] = frame_indices
frame_indices = padded_frame_indices
elif 'fps' in sample: # fps0.5, sequentially sample frames at 0.5 fps
output_fps = float(sample[3:])
duration = float(vlen) / input_fps
delta = 1 / output_fps # gap between frames, this is also the clip length each frame represents
frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
frame_indices = np.around(frame_seconds * input_fps).astype(int)
frame_indices = [e for e in frame_indices if e < vlen]
if max_num_frames > 0 and len(frame_indices) > max_num_frames:
frame_indices = frame_indices[:max_num_frames]
# frame_indices = np.linspace(0 + delta / 2, duration + delta / 2, endpoint=False, num=max_num_frames)
elif 'interval' in sample:
if num_frames == 1:
frame_indices = [random.randint(0, vlen - 1)]
else:
# transform FPS
interval = 8
clip_length = num_frames * interval * input_fps / 30
max_idx = max(vlen - clip_length, 0)
start_idx = random.uniform(0, max_idx)
end_idx = start_idx + clip_length - 1
frame_indices = torch.linspace(start_idx, end_idx, num_frames)
frame_indices = torch.clamp(frame_indices, 0, vlen - 1).long().tolist()
else:
raise ValueError
return frame_indices
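# Worked example (illustrative): with sample='middle', num_frames=8 and vlen=80 the video
# is split into 8 equal ranges and the midpoint of each is taken, giving indices
# [4, 14, 24, 34, 44, 54, 64, 74]; with sample='fps0.5' frames are instead taken every
# 2 seconds of video time.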
def get_frame_indices_start_end(num_frames, vlen, fps, start_time, end_time):
start_idx = max(int(fps * start_time), 0) if start_time is not None and not math.isnan(start_time) else 0
end_idx = min(int(fps * end_time), vlen) if end_time is not None and not math.isnan(end_time) else vlen
clip_len = end_idx - start_idx
acc_samples = min(num_frames, clip_len)
# split the video into `acc_samples` intervals, and sample from each interval.
intervals = np.linspace(start=start_idx, stop=end_idx, num=acc_samples + 1).astype(int)
ranges = []
for idx, interv in enumerate(intervals[:-1]):
ranges.append((interv, intervals[idx + 1] - 1))
try:
frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
except:
frame_indices = np.random.permutation(list(range(start_idx, end_idx)))[:acc_samples]
frame_indices.sort()
frame_indices = list(frame_indices)
if len(frame_indices) < num_frames: # padded with last frame
padded_frame_indices = [frame_indices[-1]] * num_frames
padded_frame_indices[:len(frame_indices)] = frame_indices
frame_indices = padded_frame_indices
return frame_indices
def read_frames_decord(
video_path, width=None, height=None,
num_frames=8, sample='rand', fix_start=None,
max_num_frames=-1, start_time=None, end_time=None
):
import decord
decord.bridge.set_bridge('torch')
if video_path.lower().endswith('.webm'):
# workaround for .webm files: a large or auto num_threads can cause decode errors.
num_threads = 2
else:
num_threads = 0
if width is not None and height is not None:
video_reader = decord.VideoReader(video_path, width=width, height=height, num_threads=num_threads)
else:
video_reader = decord.VideoReader(video_path, num_threads=num_threads)
vlen = len(video_reader)
fps = video_reader.get_avg_fps()
if start_time and end_time:
frame_indices = get_frame_indices_start_end(
num_frames, vlen, fps, start_time, end_time
)
else:
frame_indices = get_frame_indices(
num_frames, vlen, sample=sample, fix_start=fix_start,
input_fps=fps, max_num_frames=max_num_frames
)
frames = video_reader.get_batch(frame_indices)
if isinstance(frames, torch.Tensor):
frames = frames.numpy() # (T, H, W, C), torch.uint8
else:
print(frames.shape)
frames = frames.asnumpy()
timestamp = {
'num_frames': len(frame_indices),
'timestamp': ', '.join([str(round(f / fps, 1)) for f in frame_indices])
}
return frames, timestamp
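# Illustrative usage (hypothetical path): read_frames_decord('clip.mp4', num_frames=8,
# sample='middle') returns an (8, H, W, 3) uint8 frame array together with a dict holding
# the number of sampled frames and their timestamps in seconds.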
class mPLUG_Owl3(BaseModel):
# No separate model module is required, but the dependencies must be met.
# https://github.com/X-PLUG/mPLUG-Owl/blob/main/mPLUG-Owl3/requirements.txt
INSTALL_REQ = True
INTERLEAVE = True
INSTALL_REQ_TXT = 'https://github.com/X-PLUG/mPLUG-Owl/blob/main/mPLUG-Owl3/requirements.txt'
def __init__(self, model_path=None, **kwargs):
assert model_path is not None
self.tokenizer = AutoTokenizer.from_pretrained(
model_path
)
self.model = AutoModel.from_pretrained(
model_path,
attn_implementation='sdpa',
torch_dtype=torch.half,
trust_remote_code=True
)
self.model.eval().cuda()
self.processor = self.model.init_processor(self.tokenizer)
self.logger = get_logger('mPLUG_Owl3')
if self.INSTALL_REQ:
self.logger.info(
f'Please remember to meet the requirements first\n'
f'Here: {self.INSTALL_REQ_TXT}'
)
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMMU'], dataset):
return False
if listinstr(['MVBench', 'MMVet'], dataset):
return True
return False
def save_video_into_images(self, line, num_frames=16, dataset_class=None):
video_url = {
'video': osp.join(line['prefix'], line['video']),
'num_frames': num_frames,
'bound': line.get('bound', None)
}
if osp.isdir(video_url['video']):
frame_paths = []
max_frame = len(os.listdir(video_url['video']))
fps = 3
if video_url['bound']:
start, end = line['start'], line['end']
else:
start, end = -100000, 100000
start_idx = max(1, round(start * fps))
end_idx = min(round(end * fps), max_frame)
seg_size = float(end_idx - start_idx) / num_frames
frame_indices = np.array([
int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
for idx in range(num_frames)
])
for frame_index in frame_indices:
img = os.path.join(video_url['video'], f'{frame_index:05d}.jpg')
frame_paths.append(img)
return frame_paths
if isinstance(video_url, dict):
if video_url['bound']:
start_time = line['start']
end_time = line['end']
else:
start_time = None
end_time = None
num_frames = video_url.get('num_frames', num_frames)
video_url = video_url['video']
else:
start_time = None
end_time = None
video_url = str(video_url)
if not osp.exists(video_url): # for MVBench_MP4
video_url = osp.join(dataset_class.data_root, video_url)
video, timestamp = read_frames_decord(
video_url, num_frames=num_frames, sample='middle', start_time=start_time, end_time=end_time
)
to_pil = transforms.ToPILImage()
frames = [to_pil(video[ti]) for ti in range(video.shape[0])]
lmu_root = LMUDataRoot()
frame_root = osp.join(lmu_root, 'images', dataset_class.dataset_name, 'mplug_owl3')
frame_root = osp.join(frame_root, video_url.split('/')[-1].split('.')[0])
os.makedirs(frame_root, exist_ok=True)
frame_tmpl = 'frame-{}-of-{}.jpg'
frame_paths = [osp.join(frame_root, frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)]
for im, pth in zip(frames, frame_paths):
if not osp.exists(pth):
im.save(pth)
return frame_paths
# Currently the same as in mPLUG_Owl2
def build_prompt(self, line, dataset=None, num_frames=16, video_llm=False):
if not isinstance(dataset, str):
dataset_class = dataset
dataset = dataset_class.dataset_name
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
if dataset_class.MODALITY == 'VIDEO':
if listinstr(['MVBench'], dataset):
tgt_path = self.save_video_into_images(line, num_frames, dataset_class)
else:
tgt_path = dataset_class.save_video_into_images(line, num_frames)
if not isinstance(line['candidates'], list):
line['candidates'] = eval(line['candidates'])
for idx, c in enumerate(line['candidates']):
line[chr(ord('A') + idx)] = c
else:
tgt_path = self.dump_image(line, dataset)
question = line['question']
if dataset == 'MMVet':
prompt = question + '\nAnswer the question directly. '
elif listinstr(['MCQ', 'Video-MCQ'], DATASET_TYPE(dataset)):
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = f'Hint: {hint}\n' if hint is not None else ''
prompt += f'{question}\n'
prompt += (
f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
if len(options) else 'Answer the question directly. '
)
else:
raise NotImplementedError
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def preproc_image(self, fname, dataset=None):
from PIL import Image
image = Image.open(fname).convert('RGB')
# resize to max_size
max_size = 448 * 16
if max(image.size) > max_size and not listinstr(['MVBench'], dataset):
w, h = image.size
if w > h:
new_w = max_size
new_h = int(h * max_size / w)
else:
new_h = max_size
new_w = int(w * max_size / h)
image = image.resize((new_w, new_h), resample=Image.BICUBIC)
return image
def generate_inner(self, message, dataset=None):
num_images = len([x for x in message if x['type'] == 'image'])
assert num_images >= 0
images = []
prompt_full = ''
for msg in message:
if msg['type'] == 'image':
images.append(msg['value'])
prompt_full += '<|image|>'
elif msg['type'] == 'text':
prompt_full += msg['value']
needed_messages = [
{'role': 'user', 'content': prompt_full},
{'role': 'assistant', 'content': ''}
]
images = [self.preproc_image(fname, dataset) for fname in images]
inputs = self.processor(needed_messages, images=images, videos=None, cut_enable=False)
inputs.to('cuda')
if listinstr(['MVBench'], dataset):
inputs.update({
'tokenizer': self.tokenizer,
'max_new_tokens': 100,
'decode_text': True,
'do_sample': True,
'top_k': 1,
})
else:
inputs.update({
'tokenizer': self.tokenizer,
'max_new_tokens': 1024,
'decode_text': True,
})
g = self.model.generate(**inputs)
return g[0]
import torch
from PIL import Image
import re
from transformers import AutoModel, AutoProcessor
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
class OmChat(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def __init__(self, model_path='omlab/omchat-v2.0-13B-single-beta_hf', **kwargs):
# It is recommended to install `transformers==4.44.0`
assert model_path is not None
self.model_path = model_path
print(f'load from {self.model_path}')
model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True, torch_dtype=torch.float16)
self.model = model.cuda().eval()
self.kwargs = kwargs
self.processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=True)
torch.cuda.empty_cache()
# system prompt
self.default_system_prompt = 'You are a helpful assistant. Focus on accuracy and reliability in your response.'
self.new1_system_prompt = 'You are a helpful assistant.'
self.new2_system_prompt = (
'Read the following question carefully, '
'solve it step by step, '
'and then output the final answer in the format of '
"'Answer: single number or single word or phrase'.\n\n"
)
# suffix_prompt for MCQ
self.mcq_suffix_prompt_en = 'Please select the correct answer from the options above. \n'
self.mcq_suffix_prompt_cn = '请直接回答选项字母。\n'
# suffix_prompt for Y/N
self.yorn_suffix_prompt = ' Please answer yes or no. Answer the question using a single word or phrase.'
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) in ('MCQ', 'Y/N'):
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if isinstance(line, int):
line = self.data.iloc[line]
question = line['question']
if DATASET_TYPE(dataset) == 'MCQ':
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
if not dataset.startswith('MMMU_'):
if not cn_string(prompt):
prompt += self.mcq_suffix_prompt_en
else:
prompt += self.mcq_suffix_prompt_cn
elif DATASET_TYPE(dataset) == 'Y/N':
prompt = question + self.yorn_suffix_prompt
print(DATASET_TYPE(dataset))
message = []
if isinstance(tgt_path, list):
message.extend([dict(type='image', value=p) for p in tgt_path])
else:
message = [dict(type='image', value=tgt_path)]
message.append(dict(type='text', value=prompt))
return message
def message_to_promptimg(self, message, dataset=None):
if dataset is None or listinstr(['MMMU'], dataset):
prompt = '\n'.join([
re.sub(r'<image\s*\d+>', '<image>', x['value'])
for x in message
if x['type'] == 'text'
])
image = [x['value'] for x in message if x['type'] == 'image']
else:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = [x['value'] for x in message if x['type'] == 'image']
return prompt, image
def generate_inner(self, message, dataset=None):
def replace_last_dot(input_string):
if input_string.endswith('.'):
return input_string[:-1]
else:
return input_string
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = [Image.open(img_path).convert('RGB') for img_path in image_path]
default_kwargs = dict(
max_new_tokens=1024,
do_sample=False,
temperature=0.0,
top_p=1)
if dataset is not None and listinstr(['MathVista_MINI'], dataset):
system_prompt = self.new2_system_prompt
elif dataset is not None and listinstr(['MMMU_DEV_VAL', 'MMStar'], dataset):
system_prompt = self.new1_system_prompt
else:
system_prompt = self.default_system_prompt
inputs = self.processor(text=prompt, system_prompt=system_prompt, images=image, return_tensors='pt').to('cuda')
default_kwargs.update(self.kwargs)
with torch.inference_mode():
output_ids = self.model.generate(
**inputs,
eos_token_id=self.model.generation_config.eos_token_id,
**default_kwargs
)
res = self.processor.tokenizer.decode(output_ids[0, inputs.input_ids.shape[1]:]).strip()
if '<|im_end|>' in res:
res = res.split('<|im_end|>')[0].strip()
if dataset != 'MMMU_DEV_VAL':
if res.startswith('Answer: '):
res = res[len('Answer: '):]
match = re.search(r'\nThe answer is:(.+)', res)
if match:
res = match.group(1).strip()
# for OCRBench
doc_match = re.search(r'<doc>(.*?)<\/doc>', res)
if doc_match:
res = doc_match.group(1).strip()
res = replace_last_dot(res)
return res
import torch
from PIL import Image
from transformers import AutoTokenizer
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
DEFAULT_IMAGE_TOKEN = '<image>'
DEFAULT_IMAGE_PATCH_TOKEN = '<im_patch>'
DEFAULT_IM_START_TOKEN = '<im_start>'
DEFAULT_IM_END_TOKEN = '<im_end>'
def init_omni_lmm(model_path):
from omnilmm.model.omnilmm import OmniLMMForCausalLM
from omnilmm.utils import disable_torch_init
from omnilmm.model.utils import build_transform
torch.backends.cuda.matmul.allow_tf32 = True
disable_torch_init()
tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=2048)
model = OmniLMMForCausalLM.from_pretrained(
model_path, tune_clip=True, torch_dtype=torch.bfloat16, device_map='cpu'
)
model = model.to(device='cuda', dtype=torch.bfloat16)
image_processor = build_transform(
is_train=False, input_size=model.model.config.image_size, std_mode='OPENAI_CLIP'
)
mm_use_im_start_end = getattr(model.config, 'mm_use_im_start_end', False)
assert mm_use_im_start_end
tokenizer.add_tokens(
[DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN],
special_tokens=True,
)
vision_config = model.model.vision_config
vision_config.im_patch_token = tokenizer.convert_tokens_to_ids(
[DEFAULT_IMAGE_PATCH_TOKEN]
)[0]
vision_config.use_im_start_end = mm_use_im_start_end
vision_config.im_start_token, vision_config.im_end_token = (
tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
)
image_token_len = model.model.config.num_query
return model, image_processor, image_token_len, tokenizer
def expand_question_into_multimodal(
question_text, image_token_len, im_st_token, im_ed_token, im_patch_token
):
if '<image>' in question_text[0]['content']:
question_text[0]['content'] = question_text[0]['content'].replace(
'<image>', im_st_token + im_patch_token * image_token_len + im_ed_token
)
else:
question_text[0]['content'] = (
im_st_token
+ im_patch_token * image_token_len
+ im_ed_token
+ '\n'
+ question_text[0]['content']
)
return question_text
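# Example (illustrative): with image_token_len=3 and the tokens defined above, a
# first-turn content of 'What is this? <image>' becomes
# 'What is this? <im_start><im_patch><im_patch><im_patch><im_end>'.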
def wrap_question_for_omni_lmm(question, image_token_len, tokenizer):
from omnilmm.train.train_utils import omni_preprocess
question = expand_question_into_multimodal(
question,
image_token_len,
DEFAULT_IM_START_TOKEN,
DEFAULT_IM_END_TOKEN,
DEFAULT_IMAGE_PATCH_TOKEN,
)
conversation = question
data_dict = omni_preprocess(
sources=[conversation], tokenizer=tokenizer, generation=True
)
data_dict = dict(input_ids=data_dict['input_ids'][0], labels=data_dict['labels'][0])
return data_dict
class OmniLMM12B(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, model_path, root, **kwargs) -> None:
sys.path.append(root)
model, img_processor, image_token_len, tokenizer = init_omni_lmm(model_path)
self.model = model
self.image_token_len = image_token_len
self.image_transform = img_processor
self.tokenizer = tokenizer
self.model.eval()
default_kwargs = dict(
max_new_tokens=512,
do_sample=False,
output_scores=True,
return_dict_in_generate=True,
repetition_penalty=1.1,
)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
torch.cuda.empty_cache()
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        try:
            image = Image.open(image_path).convert('RGB')
        except Exception:
            logger = get_logger('OmniLMM Inference')
            logger.error('Image Decode Error')
            return 'Image Decode Error'
msgs = [dict(role='user', content=prompt)]
input_ids = wrap_question_for_omni_lmm(
msgs, self.image_token_len, self.tokenizer
)['input_ids']
input_ids = torch.as_tensor(input_ids)
image = self.image_transform(image)
with torch.inference_mode():
output = self.model.generate_vllm(
input_ids=input_ids.unsqueeze(0).cuda(),
images=image.unsqueeze(0).half().cuda(),
**self.kwargs,
)
response = self.tokenizer.decode(
output.sequences[0], skip_special_tokens=True
)
response = response.strip()
return response
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'{question}\n'
if len(options):
prompt += options_prompt
prompt = (
"""
Study the image carefully and pick the option associated with the correct answer.
Focus solely on selecting the option and avoid including any other content.\n
"""
+ prompt
)
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
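# Minimal usage sketch for OmniLMM12B (the model id and paths are illustrative
# assumptions; `root` must point to a local clone of the OmniLMM code base so that
# the `omnilmm` package becomes importable):
#
#     model = OmniLMM12B(model_path='openbmb/OmniLMM-12B', root='/path/to/OmniLMM')
#     message = [dict(type='image', value='demo.jpg'),
#                dict(type='text', value='What is shown in this image?')]
#     print(model.generate_inner(message))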
import sys
import torch
from PIL import Image
import os.path as osp
import warnings
from .base import BaseModel
from ..smp import splitlen, get_cache_path
from huggingface_hub import snapshot_download
class OpenFlamingo(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def __init__(self,
name,
mpt_pth=None,
ckpt_pth=None,
**kwargs):
if mpt_pth is None:
warnings.warn(
'Please set `mpt_pth` to the directory of MPT-7B, which is cloned from here: '
'https://huggingface.co/mosaicml/mpt-7b. '
)
sys.exit(-1)
if ckpt_pth is None:
warnings.warn(
'Please set `ckpt_pth` to the openflamingo ckpt, which is the `checkpoint.pt` file downloaded '
'from: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b/tree/main. '
)
sys.exit(-1)
else:
if osp.exists(ckpt_pth):
if ckpt_pth.endswith('checkpoint.pt'):
pass
elif osp.isdir(ckpt_pth):
ckpt_pth = osp.join(ckpt_pth, 'checkpoint.pt')
if not osp.exists(ckpt_pth):
sys.exit(-1)
elif splitlen(ckpt_pth, '/') == 2:
cache_path = get_cache_path(ckpt_pth)
if cache_path is None:
snapshot_download(ckpt_pth)
cache_path = get_cache_path(ckpt_pth)
if cache_path is None:
sys.exit(-1)
else:
ckpt_pth = osp.join(cache_path, 'checkpoint.pt')
self.name = name
assert name in ['v2']
self.mpt_pth = mpt_pth
        try:
            from open_flamingo import create_model_and_transforms
        except ImportError as err:
            raise ImportError('Please first install open_flamingo to use OpenFlamingo. ') from err
model, image_processor, tokenizer = create_model_and_transforms(
clip_vision_encoder_path='ViT-L-14',
clip_vision_encoder_pretrained='openai',
lang_encoder_path=mpt_pth,
tokenizer_path=mpt_pth,
cross_attn_every_n_layers=4)
ckpt = torch.load(ckpt_pth)
model.load_state_dict(ckpt, strict=False)
torch.cuda.empty_cache()
self.model = model.eval().cuda()
self.tokenizer = tokenizer
self.tokenizer.padding_side = 'left'
self.image_proc = image_processor
kwargs_default = dict(max_new_tokens=512, num_beams=3)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
def generate_inner(self, message, dataset=None):
vision_x = []
prompt = ''
for msg in message:
if msg['type'] == 'image':
img = Image.open(msg['value'])
vision_x.append(self.image_proc(img).unsqueeze(0))
prompt += '<image>'
elif msg['type'] == 'text':
prompt += msg['value']
prompt += 'Answer: '
vision_x = torch.cat(vision_x, dim=0) if len(vision_x) > 1 else vision_x[0]
vision_x = vision_x.unsqueeze(1).unsqueeze(0)
lang_x = self.tokenizer([prompt], return_tensors='pt')
generated_text = self.model.generate(
vision_x=vision_x.cuda(),
lang_x=lang_x['input_ids'].cuda(),
attention_mask=lang_x['attention_mask'].cuda(),
**self.kwargs)
generated_text = self.tokenizer.decode(generated_text[0])
text = generated_text[len(prompt):].split('<|endofchunk|>')[0]
return text
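# Minimal usage sketch for OpenFlamingo (local paths are hypothetical):
#
#     model = OpenFlamingo('v2',
#                          mpt_pth='/path/to/mpt-7b',
#                          ckpt_pth='/path/to/OpenFlamingo-9B-vitl-mpt7b/checkpoint.pt')
#     message = [dict(type='image', value='demo.jpg'),
#                dict(type='text', value='What is shown in this image?')]
#     print(model.generate_inner(message))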
import torch
from transformers import AutoModelForCausalLM
from .base import BaseModel
from ..dataset import DATASET_TYPE
from ..smp import *
class Ovis(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='AIDC-AI/Ovis1.5-Llama3-8B', **kwargs):
assert model_path is not None
        # Recommended: install `transformers==4.43.2` and `torch==2.1.2`.
self.model_path = model_path
self.device = torch.cuda.current_device()
self.dtype = torch.bfloat16
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path,
torch_dtype=self.dtype,
multimodal_max_length=8192,
trust_remote_code=True
)
self.model = self.model.eval().to(device=self.device)
self.eos_token_id = self.model.generation_config.eos_token_id
self.text_tokenizer = self.model.get_text_tokenizer()
self.pad_token_id = self.text_tokenizer.pad_token_id
self.visual_tokenizer = self.model.get_visual_tokenizer()
self.conversation_formatter = self.model.get_conversation_formatter()
self.image_placeholder = '<image>'
self.gen_kwargs = dict(
max_new_tokens=1024,
do_sample=False,
top_p=None,
top_k=None,
temperature=None,
repetition_penalty=None,
eos_token_id=self.eos_token_id,
pad_token_id=self.pad_token_id,
use_cache=True
)
def use_custom_prompt(self, dataset):
if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'Y/N':
prompt = self.built_yorn_prompt(line, dataset)
elif DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
else:
raise RuntimeError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
        # interleaved dataset: split MMMU-style multi-image messages
if dataset.startswith('MMMU_'):
from .. import MMMUDataset
message = MMMUDataset.split_MMMU(message)
return message
def built_yorn_prompt(self, line, dataset=None):
prompt = line['question']
if listinstr(['HallusionBench'], dataset):
prompt += ' Please answer yes or no.'
prompt += '\n请用单个词或短语回答问题。' if cn_string(
prompt) else '\nAnswer the question using a single word or phrase.'
return prompt
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += '\n请直接回答选项字母。' if cn_string(
prompt) else "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
return prompt
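    # Example of a prompt produced by build_multi_choice_prompt (hypothetical line contents):
    #   What is the capital of France?
    #   A. Paris
    #   B. Rome
    #   Answer with the option's letter from the given choices directly.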
def generate_inner(self, message, dataset=None):
prompt, input_ids, attention_mask, pixel_values = self.prepare_inputs(message)
output_ids = self.model.generate(
input_ids,
pixel_values=pixel_values,
attention_mask=attention_mask,
**self.gen_kwargs
)
response = self.text_tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
return response
def prepare_inputs(self, message):
# build query
images = [x['value'] for x in message if x['type'] == 'image']
texts = [x['value'] for x in message if x['type'] == 'text']
if len(images) == 0:
query = '\n'.join(texts)
elif len(images) == 1 and len(texts) == 1:
query = self.image_placeholder + '\n' + texts[0]
        else:  # interleaved sample
chunks = [x['value'] if x['type'] == 'text' else self.image_placeholder for x in message]
query = '\n'.join(chunks)
# format conversation
prompt, input_ids = self.conversation_formatter.format_query(query)
attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id)
input_ids = input_ids.unsqueeze(0).to(device=self.device)
attention_mask = attention_mask.unsqueeze(0).to(device=self.device)
# preprocess images
if len(images) == 0:
pixel_values = [None]
else:
preprocessed_images = [self.visual_tokenizer.preprocess_image(Image.open(image)) for image in images]
pixel_values = [torch.cat(preprocessed_images, dim=0).to(device=self.device, dtype=self.dtype)]
return prompt, input_ids, attention_mask, pixel_values
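# Minimal usage sketch for Ovis (default checkpoint AIDC-AI/Ovis1.5-Llama3-8B; the
# image file name is hypothetical):
#
#     model = Ovis()
#     message = [dict(type='image', value='demo.jpg'),
#                dict(type='text', value='Describe the image.')]
#     print(model.generate_inner(message))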
class Ovis1_6(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='AIDC-AI/Ovis1.6-Gemma2-9B', **kwargs):
assert model_path is not None
        # Recommended: python==3.10, transformers==4.44.2, torch==2.2.0, numpy==1.24.3.
self.model_path = model_path
self.device = torch.cuda.current_device()
self.dtype = torch.bfloat16
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path,
torch_dtype=self.dtype,
multimodal_max_length=8192,
trust_remote_code=True
)
self.model = self.model.eval().to(device=self.device)
self.eos_token_id = self.model.generation_config.eos_token_id
self.text_tokenizer = self.model.get_text_tokenizer()
self.pad_token_id = self.text_tokenizer.pad_token_id
self.visual_tokenizer = self.model.get_visual_tokenizer()
self.max_partition = 9
self.image_placeholder = '<image>'
self.gen_kwargs = dict(
max_new_tokens=1024,
do_sample=False,
top_p=None,
top_k=None,
temperature=None,
repetition_penalty=None,
eos_token_id=self.eos_token_id,
pad_token_id=self.pad_token_id,
use_cache=True
)
def use_custom_prompt(self, dataset):
if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def built_yorn_prompt(self, line, dataset=None):
prompt = line['question'] + '\nAnswer the question using a single word or phrase.'
return prompt
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += "\nAnswer with the option's letter from the given choices directly."
return prompt
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'Y/N':
prompt = self.built_yorn_prompt(line, dataset)
elif DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
else:
raise RuntimeError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
        # interleaved dataset: split MMMU-style multi-image messages
if dataset.startswith('MMMU_'):
from .. import MMMUDataset
message = MMMUDataset.split_MMMU(message)
return message
def generate_inner(self, message, dataset=None):
prompt, input_ids, attention_mask, pixel_values = self.prepare_inputs(message)
output_ids = self.model.generate(
input_ids,
pixel_values=pixel_values,
attention_mask=attention_mask,
**self.gen_kwargs
)
response = self.text_tokenizer.decode(output_ids[0], skip_special_tokens=True)
return response
def prepare_inputs(self, message):
# build query
images = [x['value'] for x in message if x['type'] == 'image']
texts = [x['value'] for x in message if x['type'] == 'text']
if len(images) == 0:
query = '\n'.join(texts)
elif len(images) == 1 and len(texts) == 1:
query = self.image_placeholder + '\n' + texts[0]
else: # interleaved sample
chunks = [x['value'] if x['type'] == 'text' else self.image_placeholder for x in message]
query = '\n'.join(chunks)
# preprocess inputs
prompt, input_ids, pixel_values = self.model.preprocess_inputs(
query, [Image.open(image) for image in images], max_partition=self.max_partition
)
# move to self.device
attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id)
input_ids = input_ids.unsqueeze(0).to(device=self.device)
attention_mask = attention_mask.unsqueeze(0).to(device=self.device)
pixel_values = [
pixel_values.to(device=self.device, dtype=self.dtype) if pixel_values is not None else None
]
return prompt, input_ids, attention_mask, pixel_values
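# Minimal usage sketch for Ovis1_6 with an interleaved multi-image message (file
# names are hypothetical):
#
#     model = Ovis1_6()  # loads AIDC-AI/Ovis1.6-Gemma2-9B by default
#     message = [dict(type='text', value='Compare these two charts:'),
#                dict(type='image', value='chart_a.png'),
#                dict(type='image', value='chart_b.png')]
#     print(model.generate_inner(message))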
from PIL import Image
import torch
from .base import BaseModel
from ..smp import *
class PaliGemma(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='google/paligemma-3b-mix-448', **kwargs):
        try:
            from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
        except ImportError:
            warnings.warn('Please install the latest version of transformers to use PaliGemma. ')
            sys.exit(-1)
model = PaliGemmaForConditionalGeneration.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
device_map='cpu',
revision='bfloat16',
).eval()
self.model = model.cuda()
self.processor = AutoProcessor.from_pretrained(model_path)
self.kwargs = kwargs
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
model_inputs = self.processor(
text=prompt, images=image, return_tensors='pt'
).to('cuda')
input_len = model_inputs['input_ids'].shape[-1]
with torch.inference_mode():
generation = self.model.generate(
**model_inputs, max_new_tokens=512, do_sample=False
)
generation = generation[0][input_len:]
res = self.processor.decode(generation, skip_special_tokens=True)
return res
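# Minimal usage sketch for PaliGemma (default google/paligemma-3b-mix-448 checkpoint;
# the task-style prompt 'caption en' and the image file are illustrative assumptions):
#
#     model = PaliGemma()
#     message = [dict(type='image', value='demo.jpg'),
#                dict(type='text', value='caption en')]
#     print(model.generate_inner(message))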
import sys
import torch
import os.path as osp
import warnings
from .base import BaseModel
class PandaGPT(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, name, root=None, **kwargs):
if root is None:
            warnings.warn(
                'Please set `root` to the PandaGPT code directory, which is cloned from here: '
                'https://github.com/yxuansu/PandaGPT. '
            )
sys.exit(-1)
assert name == 'PandaGPT_13B'
self.name = name
sys.path.append(osp.join(root, 'code'))
        try:
            from model.openllama import OpenLLAMAPEFTModel
        except ImportError as err:
            raise ImportError(
                'Please first install PandaGPT and set the root path to use PandaGPT, '
                'which is cloned from here: https://github.com/yxuansu/PandaGPT. '
            ) from err
self.args = {
'model': 'openllama_peft',
'imagebind_ckpt_path': osp.join(root, 'pretrained_ckpt/imagebind_ckpt'),
'vicuna_ckpt_path': osp.join(root, 'pretrained_ckpt/vicuna_ckpt/13b_v0'),
'delta_ckpt_path': osp.join(root, 'pretrained_ckpt/pandagpt_ckpt/13b/pytorch_model.pt'),
'stage': 2,
'max_tgt_len': 512,
'lora_r': 32,
'lora_alpha': 32,
'lora_dropout': 0.1,
}
model = OpenLLAMAPEFTModel(**self.args)
delta_ckpt = torch.load(self.args['delta_ckpt_path'], map_location=torch.device('cpu'))
model.load_state_dict(delta_ckpt, strict=False)
torch.cuda.empty_cache()
self.model = model.eval().half().cuda()
kwargs_default = {'top_p': 0.9, 'do_sample': False, 'max_tgt_len': 128, 'temperature': 0.001}
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
struct = {
'prompt': prompt,
'image_paths': [image_path],
'audio_paths': [],
'video_paths': [],
'thermal_paths': [],
'modality_embeds': []
}
struct.update(self.kwargs)
resp = self.model.generate(struct)
return resp
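# Minimal usage sketch for PandaGPT (hypothetical root; `root` must contain the
# `code/` directory and the pretrained_ckpt folders referenced in self.args):
#
#     model = PandaGPT('PandaGPT_13B', root='/path/to/PandaGPT')
#     message = [dict(type='image', value='demo.jpg'),
#                dict(type='text', value='What is in the image?')]
#     print(model.generate_inner(message))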
import os
import torch
from PIL import Image
from abc import abstractproperty
from .base import BaseModel
from ..dataset import DATASET_TYPE
from ..smp import *
class Parrot(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='AIDC-AI/Parrot-7B', **kwargs):
try:
from parrot.model.parrot_arch import ParrotMetaForCausalLM
from parrot.utils.constants import DEFAULT_IMAGE_TOKEN, BEGIN_LINE, END_LINE
from parrot.model.conversation_formatter import ConversationFormatter
from parrot.utils.mm_utils import process_images
        except ImportError:
            warnings.warn('Please install Parrot before using this wrapper: '
                          'https://github.com/AIDC-AI/Parrot '
                          '(run `pip install -e . --no-deps` in the Parrot directory). '
                          'transformers==4.39.0 is recommended. ')
sys.exit(-1)
self.process_images = process_images
self.ConversationFormatter = ConversationFormatter
self.DEFAULT_IMAGE_TOKEN = DEFAULT_IMAGE_TOKEN
self.BEGIN_LINE = BEGIN_LINE
self.END_LINE = END_LINE
try:
model_name = 'parrot_qwen2'
model, tokenizer, conversation_formatter = ParrotMetaForCausalLM.build(
model_name, model_path, mm_vision_tower='openai/clip-vit-large-patch14-336'
)
self.model = model.cuda()
self.vision_tower = self.model.get_vision_tower()
self.tokenizer = tokenizer
self.conversation_formatter = conversation_formatter
self.image_processor = self.model.get_vision_tower().image_processor
except Exception as e:
warnings.warn(f'Error when loading Parrot model:\n{e}')
exit(-1)
self.kwargs = dict(
do_sample=False,
num_beams=1,
max_new_tokens=512,
repetition_penalty=None,
use_cache=True,
eos_token_id=self.tokenizer.eos_token_id,
pad_token_id=self.tokenizer.pad_token_id
)
if int(os.environ.get('LOCAL_RANK', '0')) == 0:
print(f'Following kwargs {self.kwargs} will be used as generation config.')
self.count = 0
def use_custom_prompt(self, dataset):
if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'Y/N':
prompt = self.built_yorn_prompt(line, dataset)
elif DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
else:
raise ValueError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=p) for p in tgt_path])
return message
def built_yorn_prompt(self, line, dataset=None):
prompt = line['question']
        previous_suffixes = [' Please answer yes or no.', ' Yes or No', ' Answer in one sentence.']
        for previous_suffix in previous_suffixes:
if prompt.endswith(previous_suffix):
prompt = prompt[:-len(previous_suffix)]
break
prompt += '\n请直接回答Yes或No。请用单个词或短语回答问题。' if cn_string(
prompt) else '\nPlease strictly answer Yes or No. Answer the question using a single word or phrase.'
return prompt
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
default_prompt = "\nAnswer with the option's letter from the given choices directly."
if dataset[-3:] == '_cn' or cn_string(prompt):
default_prompt = '\n请直接用给定选项中的选项字母回答。'
elif dataset[-3:] == '_pt':
default_prompt = '\nResponda diretamente com a letra da opção das escolhas dadas.'
elif dataset[-3:] == '_ar':
default_prompt = '\nأجب مباشرةً بحرف الخيار من الاختيارات المعطاة.'
elif dataset[-3:] == '_ru':
default_prompt = '\nОтветьте буквой варианта из предложенных вариантов напрямую.'
elif dataset[-3:] == '_tr':
default_prompt = '\nVerilen seçeneklerden doğrudan seçeneğin harfi ile cevap verin.'
prompt += default_prompt
# prompt += (
# '\n请直接回答选项字母。' if cn_string(prompt) else
# "\nAnswer with the option's letter from the given choices directly."
# )
else:
prompt += '\n请用单个词或短语回答问题。' if cn_string(
prompt) else '\nAnswer the question using a single word or phrase.'
return prompt
def process_answer_prefix(self, answer, prefixes):
for prefix in prefixes:
if prefix in answer.lower():
return answer[answer.lower().find(prefix) + len(prefix):]
return answer
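    # Example (illustrative): process_answer_prefix('The answer is B.', ['answer is', 'answer:'])
    # returns ' B.', i.e. everything after the first matching prefix (matched case-insensitively).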
def generate_inner(self, message, dataset=None):
query, image_paths = self.prepare_inputs(message)
images_list = [Image.open(image_path).convert('RGB') for image_path in image_paths]
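        # NOTE: abstractproperty() is used below only as a throwaway attribute holder so
        # that process_images() receives an object exposing `image_aspect_ratio`; it is
        # not used as an actual abstract property.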
args = abstractproperty()
args.image_aspect_ratio = 'pad'
image_tensors = self.process_images(images_list, self.image_processor, args).cuda()
prompt, input_ids = self.conversation_formatter.format_query(query)
input_ids = input_ids.unsqueeze(0).cuda()
with torch.inference_mode():
kwargs = dict(
images=image_tensors,
)
kwargs.update(self.kwargs)
output_ids = self.model.generate(input_ids, **kwargs)
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
response = self.tokenizer.batch_decode(output_ids[:, input_token_len:],
skip_special_tokens=True)[0].strip(string.whitespace)
answer = response
if query.endswith("Answer with the option's letter from the given choices directly.") or query.endswith(
'请直接回答选项字母。'):
qtype = 'multiple-choice'
while True:
answer = answer.strip(string.punctuation + string.whitespace)
if len(answer) > 1:
if answer[0] in string.ascii_uppercase and answer[1] in string.whitespace + string.punctuation:
answer = answer[0]
break
elif answer[-1] in string.ascii_uppercase and answer[-2] in string.whitespace + string.punctuation:
answer = answer[-1]
break
elif listinstr(['answer is', 'answer:'], answer.lower()):
answer = self.process_answer_prefix(answer, ['answer is', 'answer:'])
answer = self.process_answer_prefix(answer, ['option'])
else:
break
else:
break
else:
qtype = 'open'
if self.count % 50 == 0 and int(os.environ.get('LOCAL_RANK', '0')) == 0:
print(f'\n{self.BEGIN_LINE}')
print(f'image_paths: {image_paths}\n')
print(f'prompt: {prompt}\n')
print(f'qtype: {qtype}\n')
print(f'output: {response}\n')
print(f'answer: {answer}\n')
print(f'{self.END_LINE}\n', flush=True)
self.count += 1
return answer
def prepare_inputs(self, message):
prompt = ''
image_paths = []
image_count = 0
text_count = 0
pure_text = ''
for msg in message:
if msg['type'] == 'text':
text_count += 1
prompt += msg['value']
pure_text += msg['value']
elif msg['type'] == 'image':
image_count += 1
prompt += self.DEFAULT_IMAGE_TOKEN
image_paths.append(msg['value'])
if image_count == 1 and text_count == 1:
prompt = self.DEFAULT_IMAGE_TOKEN + '\n' + pure_text
return prompt, image_paths
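# Minimal usage sketch for Parrot (default checkpoint AIDC-AI/Parrot-7B; the image
# file name is hypothetical):
#
#     model = Parrot()
#     message = [dict(type='image', value='demo.jpg'),
#                dict(type='text', value='What color is the car?')]
#     print(model.generate_inner(message))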