Commit bc5ebf0f authored by luopl

Initial commit

import torch
from transformers import AutoTokenizer, AutoModel
import warnings
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import pandas as pd
import string
class H2OVLChat(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='h2oai/h2ovl-mississippi-2b', **kwargs):
assert model_path is not None
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
device = torch.cuda.current_device()
self.device = device
self.model = AutoModel.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval()
self.model = self.model.to(device)
self.image_size = self.model.config.vision_config.image_size
kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=1)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
def use_custom_prompt(self, dataset):
return True
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += '\n请直接回答选项字母。' if cn_string(
prompt) else "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
return prompt
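# Illustrative example (hypothetical row values, not taken from any benchmark): for a line with
# question='Which animal is larger?', A='Cat', B='Whale' and no hint, the returned prompt is
# "Which animal is larger?\nA. Cat\nB. Whale\nAnswer with the option's letter from the given choices directly."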
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if dataset is not None and listinstr(['MME'], dataset):
question = line['question']
prompt = question + ' Answer the question using a single word or phrase.'
elif dataset is not None and listinstr(['HallusionBench'], dataset):
question = line['question']
prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
if 'MathVista' in dataset:
prompt = line['question']
elif listinstr(['LLaVABench'], dataset):
question = line['question']
prompt = question + '\nAnswer this question in detail.'
elif listinstr(['MMVet'], dataset):
prompt = line['question']
else:
question = line['question']
prompt = question + '\nAnswer the question using a single word or phrase.'
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def generate_inner(self, message, dataset=None):
image_num = len([x for x in message if x['type'] == 'image'])
question = ''
image_files = [x['value'] for x in message if x['type'] == 'image']
if image_num == 1:
question = '<image>\n' + '\n'.join([x['value'] for x in message if x['type'] == 'text'])
elif image_num > 1:
text_part = ' '.join([x['value'] for x in message if x['type'] == 'text'])
image_part = ' '.join([f'<image-{i + 1}>: <image>' for i in range(image_num)])
question = image_part + '\n' + text_part
else:
question = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image_files = None
response, history = self.model.chat(
self.tokenizer,
image_files=image_files,
question=question,
generation_config=self.kwargs,
max_tiles=6,
history=None,
return_history=True)
return response
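# Minimal usage sketch (assumes the default h2oai/h2ovl-mississippi-2b checkpoint is available and
# 'demo.jpg' is a placeholder image path, not a file shipped with this repo):
#   model = H2OVLChat()
#   reply = model.generate_inner([
#       dict(type='image', value='demo.jpg'),
#       dict(type='text', value='Describe the image.'),
#   ])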
import torch
import os.path as osp
import warnings
from .base import BaseModel
from ..smp import splitlen, listinstr
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image
class IDEFICS(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='HuggingFaceM4/idefics-9b-instruct', **kwargs):
assert osp.exists(model_path) or splitlen(model_path) == 2
from transformers import IdeficsForVisionText2Text, AutoProcessor
self.model = IdeficsForVisionText2Text.from_pretrained(
model_path, torch_dtype=torch.bfloat16, device_map='auto'
)
self.processor = AutoProcessor.from_pretrained(model_path)
kwargs_default = {'max_new_tokens': 512}
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
self.file_root = osp.dirname(__file__)
warnings.warn(
f'Following kwargs received: {self.kwargs}, will use as generation config. '
)
def generate_inner(self, message, dataset=None):
prompts = (
['Users:']
+ [msg['value'] if msg['type'] == 'text' else Image.open(msg['value']) for msg in message]
+ ['<end_of_utterance>', '\nAssistant: ']
)
inputs = self.processor(
prompts, add_end_of_utterance_token=False, return_tensors='pt'
).to('cuda')
exit_condition = self.processor.tokenizer(
'<end_of_utterance>', add_special_tokens=False
).input_ids
bad_words_ids = self.processor.tokenizer(
['<image>', '<fake_token_around_image>'], add_special_tokens=False
).input_ids
generated_ids = self.model.generate(
**inputs,
eos_token_id=exit_condition,
bad_words_ids=bad_words_ids,
**self.kwargs,
)
generated_text = self.processor.batch_decode(
generated_ids, skip_special_tokens=True
)
text = generated_text[0].split('\nAssistant: ')[-1]
return text
class IDEFICS2(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def __init__(self, model_path='HuggingFaceM4/idefics2-8b', **kwargs):
assert model_path is not None
self.model_path = model_path
if 'idefics3' in self.model_path.lower():
warnings.warn('Install transformers from source: PR https://github.com/open-compass/VLMEvalKit/pull/379')
warnings.warn('Reference: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3')
self.processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForVision2Seq.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
_attn_implementation='flash_attention_2',
device_map='cpu')
self.model = model.to('cuda')
kwargs_default = {'max_new_tokens': 1024}
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(
f'Following kwargs received: {self.kwargs}, will use as generation config. '
)
torch.cuda.empty_cache()
def _process(self, formatted_messages, formatted_images):
inputs = self.processor(
text=formatted_messages, images=formatted_images, return_tensors='pt'
)
inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
return inputs
def build_prompt_default(self, message, add_brief=False, add_yes_or_no=False, change_the_img_place=False):
if change_the_img_place:
new_message = []
for s in message:
if s['type'] == 'image':
new_message.append(s)
for s in message:
if s['type'] == 'text':
new_message.append(s)
message = new_message
prompt, images = 'User:', []
for msg in message:
if msg['type'] == 'image':
img = load_image(msg['value'])
images.append(img)
prompt += '<image>'
elif msg['type'] == 'text':
prompt += msg['value'].strip()
if add_brief:
prompt += '\nGive a very brief answer.'
if add_yes_or_no:
prompt += '\nAnswer yes or no.'
prompt += '<end_of_utterance>\nAssistant:'
return prompt, images
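# Example (illustrative): one image plus the text 'How many dogs are there?' with add_brief=True yields
#   'User:<image>How many dogs are there?\nGive a very brief answer.<end_of_utterance>\nAssistant:'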
def build_prompt_puremcq(self, message):
replace_mapping = {
'\nOptions:': '\nChoices:',
'Please select the correct answer from the options above.': 'Answer with the letter.',
}
prompt, images = 'User:', []
for msg in message:
if msg['type'] == 'image':
img = load_image(msg['value'])
images.append(img)
prompt += '<image>'
elif msg['type'] == 'text':
instruction = msg['value'].strip()
for k, v in replace_mapping.items():
instruction = instruction.replace(k, v)
prompt += instruction
prompt += '<end_of_utterance>\nAssistant: Answer:'
return prompt, images
def build_prompt_mt(self, message):
prompt, images = '', []
for msg in message:
if msg['role'] == 'user':
prompt += 'User: '
elif msg['role'] == 'assistant':
prompt += 'Assistant: '
for item in msg['content']:
if item['type'] == 'image':
img = load_image(item['value'])
images.append(img)
prompt += '<image>'
elif item['type'] == 'text':
prompt += item['value'].strip()
prompt += '<end_of_utterance>\n'
return prompt + 'Assistant: '
def build_prompt_mmbench(self, message):
replace_mapping = {
'\nOptions:': '\nChoices:',
'Please select the correct answer from the options above.': 'Answer with a letter.',
}
prompt, images = 'User:', []
for msg in message:
if msg['type'] == 'image':
img = load_image(msg['value'])
images.append(img)
prompt += '<image>'
elif msg['type'] == 'text':
instruction = msg['value'].strip()
for k, v in replace_mapping.items():
instruction = instruction.replace(k, v)
# Swap hint and question
if instruction.startswith('Hint:'):
hint, question = instruction.split('\nQuestion:')
question, choices = question.split('\nChoices:')
instruction = (
'Question:' + question + '\n' + hint + '\nChoices:' + choices
)
prompt += instruction
prompt += '<end_of_utterance>\nAssistant: Answer:'
return prompt, images
def build_prompt_mmmu(self, message):
replace_mapping = {
'Question:': '',
'Please select the correct answer from the options above.': 'Answer with the letter.',
'\nOptions:': '\nChoices:',
}
prompt, images, img_counter = 'User: Question: ', [], 1
for msg in message:
if msg['type'] == 'image':
prompt += f'<image {img_counter}>:<image>\n'
img_counter += 1
img_counter = 1
for msg in message:
if msg['type'] == 'image':
img = load_image(msg['value'])
images.append(img)
prompt += f' <image {img_counter}> '
img_counter += 1
elif msg['type'] == 'text':
instruction = msg['value'].strip()
for k, v in replace_mapping.items():
instruction = instruction.replace(k, v)
prompt += instruction.strip()
prompt += '<end_of_utterance>\nAssistant:'
if 'A.' in prompt and 'B.' in prompt:
prompt += ' Answer:'
return prompt, images
def build_prompt_mathvista(self, message):
replace_mapping = {
'(A) ': 'A. ',
'(B) ': 'B. ',
'(C) ': 'C. ',
'(D) ': 'D. ',
'(E) ': 'E. ',
'(F) ': 'F. ',
'(G) ': 'G. ',
'(H) ': 'H. ',
'\nOptions:': '\nChoices:',
'Hint: ': '',
}
prompt, images = 'User:', []
for msg in message:
if msg['type'] == 'image':
img = load_image(msg['value'])
images.append(img)
prompt += '<image>'
elif msg['type'] == 'text':
instruction = msg['value'].strip()
for k, v in replace_mapping.items():
instruction = instruction.replace(k, v)
prompt += instruction.strip()
if 'A.' in prompt and 'B.' in prompt:
prompt += '\nAnswer with the letter.'
prompt += '<end_of_utterance>\nAssistant:'
if 'A.' in prompt and 'B.' in prompt:
prompt += ' Answer:'
return prompt, images
def chat_inner(self, message, dataset=None):
formatted_messages, formatted_images = self.build_prompt_mt(message)
inputs = self._process(formatted_messages, formatted_images)
generated_ids = self.model.generate(**inputs, **self.kwargs)
generated_text = self.processor.batch_decode(
generated_ids[:, inputs['input_ids'].size(1):], skip_special_tokens=True
)[0]
response = generated_text.strip()
# print(dataset, " | ", formatted_messages.replace("\n", "\\n"), " | ", response.replace("\n", "\\n"))
return response
def generate_inner(self, message, dataset=None):
if dataset in [
'MMBench_DEV_EN', 'MMBench_DEV_EN_V11',
'MMBench_TEST_EN', 'MMBench_TEST_EN_V11',
'MMBench_DEV_CN', 'MMBench_DEV_CN_V11',
'MMBench_TEST_CN', 'MMBench_TEST_CN_V11',
'MMBench', 'MMBench_V11', 'MMBench_CN', 'MMBench_CN_V11'
]:
formatted_messages, formatted_images = self.build_prompt_mmbench(message)
elif dataset in ['MMMU_DEV_VAL', 'MMMU_TEST']:
formatted_messages, formatted_images = self.build_prompt_mmmu(message)
elif dataset in ['MathVista_MINI']:
formatted_messages, formatted_images = self.build_prompt_mathvista(message)
elif dataset in [
'MME',
'MMVet',
'OCRVQA_TEST',
'OCRVQA_TESTCORE',
'TextVQA_VAL',
'ChartQA_TEST',
'DocVQA_VAL',
'DocVQA_TEST',
'InfoVQA_VAL',
'InfoVQA_TEST',
]:
formatted_messages, formatted_images = self.build_prompt_default(
message, add_brief=True
)
elif dataset == 'HallusionBench':
formatted_messages, formatted_images = self.build_prompt_default(
message, add_yes_or_no=True
)
elif dataset in [
'MMStar',
'SEEDBench_IMG',
'AI2D_TEST',
'ScienceQA_VAL',
'ScienceQA_TEST',
]:
formatted_messages, formatted_images = self.build_prompt_puremcq(message)
elif listinstr(['MLVU','TempCompass','MVBench'], dataset):
formatted_messages, formatted_images = self.build_prompt_default(message, change_the_img_place=True)
else:
formatted_messages, formatted_images = self.build_prompt_default(message)
inputs = self._process(formatted_messages, formatted_images)
generated_ids = self.model.generate(**inputs, **self.kwargs)
generated_text = self.processor.batch_decode(
generated_ids[:, inputs['input_ids'].size(1):], skip_special_tokens=True
)[0]
response = generated_text.strip()
# print(dataset, " | ", formatted_messages.replace("\n", "\\n"), " | ", response.replace("\n", "\\n"))
return response
import torch
from PIL import Image
import os.path as osp
import sys
from .base import BaseModel
from ..smp import *
class InstructBLIP(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, name):
self.config_map = {
'instructblip_7b': 'misc/blip2_instruct_vicuna7b.yaml',
'instructblip_13b': 'misc/blip2_instruct_vicuna13b.yaml',
}
self.file_path = __file__
config_root = osp.dirname(self.file_path)
try:
from lavis.models import load_preprocess
from omegaconf import OmegaConf
from lavis.common.registry import registry
except Exception as e:
logging.critical('Please install lavis before using InstructBLIP. ')
raise e
assert name in self.config_map
cfg_path = osp.join(config_root, self.config_map[name])
cfg = OmegaConf.load(cfg_path)
model_cfg = cfg.model
assert osp.exists(model_cfg.llm_model) or splitlen(model_cfg.llm_model) == 2
model_cls = registry.get_model_class(name='blip2_vicuna_instruct')
model = model_cls.from_config(model_cfg)
model.eval()
self.device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'
device = self.device
model.to(device)
self.model = model
self.kwargs = {'max_length': 512}
preprocess_cfg = cfg.preprocess
vis_processors, _ = load_preprocess(preprocess_cfg)
self.vis_processors = vis_processors
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
vis_processors = self.vis_processors
raw_image = Image.open(image_path).convert('RGB')
image_tensor = vis_processors['eval'](raw_image).unsqueeze(0).to(self.device)
outputs = self.model.generate(dict(image=image_tensor, prompt=prompt))
return outputs[0]
from .internvl_chat import InternVLChat
__all__ = ['InternVLChat']
import math
import pandas as pd
import random
import re
import string
import torch
import torch.distributed as dist
import torchvision.transforms as T
import transformers
import warnings
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoTokenizer, AutoConfig, AutoModel, CLIPImageProcessor
from .utils import (build_multi_choice_prompt,
build_video_prompt,
build_mpo_prompt,
build_mcq_cot_prompt,
build_qa_cot_prompt,
mpo_post_processing,
reorganize_prompt,
split_model, load_image)
from .utils import mpo_prompt_with_final_answer, mpo_prompt_without_final_answer
from ..base import BaseModel
from ...dataset import DATASET_TYPE, DATASET_MODALITY
from ...smp import *
class InternVLChat(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self,
model_path='OpenGVLab/InternVL-Chat-V1-5',
load_in_8bit=False,
use_mpo_prompt=False,
version='V1.0',
**kwargs):
assert model_path is not None
assert version_cmp(transformers.__version__, '4.37.2', 'ge')
self.use_mpo_prompt = use_mpo_prompt
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
# Regular expression to match the pattern 'Image' followed by a number, e.g. Image1
self.pattern = r'Image(\d+)'
# Replacement pattern to insert a hyphen between 'Image' and the number, e.g. Image-1
self.replacement = r'Image-\1'
# Convert InternVL2 response to dataset format
# e.g. Image1 -> Image-1
# Regular expression to match the pattern 'Image-' followed by a number
self.reverse_pattern = r'Image-(\d+)'
# Replacement pattern to remove the hyphen (Image-1 -> Image1)
self.reverse_replacement = r'Image\1'
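# For illustration: re.sub(self.pattern, self.replacement, 'Image1 and Image2') -> 'Image-1 and Image-2',
# and re.sub(self.reverse_pattern, self.reverse_replacement, 'Image-1') -> 'Image1'.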
if auto_split_flag():
device_map, visible_devices = split_model(model_path=model_path)
self.device = visible_devices[0]
self.model = AutoModel.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
load_in_8bit=load_in_8bit,
trust_remote_code=True,
low_cpu_mem_usage=True,
device_map=device_map).eval()
else:
self.model = AutoModel.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
load_in_8bit=load_in_8bit,
trust_remote_code=True,
low_cpu_mem_usage=True).eval().cuda()
self.device = 'cuda'
self.image_size = self.model.config.vision_config.image_size
self.version = version
kwargs_default = dict(do_sample=False, max_new_tokens=4096, top_p=None)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
# For multi-turn datasets we don't have a custom prompt
return False
if DATASET_MODALITY(dataset) == 'VIDEO':
# For video benchmarks we don't have a custom prompt here
return False
else:
return True
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if dataset is not None and DATASET_TYPE(dataset) == 'Y/N':
question = line['question']
if listinstr(['MME'], dataset):
prompt = question + ' Answer the question using a single word or phrase.'
elif listinstr(['HallusionBench', 'AMBER'], dataset):
prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
else:
prompt = question
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
prompt = build_multi_choice_prompt(line, dataset)
if os.getenv('USE_COT') == '1':
prompt = build_mcq_cot_prompt(line, prompt)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
question = line['question']
if listinstr(['LLaVABench', 'WildVision'], dataset):
prompt = question + '\nAnswer this question in detail.'
elif listinstr(['OCRVQA', 'TextVQA', 'ChartQA', 'DocVQA', 'InfoVQA', 'OCRBench',
'DUDE', 'SLIDEVQA', 'GQA', 'MMLongBench_DOC'], dataset):
prompt = question + '\nAnswer the question using a single word or phrase.'
elif listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet', 'MathVerse',
'MMDU', 'CRPE', 'MIA-Bench', 'MM-Math', 'DynaMath', 'QSpatial'], dataset):
prompt = question
if os.getenv('USE_COT') == '1':
prompt = build_qa_cot_prompt(line, prompt)
else:
prompt = question + '\nAnswer the question using a single word or phrase.'
else:
# VQA_ex_prompt: OlympiadBench, VizWiz
prompt = line['question']
if os.getenv('USE_COT') == '1':
prompt = build_qa_cot_prompt(line, prompt)
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
if self.use_mpo_prompt:
message = build_mpo_prompt(message, line, dataset)
return message
def set_max_num(self, dataset):
# The total limit on the number of images processed, set to avoid Out-of-Memory issues.
self.total_max_num = 64
if dataset is None:
self.max_num = 6
return None
res_12_datasets = ['ChartQA_TEST', 'MMMU_DEV_VAL', 'MMMU_TEST', 'MME-RealWorld',
'VCR_EN', 'VCR_ZH', 'OCRVQA']
res_18_datasets = ['DocVQA_VAL', 'DocVQA_TEST', 'DUDE', 'MMLongBench_DOC', 'SLIDEVQA']
res_24_datasets = ['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench', 'HRBench4K', 'HRBench8K']
if DATASET_MODALITY(dataset) == 'VIDEO':
self.max_num = 1
elif listinstr(res_12_datasets, dataset):
self.max_num = 12
elif listinstr(res_18_datasets, dataset):
self.max_num = 18
elif listinstr(res_24_datasets, dataset):
self.max_num = 24
else:
self.max_num = 6
def generate_v1_2(self, message, dataset=None):
self.INTERLEAVE = False
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
image = image.resize((self.image_size, self.image_size))
image_processor = CLIPImageProcessor.from_pretrained(self.model_path)
pixel_values = image_processor(images=image, return_tensors='pt').pixel_values
pixel_values = pixel_values.to(torch.bfloat16).to(self.device)
with torch.no_grad():
response = self.model.chat(self.tokenizer, pixel_values=pixel_values,
question=prompt, generation_config=self.kwargs)
return response
def generate_v1_5(self, message, dataset=None):
image_num = len([x for x in message if x['type'] == 'image'])
max_num = max(1, min(self.max_num, self.total_max_num // image_num))
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
if DATASET_MODALITY(dataset) == 'VIDEO':
prompt = build_video_prompt(prompt, dataset)
if image_num > 1:
image_path = [x['value'] for x in message if x['type'] == 'image']
pixel_values_list = []
for file_name in image_path:
pixel_values_list.append(load_image(file_name, max_num=max_num).to(self.device).to(torch.bfloat16))
pixel_values = torch.cat(pixel_values_list, dim=0)
elif image_num == 1:
image_path = [x['value'] for x in message if x['type'] == 'image'][0]
pixel_values = load_image(image_path, max_num=max_num).to(self.device).to(torch.bfloat16)
else:
pixel_values = None
with torch.no_grad():
response = self.model.chat(
self.tokenizer,
pixel_values=pixel_values,
question=prompt,
generation_config=self.kwargs,
verbose=True)
return response
def generate_v2(self, message, dataset=None):
image_num = len([x for x in message if x['type'] == 'image'])
max_num = max(1, min(self.max_num, self.total_max_num // image_num))
prompt = reorganize_prompt(message, image_num, dataset=dataset)
if dataset is not None and DATASET_MODALITY(dataset) == 'VIDEO':
prompt = build_video_prompt(prompt, dataset)
if image_num > 1:
image_path = [x['value'] for x in message if x['type'] == 'image']
num_patches_list, pixel_values_list = [], []
for image_idx, file_name in enumerate(image_path):
upscale_flag = image_idx == 0 and dataset is not None and listinstr(['MMMU'], dataset)
curr_pixel_values = load_image(
file_name, max_num=max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
num_patches_list.append(curr_pixel_values.size(0))
pixel_values_list.append(curr_pixel_values)
pixel_values = torch.cat(pixel_values_list, dim=0)
elif image_num == 1:
image_path = [x['value'] for x in message if x['type'] == 'image'][0]
upscale_flag = dataset is not None and listinstr(['MMMU'], dataset)
pixel_values = load_image(
image_path, max_num=max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
num_patches_list = [pixel_values.size(0)]
else:
pixel_values = None
num_patches_list = []
with torch.no_grad():
response = self.model.chat(
self.tokenizer,
pixel_values=pixel_values,
num_patches_list=num_patches_list,
question=prompt,
generation_config=self.kwargs,
verbose=True
)
if self.use_mpo_prompt:
response = mpo_post_processing(response, dataset)
return response
def generate_inner(self, message, dataset=None):
self.set_max_num(dataset)
print(f'InternVL model version: {self.version}')
if self.version in ['V1.1', 'V1.2']:
return self.generate_v1_2(message, dataset)
elif self.version == 'V1.5':
return self.generate_v1_5(message, dataset)
elif self.version == 'V2.0':
return self.generate_v2(message, dataset)
else:
raise ValueError(f'Unsupported version: {self.version}')
def build_history(self, message):
# Global Variables
image_path = []
image_cnt = 0
def concat_tilist(tilist):
nonlocal image_cnt # Declare image_cnt as nonlocal to modify it
prompt = ''
for item in tilist:
# Substitute the pattern in the text
if item['type'] == 'text':
prompt += re.sub(self.pattern, self.replacement, item['value'])
elif item['type'] == 'image':
image_cnt += 1
prompt += '<image>\n'
image_path.append(item['value'])
return prompt
# Only previous messages
assert len(message) % 2 == 0
history = []
for i in range(len(message) // 2):
m1, m2 = message[2 * i], message[2 * i + 1]
assert m1['role'] == 'user' and m2['role'] == 'assistant'
history.append((concat_tilist(m1['content']), concat_tilist(m2['content'])))
return history, image_path, image_cnt
def chat_inner_v2(self, message, dataset=None):
if len(message) > 1:
history, image_path, image_cnt = self.build_history(message[:-1])
else:
history, image_path, image_cnt = None, [], 1
current_msg = message[-1]
question = ''
# If message is just text in the conversation
if len(current_msg['content']) == 1 and current_msg['content'][0]['type'] == 'text':
question = current_msg['content'][0]['value']
question = re.sub(self.pattern, self.replacement, question) # Fix pattern as per InternVL
else:
for msg in current_msg['content']:
if msg['type'] == 'text':
question += re.sub(self.pattern, self.replacement, msg['value'])
elif msg['type'] == 'image':
image_cnt += 1
question += '<image>\n'
image_path.append(msg['value'])
if image_cnt > 1:
num_patches_list = []
pixel_values_list = []
for image_idx, file_name in enumerate(image_path):
upscale_flag = image_idx == 0 and dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset)
curr_pixel_values = load_image(
file_name, max_num=self.max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
num_patches_list.append(curr_pixel_values.size(0))
pixel_values_list.append(curr_pixel_values)
pixel_values = torch.cat(pixel_values_list, dim=0)
elif image_cnt == 1:
upscale_flag = listinstr(['MMMU_DEV_VAL'], dataset)
pixel_values = load_image(
image_path, max_num=self.max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
num_patches_list = [pixel_values.size(0)]
else:
pixel_values = None
num_patches_list = []
response, history = self.model.chat(
self.tokenizer,
pixel_values=pixel_values,
num_patches_list=num_patches_list,
question=question,
generation_config=self.kwargs,
history=history,
return_history=True
)
response = re.sub(self.reverse_pattern, self.reverse_replacement, response)
return response
def chat_inner(self, message, dataset=None):
self.set_max_num(dataset)
if self.version in ['V1.1', 'V1.2']:
raise ValueError(f'Unsupported version for Multi-Turn: {self.version}')
elif self.version == 'V1.5':
raise ValueError(f'Unsupported version for Multi-Turn: {self.version}')
elif self.version == 'V2.0':
kwargs_default = dict(do_sample=False, max_new_tokens=512, top_p=None, num_beams=1)
self.kwargs = kwargs_default
return self.chat_inner_v2(message, dataset)
else:
raise ValueError(f'Unsupported version for Multi-Turn: {self.version}')
import math
import pandas as pd
import random
import re
import string
import torch
import torch.distributed as dist
import torchvision.transforms as T
import transformers
import warnings
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoTokenizer, AutoConfig, AutoModel, CLIPImageProcessor
from ..base import BaseModel
from ...dataset import DATASET_TYPE, DATASET_MODALITY
from ...smp import *
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
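# Worked example (illustrative): a 1000x500 input with image_size=448 and max_num=6 has aspect ratio 2.0,
# so (2, 1) is the closest target ratio; the image is resized to 896x448 and cropped into two 448x448 tiles,
# plus a 448x448 thumbnail of the full image when use_thumbnail=True (three crops in total).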
def load_image(image_file, input_size=448, max_num=6, upscale=False):
image = Image.open(image_file).convert('RGB')
if upscale:
image = image.resize((image.width * 2, image.height * 2), Image.BILINEAR)
transform = build_transform(input_size=input_size)
images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
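# Usage sketch (hypothetical path): load_image('page.png', max_num=6) returns a tensor of shape
# (N, 3, 448, 448), where N is the number of tiles chosen by dynamic_preprocess plus one thumbnail
# whenever more than a single tile is produced.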
def get_local_rank_and_local_world_size():
if not dist.is_available():
return 0, 1
if not dist.is_initialized():
return 0, 1
if 'SLURM_LOCALID' in os.environ:
local_rank = int(os.environ['SLURM_LOCALID'])
local_world_size = int(os.environ['SLURM_NTASKS_PER_NODE'])
return local_rank, local_world_size
if 'LOCAL_RANK' in os.environ and 'LOCAL_WORLD_SIZE' in os.environ:
return int(os.environ['LOCAL_RANK']), int(os.environ['LOCAL_WORLD_SIZE'])
raise NotImplementedError(
"Failed to get local_rank and local_world_size! "
"Please ensure that you set the environment variables "
"`LOCAL_RANK` and `LOCAL_WORLD_SIZE`."
)
def split_model(model_path):
num_gpus_per_node = 8
rank, world_size = get_rank_and_world_size()
try:
local_rank, local_world_size = get_local_rank_and_local_world_size()
except Exception:
local_rank = rank
if 'GPUS_PER_PROCESS' in os.environ:
gpus_per_process = int(os.environ['GPUS_PER_PROCESS'])
else:
gpus_per_process = 8 # default to use 8 GPUs for one model
start_gpu = local_rank * gpus_per_process
end_gpu = start_gpu + gpus_per_process
assert end_gpu <= num_gpus_per_node, f"Process {local_rank} tries to access GPU {end_gpu}, " \
f"but only {num_gpus_per_node} GPUs are available per node."
visible_devices = list(range(start_gpu, end_gpu))
device_map = {}
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
num_gpus_for_vit = 0.5
num_layers = config.llm_config.num_hidden_layers
num_layers_per_gpu = math.ceil(num_layers / (len(visible_devices) - num_gpus_for_vit))
num_layers_per_gpu = [num_layers_per_gpu] * len(visible_devices)
num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'language_model.model.layers.{layer_cnt}'] = visible_devices[i]
layer_cnt += 1
device_map['vision_model'] = visible_devices[0]
device_map['mlp1'] = visible_devices[0]
device_map['language_model.model.tok_embeddings'] = visible_devices[0]
device_map['language_model.model.embed_tokens'] = visible_devices[0]
device_map['language_model.output'] = visible_devices[0]
device_map['language_model.model.norm'] = visible_devices[0]
device_map['language_model.lm_head'] = visible_devices[0]
device_map[f'language_model.model.layers.{num_layers - 1}'] = visible_devices[0]
return device_map, visible_devices
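# Worked example (illustrative numbers): with 8 visible GPUs and a 48-layer language model,
# num_layers_per_gpu = ceil(48 / 7.5) = 7; the first GPU is then reduced to ceil(7 * 0.5) = 4 layers
# because it also hosts the vision tower, embeddings, and output head, while the other GPUs take 7 layers each.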
def split_model_old(model_name):
import math
device_map = {}
num_gpus = torch.cuda.device_count()
rank, world_size = get_rank_and_world_size()
num_gpus = num_gpus // world_size
num_layers_map = {
'InternVL2-8B': 32,
'InternVL2-26B': 48,
'InternVL2-40B': 60,
'InternVL2-Llama3-76B': 80
}
if model_name not in num_layers_map:
return 'cuda'
num_layers = num_layers_map[model_name]
# Since the first GPU will be used for ViT, treat it as 0.5 GPU.
num_layers_per_gpu = math.ceil(num_layers / (num_gpus - 0.5))
num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'language_model.model.layers.{layer_cnt}'] = rank + world_size * i
layer_cnt += 1
device_map['vision_model'] = rank
device_map['mlp1'] = rank
device_map['language_model.model.tok_embeddings'] = rank
device_map['language_model.model.embed_tokens'] = rank
device_map['language_model.output'] = rank
device_map['language_model.model.norm'] = rank
device_map['language_model.lm_head'] = rank
device_map['language_model.model.rotary_emb'] = rank
device_map[f'language_model.model.layers.{num_layers - 1}'] = rank
return device_map
def build_mcq_cot_prompt(line, prompt):
cot_prompt = (
"Answer the preceding multiple choice question. The last line of your response should follow "
"this format: 'Answer: \\boxed{$LETTER}' (without quotes), where LETTER is one of the options. "
"If you are uncertain or the problem is too complex, make a reasoned guess based on the "
"information provided. Avoid repeating steps indefinitely—provide your best guess even if "
"unsure. Think step by step logically, considering all relevant information before answering."
)
prompt = prompt.replace("Answer with the option's letter from the given choices directly.", '').strip()
prompt = prompt + '\n' + cot_prompt
return prompt
def build_qa_cot_prompt(line, prompt):
cot_prompt = (
"Answer the preceding question. The last line of your response should follow this format: "
"'Answer: \\boxed{$FINAL_ANSWER}' (without quotes), where 'FINAL_ANSWER' is your conclusion "
"based on the reasoning provided. If you are uncertain or the problem is too complex, make "
"a reasoned guess based on the information provided. Avoid repeating steps indefinitely—"
"provide your best guess even if unsure. Think step by step logically, considering all "
"relevant information before answering."
)
prompt = prompt + '\n' + cot_prompt
return prompt
def build_multi_choice_prompt(line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += '\n请直接回答选项字母。' if cn_string(
prompt) else "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
return prompt
def build_video_prompt(prompt, dataset=None, max_frames=64):
for start in range(0, max_frames, 8):
images_to_remove = ''.join([f'<Image-{i}>' for i in range(start + 1, start + 9)])
prompt = prompt.replace(images_to_remove, '')
for i in range(max_frames):
prompt = prompt.replace(f'Image-{i + 1}', f'Frame-{i + 1}')
if listinstr(['MMBench-Video'], dataset):
prompt = prompt.replace('\nAnswer:', '')
elif listinstr(['Video-MME'], dataset):
prompt = prompt.replace('\nAnswer:', '')
prompt += "\nAnswer with the option's letter from the given choices directly."
elif listinstr(['MVBench'], dataset):
prompt = prompt.replace('Best option:(', '')
return prompt
def reorganize_prompt(message, image_num, dataset=None):
if dataset is not None and listinstr(['MUIRBench'], dataset):
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
images_to_remove = ' '.join(['<image>'] * image_num)
prompt = prompt.replace(images_to_remove, '')
for i in range(image_num):
prompt = prompt.replace('<image>', f'<Image-{i + 1}>', 1)
prompt = ''.join([f'Image-{i + 1}: <image>\n' for i in range(image_num)]) + prompt
elif image_num == 1:
prompt = '<image>\n' + '\n'.join([x['value'] for x in message if x['type'] == 'text'])
else:
prompt, image_idx = '', 1
for x in message:
if x['type'] == 'text':
prompt += x['value']
elif x['type'] == 'image':
prompt += f'<Image-{image_idx}>'
image_idx += 1
prompt = ''.join([f'Image-{i + 1}: <image>\n' for i in range(image_num)]) + prompt
images_to_remove = ''.join([f'<Image-{i + 1}>' for i in range(image_num)])
prompt = prompt.replace(images_to_remove, '')
return prompt
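# Example (illustrative): two adjacent image entries followed by the text 'Describe both.' become
#   'Image-1: <image>\nImage-2: <image>\nDescribe both.'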
mpo_prompt_with_final_answer = (
"Your task is to answer the question below. "
"Give step by step reasoning before you answer, and when you're ready to answer, "
"please use the format \"Final answer: ..\""
"\n\n"
"Question:"
"\n\n"
"{question}"
)
mpo_prompt_without_final_answer = (
"Your task is to answer the question below. "
"Give step by step reasoning. "
"\n\n"
"Question:"
"\n\n"
"{question}"
)
def mpo_post_processing(response, dataset):
def extract_answer(text):
match = re.search(r'(Final answer:|Answer:)\s*(.*)', text, re.IGNORECASE)
if match:
return match.group(2).strip()
return text
if dataset is not None and (DATASET_TYPE(dataset) in ['Y/N', 'MCQ'] or listinstr(['CRPE'], dataset)):
response = extract_answer(response).strip()
return response
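# For illustration: extract_answer('...reasoning...\nFinal answer: B') returns 'B', so Y/N and MCQ responses
# are reduced to the bare answer; responses for other datasets pass through unchanged.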
def build_mpo_prompt(message, line, dataset):
if not listinstr(['LLaVABench'], dataset):
if listinstr(['MMVet'], dataset):
cot_prompt = mpo_prompt_without_final_answer
else:
cot_prompt = mpo_prompt_with_final_answer
question_orig = line['question']
if listinstr(['MathVerse', 'MathVision'], dataset):
question_orig = question_orig.split('Question:', 1)[-1].strip()
question_orig = question_orig.replace('Choices:\n', '').strip()
prompt = cot_prompt.format(question=question_orig)
else:
prompt = line['question']
message[0]['value'] = prompt
return message
import sys
import torch
from transformers import AutoModelForCausalLM, AutoConfig
import warnings
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
class Janus(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def check_install(self):
try:
import janus
except Exception as e:
logging.critical(
'Please install janus from source: https://github.com/deepseek-ai/Janus')
raise e
def __init__(self, model_path='deepseek-ai/Janus-1.3B', **kwargs):
self.check_install()
assert model_path is not None
self.model_path = model_path
from janus.models import VLChatProcessor
self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
self.tokenizer = self.vl_chat_processor.tokenizer
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
self.model = model.to(torch.bfloat16).cuda().eval()
torch.cuda.empty_cache()
default_kwargs = dict(
max_new_tokens=512,
do_sample=False,
use_cache=True,
output_logits=False,
output_scores=False,
return_dict_in_generate=False)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
def prepare_inputs(self, message):
def prepare_itlist(msgs):
content, images = '', []
for s in msgs:
if s['type'] == 'image':
images.append(s['value'])
content += '<image_placeholder>'
elif s['type'] == 'text':
content += s['value']
return content, images
conversation = []
if 'role' not in message[0]:
content, images = prepare_itlist(message)
conversation.append(dict(role='User', content=content, images=images))
else:
role_map = {'user': 'User', 'assistant': 'Assistant'}
for msgs in message:
role = role_map[msgs['role']]
content, images = prepare_itlist(msgs['content'])
conversation.append(dict(role=role, content=content, images=images))
conversation.append(dict(role='Assistant', content=''))
return conversation
def generate_inner(self, message, dataset=None):
if dataset is None or not ('MMVet' in dataset):
self.vl_chat_processor.system_prompt = ""
else:
self.vl_chat_processor.system_prompt = "You are a helpful assistant. Please answer truthfully and write out your thinking step by step to be sure you get the right answer." # noqa: E501
conversation = self.prepare_inputs(message)
from janus.utils.io import load_pil_images
pil_images = load_pil_images(conversation)
prepare_inputs = self.vl_chat_processor(conversations=conversation, images=pil_images, force_batchify=True)
prepare_inputs = prepare_inputs.to(self.model.device, dtype=torch.bfloat16)
inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs)
outputs = self.model.language_model.generate(
inputs_embeds=inputs_embeds,
attention_mask=prepare_inputs.attention_mask,
pad_token_id=self.tokenizer.eos_token_id,
bos_token_id=self.tokenizer.bos_token_id,
eos_token_id=self.tokenizer.eos_token_id,
**self.kwargs)
answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
return answer
def chat_inner(self, message, dataset=None):
return self.generate_inner(message, dataset=dataset)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
if DATASET_TYPE(dataset) == 'Y/N':
if dataset == 'POPE':
question = question.replace(" Please answer yes or no.", "")
prompt = '\n' + question + "\nAnswer the question using a single word or phrase."
elif DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = f'\nHint: {hint}\n' if hint is not None else '\n'
prompt += f'{question}\n'
prompt += (
f"{options_prompt}\nAnswer with the option's letter from the given choices directly."
if len(options) else 'Answer the question directly. '
)
elif dataset == 'MMVet':
prompt = '\n' + question
else:
raise NotImplementedError
message = [dict(type='image', value=s) for s in tgt_path]
message.extend([dict(type='text', value=prompt)])
return message
import torch
import re
from PIL import Image
from abc import abstractproperty
import sys
import os.path as osp
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import copy
class Kosmos2(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def __init__(self,
model_path='microsoft/kosmos-2-patch14-224',
**kwargs):
try:
from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
except Exception as e:
logging.critical("Please install Transformers version 4.45.1 by running: pip install transformers==4.45.1")
raise e
assert osp.exists(model_path) or splitlen(model_path) == 2
self.model = (
Kosmos2ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
.to(torch.device('cuda'))
)
self.processor = AutoProcessor.from_pretrained(model_path)
default_kwargs = dict(
max_new_tokens=512,
use_cache=True
)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
torch.cuda.empty_cache()
def generate_inner(self, message, dataset=None):
TASK_TOKEN = '<grounding> '
QUESTION_TOKEN = 'Question: '
ANSWER_TOKEN = 'Answer: '
images = []
prompt = ''
prompt += TASK_TOKEN
for s in message:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
prompt += QUESTION_TOKEN
prompt += s['value']
prompt += ANSWER_TOKEN
images = [Image.open(s) for s in images]
inputs = self.processor(text=prompt, images=images[0], return_tensors='pt').to(torch.device('cuda'))
generated_ids = self.model.generate(
pixel_values=inputs['pixel_values'],
input_ids=inputs['input_ids'],
attention_mask=inputs['attention_mask'],
image_embeds=None,
image_embeds_position_mask=inputs['image_embeds_position_mask'],
**self.kwargs
)
generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
processed_text = self.processor.post_process_generation(generated_text, cleanup_and_extract=True)[0]
cleaned_answer = re.sub(r'(Question:.*?Answer:|Question:.*)', '', processed_text).strip()
return cleaned_answer
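# The prompt assembled above follows the pattern '<grounding> Question: <text> Answer: ', with the first image
# handed to the processor separately; the final re.sub strips the echoed question/answer scaffold from the
# decoded output.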
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMMU'], dataset):
return False
if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
if dataset == 'MMVet':
prompt = question + '\nAnswer the question directly. '
elif DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = f'Hint: {hint}\n' if hint is not None else ''
prompt += f'{question}\n'
prompt += (
f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
if len(options) else 'Answer the question directly. '
)
else:
raise NotImplementedError
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
import torch
from PIL import Image
import os.path as osp
import sys
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
class llama_vision(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
# This function is used to split Llama-3.2-90B
def split_model(self):
import math
device_map = {}
num_gpus = torch.cuda.device_count()
rank, world_size = get_rank_and_world_size()
num_gpus = num_gpus // world_size
num_layers = 100
# GPU0: -5, GPU-1: -7
total_cost = num_layers + 5 + 7
# Since the first GPU will be used for ViT, treat it as 0.8 GPU.
num_layers_per_gpu = total_cost // num_gpus
num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
# The total number of GPUs might be odd
num_layers_per_gpu[-1] = total_cost - sum(num_layers_per_gpu[:-1])
num_layers_per_gpu[0] -= 5
num_layers_per_gpu[-1] -= 7
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'language_model.model.layers.{layer_cnt}'] = rank + world_size * i
layer_cnt += 1
device_map['vision_model'] = rank
device_map['language_model.model.embed_tokens'] = rank
device_map['language_model.model.rotary_emb'] = rank
device_map['language_model.model.norm'] = rank + world_size * (num_gpus - 1)
device_map['language_model.lm_head'] = rank + world_size * (num_gpus - 1)
device_map['multi_modal_projector'] = rank + world_size * (num_gpus - 1)
return device_map
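# Worked example (illustrative, world_size == 1): with num_layers = 100 and 4 GPUs, total_cost = 112 and each
# GPU is budgeted 112 // 4 = 28 slots; GPU 0 keeps 28 - 5 = 23 decoder layers (it also hosts the vision model
# and embeddings) and the last GPU keeps 28 - 7 = 21 (it hosts the norm, lm_head and projector),
# i.e. 23 + 28 + 28 + 21 = 100.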
def __init__(self, model_path='meta-llama/Llama-3.2-11B-Vision-Instruct', **kwargs):
try:
from transformers import MllamaForConditionalGeneration, AutoProcessor
except Exception as e:
logging.critical('Please install transformers>=4.45.0 before using llama_vision.')
raise e
rank, world_size = get_rank_and_world_size()
if '11b' in model_path.lower() and auto_split_flag():
assert world_size == 1, 'We only support world_size == 1 when AUTO_SPLIT is set for Llama-3.2-11B'
logging.warning('Currently, we only support splitting the 11B model across all GPUs.')
self.model = MllamaForConditionalGeneration.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
device_map='auto',
).eval()
elif '90b' in model_path.lower():
device_map = self.split_model()
self.model = MllamaForConditionalGeneration.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
device_map=device_map,
).eval()
else:
self.model = MllamaForConditionalGeneration.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
device_map='cpu',
).cuda().eval()
self.device = 'cuda'
self.processor = AutoProcessor.from_pretrained(model_path)
if 'Instruct' in model_path:
kwargs_default = dict(do_sample=True, temperature=0.6, top_p=0.9)
else:
kwargs_default = dict(do_sample=False, max_new_tokens=512, temperature=0.0, top_p=None, num_beams=1)
kwargs.update(kwargs_default)
print(f'Following kwargs received: {kwargs}, will use as generation config. ')
self.kwargs = kwargs
self.model_name = model_path
def use_custom_prompt(self, dataset):
if dataset is None:
return False
if listinstr(['AI2D', 'MMMU', 'MathVista', 'ChartQA', 'DocVQA'], dataset):
# For certain datasets we use a custom prompt
return True
else:
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
if listinstr(['AI2D'], dataset):
self.kwargs['max_new_tokens'] = 400
for key, item in options.items():
question += f'\n{key}. {item}'
if '11B' in self.model_name:
prompt = (
f'Look at the scientific diagram carefully and answer the following question: {question}\n'
f'Think step by step and finally respond to the question '
f"with only the correct option number as \"FINAL ANSWER\"."
f"<cot_start>Let's think step by step."
)
elif '90B' in self.model_name:
prompt = (
f'Look at the scientific diagram carefully and answer the following question: {question}\n'
f'Respond only with the correct option digit.'
)
elif listinstr(['MMMU'], dataset):
self.kwargs['max_new_tokens'] = 2048
options = '\n'.join([f'{key}. {item}' for key, item in options.items()])
prompt = (
f'Look at the image carefully and solve the following question step-by-step. '
f'Question: {question} Options: {options} Indicate the correct answer at the end.'
)
for i in range(len(tgt_path)):
prompt = prompt.replace(f'<image {i+1}>', '')
elif listinstr(['MathVista'], dataset):
self.kwargs['max_new_tokens'] = 2048
prompt = f'{question}'
elif listinstr(['ChartQA'], dataset):
self.kwargs['max_new_tokens'] = 512
if '11B' in self.model_name:
prompt = (
f'You are provided a chart image and will be asked a question. '
f'You have to think through your answer and provide a step-by-step solution. '
f'Once you have the solution, write the final answer in at most a few words at the end '
f"with the phrase \"FINAL ANSWER:\". "
f"The question is: {question}<cot_start>Let's think step by step."
)
elif '90B' in self.model_name:
prompt = (
f'You are provided a chart image and will be asked a question. '
f'Follow these steps carefully:\n '
f'Step 1: Analyze the question to understand what specific data or information is being asked for. '
f'Focus on whether the question is asking for a specific number or category '
f'from the chart image.\n '
f'Step 2: Identify any numbers, categories, or groups mentioned in the question '
f'and take note of them. Focus on detecting and matching them directly to the image. \n'
f'Step 3: Study the image carefully and find the relevant data corresponding to the categories '
f'or numbers mentioned. Avoid unnecessary assumptions or calculations; '
f'simply read the correct data from the image.\n '
f'Step 4: Develop a clear plan to solve the question by locating the right data. '
f'Focus only on the specific category or group that matches the question. \n'
f'Step 5: Use step-by-step reasoning to ensure you are referencing the correct numbers '
f'or data points from the image, avoiding unnecessary extra steps or interpretations.\n '
f"Step 6: Provide the final answer, starting with \"FINAL ANSWER:\" "
f'and using as few words as possible, '
f'simply stating the number or data point requested. \n\n '
f"The question is: {question}<cot_start>Let's think step by step."
)
elif listinstr(['DocVQA'], dataset):
self.kwargs['max_new_tokens'] = 512
prompt = (
f'Read the text in the image carefully and answer the question '
f'with the text as seen exactly in the image. '
f'For yes/no questions, just respond Yes or No. '
f'If the answer is numeric, just respond with the number and nothing else. '
f'If the answer has multiple words, just respond with the words and absolutely nothing else. '
f'Never respond in a sentence or a phrase.\n Question: {question}'
)
else:
raise NotImplementedError(f'Dataset {dataset} not supported.')
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path)
messages = [
{'role': 'user', 'content': [
{'type': 'image'},
{'type': 'text', 'text': prompt}
]}
]
input_text = self.processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = self.processor(image, input_text, return_tensors='pt').to(self.device)
if not self.use_custom_prompt(dataset):
if dataset is not None and DATASET_TYPE(dataset) in ['MCQ', 'Y/N']:
self.kwargs['max_new_tokens'] = 128
else:
self.kwargs['max_new_tokens'] = 512
output = self.model.generate(**inputs, **self.kwargs)
return self.processor.decode(output[0][inputs['input_ids'].shape[1]:]).replace('<|eot_id|>', '')
from .llava import LLaVA, LLaVA_Next, LLaVA_Next2, LLaVA_OneVision, LLaVA_OneVision_HF
from .llava_xtuner import LLaVA_XTuner
__all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner', 'LLaVA_Next2', 'LLaVA_OneVision', 'LLaVA_OneVision_HF']
import torch
from PIL import Image
from abc import abstractproperty
import sys
import os.path as osp
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE, DATASET_MODALITY
import copy
import requests
class LLaVA(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def __init__(self, model_path="liuhaotian/llava_v1.5_7b", **kwargs):
try:
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path
except Exception as err:
logging.critical(
"Please install llava from https://github.com/haotian-liu/LLaVA"
)
raise err
assert osp.exists(model_path) or splitlen(model_path) == 2
self.system_prompt = (
"A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions. "
)
self.stop_str = "</s>"
if model_path == "Lin-Chen/ShareGPT4V-7B":
model_name = "llava-v1.5-7b"
elif model_path == "Lin-Chen/ShareGPT4V-13B":
model_name = "llava-v1.5-13b"
else:
model_name = get_model_name_from_path(model_path)
try:
self.tokenizer, self.model, self.image_processor, self.context_len = (
load_pretrained_model(
model_path=model_path,
model_base=None,
model_name=model_name,
device="cpu",
device_map="cpu",
)
)
except Exception as err:
if "ShareGPT4V" in model_path:
import llava
logging.critical(
"Please manually remove the encoder type check in "
f"{llava.__path__[0]}/model/multimodal_encoder/builder.py "
"Line 8 to use the ShareGPT4V model. "
)
else:
logging.critical("Unknown error when loading LLaVA model.")
raise err
self.model = self.model.cuda()
self.conv_mode = "llava_v1"
kwargs_default = dict(
do_sample=False,
temperature=0,
max_new_tokens=512,
top_p=None,
num_beams=1,
use_cache=True,
) # noqa E501
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(
f"Following kwargs received: {self.kwargs}, will use as generation config. "
)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == "MCQ":
return True
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
question = line["question"]
hint = line["hint"] if ("hint" in line and not pd.isna(line["hint"])) else None
if hint is not None:
question = hint + "\n" + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f"\n{key}. {item}"
prompt = question
if len(options):
prompt += (
"\n请直接回答选项字母。"
if cn_string(prompt)
else "\nAnswer with the option's letter from the given choices directly."
)
else:
prompt += (
"\n请直接回答问题。"
if cn_string(prompt)
else "\nAnswer the question directly."
)
message = [dict(type="image", value=s) for s in tgt_path]
message.append(dict(type="text", value=prompt))
return message
def concat_tilist(self, message):
text, images = "", []
for item in message:
if item["type"] == "text":
text += item["value"]
elif item["type"] == "image":
text += " <image> "
images.append(item["value"])
return text, images
def chat_inner(self, message, dataset=None):
from llava.mm_utils import (
process_images,
tokenizer_image_token,
KeywordsStoppingCriteria,
)
from llava.constants import IMAGE_TOKEN_INDEX
prompt = self.system_prompt
images = []
for utter in message:
prompt += "USER: " if utter["role"] == "user" else "ASSISTANT: "
content, images_sub = self.concat_tilist(utter["content"])
prompt += content
images.extend(images_sub)
prompt += " " if utter["role"] == "user" else self.stop_str
assert message[-1]["role"] == "user", message
prompt += "ASSISTANT: "
images = [Image.open(s).convert("RGB") for s in images]
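# NOTE: abstractproperty() is (ab)used here as a throwaway attribute container so that
# process_images() receives an object exposing `image_aspect_ratio`, standing in for the
# argparse namespace passed around in the upstream LLaVA code.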
args = abstractproperty()
args.image_aspect_ratio = "pad"
image_tensor = process_images(images, self.image_processor, args).to(
"cuda", dtype=torch.float16
)
input_ids = (
tokenizer_image_token(
prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
)
.unsqueeze(0)
.cuda()
)
keywords = [self.stop_str]
stopping_criteria = KeywordsStoppingCriteria(
keywords, self.tokenizer, input_ids
)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
images=image_tensor,
stopping_criteria=[stopping_criteria],
**self.kwargs,
)
output = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[
0
].strip()
return output
def generate_inner(self, message, dataset=None):
from llava.mm_utils import (
process_images,
tokenizer_image_token,
KeywordsStoppingCriteria,
)
from llava.constants import IMAGE_TOKEN_INDEX
# Support interleave text and image
content, images = self.concat_tilist(message)
images = [Image.open(s).convert("RGB") for s in images]
args = abstractproperty()
args.image_aspect_ratio = "pad"
if images:
image_tensor = process_images(images, self.image_processor, args).to(
"cuda", dtype=torch.float16
)
else:
image_tensor = None
prompt = self.system_prompt + "USER: " + content + " ASSISTANT: "
input_ids = (
tokenizer_image_token(
prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
)
.unsqueeze(0)
.cuda()
)
keywords = [self.stop_str]
stopping_criteria = KeywordsStoppingCriteria(
keywords, self.tokenizer, input_ids
)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
images=image_tensor,
stopping_criteria=[stopping_criteria],
**self.kwargs,
)
output = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[
0
].strip()
return output
class LLaVA_Next(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path="llava-hf/llava-v1.6-vicuna-7b-hf", **kwargs):
import transformers
from transformers import (
LlavaNextProcessor,
LlavaNextForConditionalGeneration,
AutoProcessor,
LlavaForConditionalGeneration,
)
self.model_path = model_path
if "34b" in model_path.lower():
self.processor = LlavaNextProcessor.from_pretrained(
self.model_path, use_fast=False
)
elif "interleave" in model_path.lower():
self.processor = AutoProcessor.from_pretrained(self.model_path)
else:
self.processor = LlavaNextProcessor.from_pretrained(self.model_path)
flash_attn_flag = False
try:
import flash_attn
flash_attn_flag = True
except ImportError:
pass
if flash_attn_flag:
if "interleave" in model_path.lower():
model = LlavaForConditionalGeneration.from_pretrained(
self.model_path,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
use_flash_attention_2=True,
)
else:
model = LlavaNextForConditionalGeneration.from_pretrained(
self.model_path,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
use_flash_attention_2=True,
)
else:
if "interleave" in model_path.lower():
model = LlavaForConditionalGeneration.from_pretrained(
self.model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
)
else:
model = LlavaNextForConditionalGeneration.from_pretrained(
self.model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
)
model = model.eval()
self.model = model.cuda()
kwargs_default = dict(
do_sample=False, temperature=0, max_new_tokens=512, top_p=None, num_beams=1
)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(
f"Following kwargs received: {self.kwargs}, will use as generation config. "
)
def apply_prompt_template(self, prompt):
model_path = self.model_path.lower()
if "mistral" in model_path:
template = "[INST] PLACEHOLDER [/INST]"
elif "vicuna" in model_path:
template = (
"A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions. "
"USER: PLACEHOLDER ASSISTANT:"
)
elif "34b" in model_path:
template = (
"<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\nPLACEHOLDER<|im_end|>"
"<|im_start|>assistant\n"
)
else:
raise NotImplementedError(
f"Prompt template for {model_path} not implemented."
)
prompt = template.replace("PLACEHOLDER", f"<image>\n{prompt}")
return prompt
def output_process(self, answer):
if "<s>" in answer:
answer = answer.replace("<s>", "").strip()
if "[/INST]" in answer:
answer = answer.split("[/INST]")[1].strip()
elif "ASSISTANT:" in answer:
answer = answer.split("ASSISTANT:")[1].strip()
elif "assistant\n" in answer:
answer = answer.split("assistant\n")[1].strip()
elif "<|end_header_id|>\n\n" in answer:
answer = answer.split("<|end_header_id|>\n\n")[2].strip()
if "</s>" in answer:
answer = answer.split("</s>")[0].strip()
elif "<|im_end|>" in answer:
answer = answer.split("<|im_end|>")[0].strip()
elif "<|eot_id|>" in answer:
answer = answer.split("<|eot_id|>")[0].strip()
return answer
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == "MCQ":
return True
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
question = line["question"]
hint = line["hint"] if ("hint" in line and not pd.isna(line["hint"])) else None
if hint is not None:
question = hint + "\n" + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f"\n{key}. {item}"
prompt = question
if len(options):
prompt += (
"\n请直接回答选项字母。"
if cn_string(prompt)
else "\nAnswer with the option's letter from the given choices directly."
)
else:
prompt += (
"\n请直接回答问题。"
if cn_string(prompt)
else "\nAnswer the question directly."
)
message = [dict(type="image", value=s) for s in tgt_path]
message.append(dict(type="text", value=prompt))
return message
def generate_inner(self, message, dataset=None):
content, images = [], []
for msg in message:
if msg["type"] == "text":
content.append({"type": msg["type"], "text": msg["value"]})
else:
content.append({"type": "image"})
images.append(Image.open(msg["value"]).convert("RGB"))
conversation = [
{
"role": "user",
"content": content,
}
]
prompt = self.processor.apply_chat_template(
conversation, add_generation_prompt=True
)
inputs = self.processor(prompt, images, return_tensors="pt").to(
"cuda", torch.float16
)
output = self.model.generate(**inputs, **self.kwargs)
answer = self.processor.decode(output[0], skip_special_tokens=True)
answer = self.output_process(answer)
return answer
class LLaVA_Next2(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
DEFAULT_IMAGE_TOKEN = "<image>"
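# IMAGE_TOKEN_INDEX (-200) is the sentinel token id that LLaVA's tokenizer_image_token()
# inserts in place of the <image> marker; the model later splices the projected vision
# features into the sequence at these positions.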
IMAGE_TOKEN_INDEX = -200
def __init__(self, model_path="lmms-lab/llama3-llava-next-8b", **kwargs):
assert model_path is not None
try:
from llava.model.builder import load_pretrained_model
from llava.conversation import conv_templates, SeparatorStyle
from llava.mm_utils import (
get_model_name_from_path,
tokenizer_image_token,
KeywordsStoppingCriteria,
)
except Exception as err:
logging.critical(
"Please `pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git`"
)
raise err
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, _ = load_pretrained_model(
model_path, None, model_name, device_map=None
)
model.cuda().eval()
model.tie_weights()
if "llama3" in model_path.lower():
conv_mode = "llava_llama_3"
elif "qwen" in model_path.lower():
conv_mode = "qwen_1_5"
self.conv_template = conv_mode
self.conv_templates = conv_templates
self.tokenizer = tokenizer
self.model = model
self.image_processor = image_processor
self.tokenizer_image_token = tokenizer_image_token
self.KeywordStoppingCriteria = KeywordsStoppingCriteria
self.SeparatorStyle = SeparatorStyle
def generate_inner(self, message, dataset=None):
content, images = "", []
for msg in message:
if msg["type"] == "text":
content += msg["value"]
else:
images.append(Image.open(msg["value"]).convert("RGB"))
content += self.DEFAULT_IMAGE_TOKEN + "\n"
preprocess = self.image_processor.preprocess
image_tokenizer = self.tokenizer_image_token
image_tensor = [
preprocess(f, return_tensors="pt")["pixel_values"][0].half().cuda()
for f in images
]
image_tensor = torch.stack(image_tensor)
conv = copy.deepcopy(self.conv_templates[self.conv_template])
conv.append_message(conv.roles[0], content)
conv.append_message(conv.roles[1], None)
prompt_question = conv.get_prompt()
input_ids = image_tokenizer(
prompt_question, self.tokenizer, self.IMAGE_TOKEN_INDEX, return_tensors="pt"
)
input_ids = input_ids.unsqueeze(0).cuda()
stop_str = conv.sep if conv.sep_style != self.SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = self.KeywordStoppingCriteria(
keywords, self.tokenizer, input_ids
)
cont = self.model.generate(
input_ids,
images=image_tensor,
do_sample=False,
temperature=0,
max_new_tokens=512,
stopping_criteria=[stopping_criteria],
)
text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
return text_outputs
class LLaVA_OneVision(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
VIDEO_LLM = True
DEFAULT_IMAGE_TOKEN = "<image>"
IMAGE_TOKEN_INDEX = -200
# This function is used to split the 72B LLaVA-OneVision model across multiple GPUs
def split_model(self, model_path):
import math
device_map = {}
num_gpus = torch.cuda.device_count()
rank, world_size = get_rank_and_world_size()
num_gpus = num_gpus // world_size
if "72b" not in model_path.lower():
return None
# embed_tokens, vision_tower/resampler, mm_projector, norm, image_newline and lm_head are budgeted as 8 extra layers (hence 80 + 8)
num_layers = 80 + 8
num_layers_per_gpu = math.ceil(num_layers / num_gpus)
num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
num_layers_per_gpu[0] -= 6
num_layers_per_gpu[-1] -= 2
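# GPU `rank` (the first GPU assigned to this rank) also hosts the embeddings, norm,
# image_newline, vision tower, resampler and projector, and the last GPU hosts lm_head,
# so the first GPU is assigned 6 fewer decoder layers and the last GPU 2 fewer.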
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f"model.layers.{layer_cnt}"] = rank + world_size * i
layer_cnt += 1
last_gpu = rank + world_size * (num_gpus - 1)
device_map["model.image_newline"] = rank
device_map["model.embed_tokens"] = rank
device_map["model.norm"] = rank
device_map["model.vision_tower"] = rank
device_map["model.vision_resampler"] = rank
device_map["model.mm_projector"] = rank
device_map["lm_head"] = last_gpu
return device_map
def __init__(self, model_path="lmms-lab/llava-onevision-qwen2-7b-si", **kwargs):
assert model_path is not None
try:
from llava.model.builder import load_pretrained_model
from llava.conversation import conv_templates, SeparatorStyle
from llava.mm_utils import (
get_model_name_from_path,
process_images,
tokenizer_image_token,
KeywordsStoppingCriteria,
) # noqa: E501
except Exception as err:
logging.critical(
"Please `pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git`"
)
raise err
video_kwargs_default = dict(
overwrite=True, mm_spatial_pool_mode="average", force_sample=True
)
video_kwargs_default.update(kwargs)
self.video_kwargs = video_kwargs_default
overwrite_config = None
if "video" in model_path.lower():
if self.video_kwargs["overwrite"]:
overwrite_config = {}
overwrite_config["mm_spatial_pool_mode"] = self.video_kwargs[
"mm_spatial_pool_mode"
]
rank, world_size = get_rank_and_world_size()
model_name = get_model_name_from_path(model_path)
device_map = self.split_model(model_path)
if device_map is None:
if auto_split_flag():
assert world_size == 1, 'Only support world_size == 1 when AUTO_SPLIT set for non-72B LLaVA-OneVision'
logging.warning('Currently, we only support to split the non-72B model across all GPUs.')
tokenizer, model, image_processor, _ = load_pretrained_model(
model_path,
None,
model_name,
device_map="auto",
overwrite_config=overwrite_config,
)
else:
tokenizer, model, image_processor, _ = load_pretrained_model(
model_path,
None,
model_name,
device_map="cpu",
overwrite_config=overwrite_config,
)
model.cuda()
else:
tokenizer, model, image_processor, _ = load_pretrained_model(
model_path,
None,
model_name,
device_map=device_map,
overwrite_config=overwrite_config,
)
model.eval()
model.tie_weights()
if "llava" in model_path.lower():
conv_mode = "qwen_1_5"
if 'llava-video' in model_path.lower():
self.nframe = 64
else:
self.nframe = 16
if "72b" in model_path.lower():
self.nframe = 32
if "video" in model_path.lower():
self.force_sample = self.video_kwargs["force_sample"]
else:
self.force_sample = False
self.conv_template = conv_mode
self.conv_templates = conv_templates
self.tokenizer = tokenizer
self.model = model
self.image_processor = image_processor
self.tokenizer_image_token = tokenizer_image_token
self.process_images = (
process_images # Store process_images as a class attribute
)
self.KeywordStoppingCriteria = KeywordsStoppingCriteria
self.SeparatorStyle = SeparatorStyle
def generate_inner_image(self, message, dataset=None):
content, images = "", []
image_sizes = [] # Store image sizes
for msg in message:
if msg["type"] == "text":
content += msg["value"]
else:
img = Image.open(msg["value"]).convert("RGB")
images.append(img)
image_sizes.append(img.size) # Store the size of each image
content += self.DEFAULT_IMAGE_TOKEN + "\n"
# Process images using the class attribute self.process_images
image_tensor = self.process_images(
images, self.image_processor, self.model.config
)
image_tensor = [
_image.to(dtype=torch.float16, device="cuda") for _image in image_tensor
]
conv = copy.deepcopy(self.conv_templates[self.conv_template])
conv.append_message(conv.roles[0], content)
conv.append_message(conv.roles[1], None)
prompt_question = conv.get_prompt()
input_ids = self.tokenizer_image_token(
prompt_question, self.tokenizer, self.IMAGE_TOKEN_INDEX, return_tensors="pt"
)
input_ids = input_ids.unsqueeze(0).cuda()
stop_str = conv.sep if conv.sep_style != self.SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = self.KeywordStoppingCriteria(
keywords, self.tokenizer, input_ids
)
# Pass image sizes along with other parameters
cont = self.model.generate(
input_ids,
images=image_tensor,
image_sizes=image_sizes, # Pass the image sizes here
do_sample=False,
temperature=0,
max_new_tokens=512,
stopping_criteria=[stopping_criteria],
)
text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
return text_outputs
def generate_inner_video(self, message, dataset=None):
content, text_content, visual_content, videos = "", "", "", []
for msg in message:
if msg["type"] == "text":
text_content += msg["value"]
else:
videos.append(msg["value"])
visual_content += self.DEFAULT_IMAGE_TOKEN + "\n"
if len(videos) > 1:
raise ValueError(
"LLaVA-OneVision does not support multiple videos as input."
)
video_frames, frame_time, video_time = self.load_video(
videos[0], self.nframe, self.force_sample
)
time_instruction = (
f"The video lasts for {video_time:.2f} seconds, "
f"and {len(video_frames)} frames are uniformly sampled from it. "
f"These frames are located at {frame_time}. "
f"Please answer the following questions related to this video.\n"
)
if self.force_sample:
content = visual_content + time_instruction + text_content
else:
content = visual_content + text_content
image_tensors = []
frames = (
self.image_processor.preprocess(video_frames, return_tensors="pt")[
"pixel_values"
]
.half()
.cuda()
)
image_tensors.append(frames)
conv = copy.deepcopy(self.conv_templates[self.conv_template])
conv.append_message(conv.roles[0], content)
conv.append_message(conv.roles[1], None)
prompt_question = conv.get_prompt()
input_ids = self.tokenizer_image_token(
prompt_question, self.tokenizer, self.IMAGE_TOKEN_INDEX, return_tensors="pt"
)
input_ids = input_ids.unsqueeze(0).cuda()
image_sizes = [frame.size for frame in video_frames]
modalities = ["video"] * len(video_frames)
stop_str = conv.sep if conv.sep_style != self.SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = self.KeywordStoppingCriteria(
keywords, self.tokenizer, input_ids
)
# Pass image sizes along with other parameters
cont = self.model.generate(
input_ids,
images=image_tensors,
image_sizes=image_sizes, # Pass the image sizes here
do_sample=False,
temperature=0,
max_new_tokens=512,
modalities=modalities,
stopping_criteria=[stopping_criteria],
)
text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
return text_outputs
def load_video(self, video_path, max_frames_num, force_sample=False, fps=1):
from decord import VideoReader, cpu
import numpy as np
if max_frames_num == 0:
return np.zeros((1, 336, 336, 3))
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
total_frame_num = len(vr)
video_time = total_frame_num / vr.get_avg_fps()
fps = round(vr.get_avg_fps() / fps)
frame_idx = [i for i in range(0, len(vr), fps)]
frame_time = [i / fps for i in frame_idx]
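# Frames are first sampled at roughly `fps` frames per second; if that yields more than
# max_frames_num frames (or force_sample is set), exactly max_frames_num frames are
# re-sampled uniformly across the clip and the timestamps are recomputed accordingly.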
if len(frame_idx) > max_frames_num or force_sample:
sample_fps = max_frames_num
uniform_sampled_frames = np.linspace(
0, total_frame_num - 1, sample_fps, dtype=int
)
frame_idx = uniform_sampled_frames.tolist()
frame_time = [i / vr.get_avg_fps() for i in frame_idx]
frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
spare_frames = vr.get_batch(frame_idx).asnumpy()
return spare_frames, frame_time, video_time
def generate_inner(self, message, dataset=None):
if DATASET_MODALITY(dataset) == 'VIDEO':
return self.generate_inner_video(message, dataset)
else:
return self.generate_inner_image(message, dataset)
class LLaVA_OneVision_HF(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
VIDEO_LLM = True
DEFAULT_IMAGE_TOKEN = "<image>"
IMAGE_TOKEN_INDEX = -200
def __init__(self, model_path="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", **kwargs):
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
assert model_path is not None, "Model path must be provided."
self.model = LlavaOnevisionForConditionalGeneration.from_pretrained(
model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
).to('cuda')
self.processor = AutoProcessor.from_pretrained(model_path)
self.video_kwargs = kwargs.get("video_kwargs", {})
self.force_sample = self.video_kwargs.get("force_sample", False)
self.nframe = kwargs.get("nframe", 8)
self.fps = 1
self.model_path = model_path
def generate_inner_image(self, message, dataset=None):
content, images = "", []
image_sizes = []
for msg in message:
if msg["type"] == "text":
content += msg["value"]
elif msg["type"] == "image":
img = Image.open(msg["value"]).convert("RGB")
images.append(img)
image_sizes.append(img.size)
content += self.DEFAULT_IMAGE_TOKEN + "\n"
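# NOTE: split('\n', 1)[-1] below drops everything up to the first newline, i.e. the
# '<image>' marker when the image precedes the text; the processor's chat template
# supplies its own image placeholder instead.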
conversation = [
{
"role": "user",
"content": [
{"type": "text", "text": content.split("\n", 1)[-1]},
{"type": "image"},
],
}
]
prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = self.processor(images=images, text=prompt, return_tensors="pt").to('cuda', torch.float16)
output = self.model.generate(**inputs, max_new_tokens=512)
if self.model_path == "NCSOFT/VARCO-VISION-14B-HF":
return self.processor.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
return self.processor.decode(output[0], skip_special_tokens=True)
def generate_inner_video(self, message, dataset=None):
content, text_content, visual_content, videos = "", "", "", []
for msg in message:
if msg["type"] == "text":
text_content += msg["value"]
elif msg["type"] == "video":
videos.append(msg["value"])
visual_content += self.DEFAULT_IMAGE_TOKEN + "\n"
if len(videos) > 1:
raise ValueError("LLaVA-OneVision does not support multiple videos as input.")
video_frames, frame_time, video_time = self.load_video(
videos[0], self.nframe, fps=1, force_sample=self.force_sample
)
time_instruction = (
f"The video lasts for {video_time:.2f} seconds, "
f"and {len(video_frames)} frames are uniformly sampled from it. "
f"These frames are located at {frame_time}. "
f"Please answer the following questions related to this video.\n"
)
content = visual_content + time_instruction + text_content
conversation = [
{
"role": "user",
"content": [{"type": "text", "text": content}, {"type": "video"}],
}
]
prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = self.processor(videos=video_frames, text=prompt, return_tensors="pt").to('cuda', torch.float16)
output = self.model.generate(**inputs, max_new_tokens=512)
return self.processor.decode(output[0], skip_special_tokens=True)
def load_video(self, video_path, max_frames_num, fps=1, force_sample=False):
from decord import VideoReader, cpu
import numpy as np
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
total_frame_num = len(vr)
avg_fps = vr.get_avg_fps()
if avg_fps == 0:
raise ValueError(f"Video '{video_path}' has an average FPS of 0, which is invalid.")
if fps <= 0:
raise ValueError("FPS argument must be greater than 0.")
effective_fps = round(avg_fps / fps)
frame_idx = list(range(0, total_frame_num, effective_fps))
frame_time = [i / avg_fps for i in frame_idx]
if len(frame_idx) > max_frames_num or force_sample:
uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)
frame_idx = uniform_sampled_frames.tolist()
frame_time = [i / avg_fps for i in frame_idx]
frame_time_str = ", ".join([f"{t:.2f}s" for t in frame_time])
video_frames = vr.get_batch(frame_idx).asnumpy()
video_time = total_frame_num / avg_fps
return video_frames, frame_time_str, video_time
def generate_inner(self, message, dataset=None):
if DATASET_MODALITY(dataset) == "VIDEO":
return self.generate_inner_video(message, dataset)
else:
return self.generate_inner_image(message, dataset)
import os
import os.path as osp
import string
import sys
import warnings
import pandas as pd
import torch
from huggingface_hub import snapshot_download
from PIL import Image
from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
CLIPImageProcessor, CLIPVisionModel,
GenerationConfig, StoppingCriteriaList)
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
class LLaVA_XTuner(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self,
llava_path,
llm_path=None,
visual_encoder_path='openai/clip-vit-large-patch14-336',
visual_select_layer=-2,
prompt_template=None,
stop_words=[],
torch_dtype=torch.float16):
try:
from peft import PeftModel
from xtuner.utils import PROMPT_TEMPLATE, StopWordStoppingCriteria
except Exception as err:
logging.critical(
'Please install xtuner with `pip install -U xtuner` before '
'using LLaVA_XTuner')
raise err
if not osp.isdir(llava_path):
cache_path = get_cache_path(llava_path)
if cache_path is not None:
llava_path = cache_path
else:
llava_path = snapshot_download(repo_id=llava_path)
assert osp.exists(llava_path) and osp.isdir(llava_path)
# build llm
if 'llm' in os.listdir(llava_path):
assert llm_path is None, (
"Please don't specify the `llm_path` since the passed "
'`llava_path` already contains an LLM!')
llm_path = osp.join(llava_path, 'llm')
else:
assert llm_path is not None, 'Please specify the `llm_path`!'
llm = AutoModelForCausalLM.from_pretrained(llm_path,
trust_remote_code=True,
torch_dtype=torch_dtype,
device_map='cpu')
tokenizer = AutoTokenizer.from_pretrained(llm_path,
trust_remote_code=True,
encode_special_tokens=True)
print(f'Load LLM from {llm_path}')
# build visual_encoder
if 'visual_encoder' in os.listdir(llava_path):
assert visual_encoder_path is None, (
"Please don't specify the `visual_encoder_path` since the passed "
'`llava_path` already contains a visual encoder!')
visual_encoder_path = osp.join(llava_path, 'visual_encoder')
else:
assert visual_encoder_path is not None, (
'Please specify the `visual_encoder_path`!')
visual_encoder = CLIPVisionModel.from_pretrained(
visual_encoder_path, torch_dtype=torch_dtype, device_map='cpu')
image_processor = CLIPImageProcessor.from_pretrained(
visual_encoder_path)
print(f'Load visual_encoder from {visual_encoder_path}')
# load adapter
if 'llm_adapter' in os.listdir(llava_path):
adapter_path = osp.join(llava_path, 'llm_adapter')
llm = PeftModel.from_pretrained(llm,
adapter_path,
trust_remote_code=True,
device_map='cpu')
print(f'Load LLM adapter from {llava_path}')
if 'visual_encoder_adapter' in os.listdir(llava_path):
adapter_path = osp.join(llava_path, 'visual_encoder_adapter')
visual_encoder = PeftModel.from_pretrained(visual_encoder,
adapter_path,
trust_remote_code=True,
device_map='cpu')
print(f'Load visual_encoder adapter from {llava_path}')
# build projector
projector_path = osp.join(llava_path, 'projector')
projector = AutoModel.from_pretrained(projector_path,
trust_remote_code=True,
torch_dtype=torch_dtype,
device_map='cpu')
print(f'Load projector from {llava_path}')
llm.eval()
visual_encoder.eval()
projector.eval()
self.llm = llm.cuda()
self.tokenizer = tokenizer
self.visual_encoder = visual_encoder.cuda()
self.image_processor = image_processor
self.projector = projector.cuda()
self.visual_select_layer = visual_select_layer
if prompt_template is not None:
# modified prompt template
if prompt_template == 'llama3_chat':
self.prompt_template = dict(
SYSTEM=('<|start_header_id|>system<|end_header_id|>\n\n'
'{system}<|eot_id|>'),
INSTRUCTION=(
'<|start_header_id|>user<|end_header_id|>\n\n{input}<|eot_id|>'
'<|start_header_id|>assistant<|end_header_id|>\n\n'),
SUFFIX='<|eot_id|>',
SUFFIX_AS_EOS=True,
STOP_WORDS=['<|eot_id|>'])
else:
self.prompt_template = PROMPT_TEMPLATE[prompt_template]
stop_words += self.prompt_template.get('STOP_WORDS', [])
else:
self.prompt_template = None
self.stop_criteria = StoppingCriteriaList()
for word in stop_words:
self.stop_criteria.append(
StopWordStoppingCriteria(self.tokenizer, word))
def build_gen_config(self, dataset):
gen_kwargs = dict(max_new_tokens=512,
do_sample=True,
temperature=1,
num_beams=5,
eos_token_id=self.tokenizer.eos_token_id,
pad_token_id=self.tokenizer.pad_token_id
if self.tokenizer.pad_token_id is not None else
self.tokenizer.eos_token_id)
# For single word generation
if (dataset is not None
and DATASET_TYPE(dataset) in ['MCQ', 'Y/N']):
gen_kwargs.update(
dict(max_new_tokens=5, do_sample=False, num_beams=1))
return GenerationConfig(**gen_kwargs)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
question = line['question']
hint = line['hint'] if ('hint' in line
and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
if not cn_string(question):
prompt = question + '\n' + ("Answer with the option's letter "
'from the given choices directly.')
else:
prompt = question + '\n' + '请直接回答选项字母。'
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def generate_inner(self, message, dataset=None):
from xtuner.dataset.utils import expand2square
from xtuner.model.utils import prepare_inputs_labels_for_multimodal
from xtuner.utils import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
prompt = prompt.replace('<image>', '')
image = Image.open(image_path).convert('RGB')
image = expand2square(
image,
tuple(int(x * 255) for x in self.image_processor.image_mean))
image = self.image_processor.preprocess(
image, return_tensors='pt')['pixel_values'][0]
image = image.cuda().unsqueeze(0)
visual_outputs = self.visual_encoder(image, output_hidden_states=True)
pixel_values = self.projector(
visual_outputs.hidden_states[self.visual_select_layer][:, 1:])
inputs = DEFAULT_IMAGE_TOKEN + '\n' + prompt
if self.prompt_template:
inputs = self.prompt_template['INSTRUCTION'].format(input=inputs)
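# Split the prompt at the <image> marker, tokenize each chunk separately (special tokens
# only for the first chunk) and rejoin them with IMAGE_TOKEN_INDEX between chunks as the
# placeholder for the image features.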
chunk_encode = []
for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)):
if idx == 0:
cur_encode = self.tokenizer(chunk)
else:
cur_encode = self.tokenizer(chunk, add_special_tokens=False)
chunk_encode.append(cur_encode)
assert len(chunk_encode) == 2
ids = []
for idx, cur_chunk_encode in enumerate(chunk_encode):
ids.extend(cur_chunk_encode['input_ids'])
if idx != len(chunk_encode) - 1:
ids.append(IMAGE_TOKEN_INDEX)
ids = torch.tensor(ids).cuda().unsqueeze(0)
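# prepare_inputs_labels_for_multimodal() splices the projected visual features
# (pixel_values) into the token-embedding sequence at the IMAGE_TOKEN_INDEX placeholder
# and returns embedding-level inputs consumed by llm.generate() below.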
mm_inputs = prepare_inputs_labels_for_multimodal(
llm=self.llm, input_ids=ids, pixel_values=pixel_values)
gen_config = self.build_gen_config(dataset)
generate_output = self.llm.generate(
**mm_inputs,
generation_config=gen_config,
streamer=None,
bos_token_id=self.tokenizer.bos_token_id,
stopping_criteria=self.stop_criteria)
predict = self.tokenizer.decode(generate_output[0],
skip_special_tokens=True).strip()
return predict
import torch
from PIL import Image
from abc import abstractproperty
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import warnings
class Mantis(BaseModel):
"""
Mantis Model
This implementation is adapted from the Llava model from llava.py and the Idefics model from idefics.py
"""
INSTALL_REQ = True
INTERLEAVE = True
DEFAULT_IMAGE_TOKEN = '<image>'
IMAGE_TOKEN_INDEX = -200
def __init__(self, model_path='TIGER-Lab/Mantis-8B-siglip-llama3', **kwargs):
assert model_path is not None
try:
from mantis.models.mllava import LlavaForConditionalGeneration, MLlavaProcessor
from mantis.models.mfuyu import MFuyuForCausalLM, MFuyuProcessor
from mantis.models.conversation import conv_mllava_v1 as default_conv, conv_templates
except Exception as e:
logging.critical(
"Mantis is not installed. Please install Mantis to use this model.Please use 'pip install "
"git+https://github.com/TIGER-AI-Lab/Mantis.git' to install"
)
raise e
try:
from transformers import AutoModelForVision2Seq, AutoProcessor
except Exception as e:
logging.critical(f'{type(e)}: {e}')
logging.critical("Upgrade transformers to use Mantis's idefics model.\nError: %s" % e)
# inference implementation for attention, can be "sdpa", "eager", "flash_attention_2".
# Seems FA2 is not effective during inference:
# https://discuss.huggingface.co/t/flash-attention-has-no-effect-on-inference/73453/5
# if is_flash_attn_2_available:
# best_fit_attn_implementation = "flash_attention_2"
# flash_attn has a bug that raises "Error: query and key must have the same dtype" during generation
try:
import flash_attn
best_fit_attn_implementation = 'flash_attention_2'
except ImportError:
best_fit_attn_implementation = 'eager'
self.model_path = model_path
attn_implementation = best_fit_attn_implementation
self._is_idefics = 'idefics' in model_path.lower()
# Here load the "non-idefics" Mantis model.
if not self._is_idefics:
if 'fuyu' in model_path.lower():
self.processor = MFuyuProcessor.from_pretrained(self.model_path)
model = MFuyuForCausalLM.from_pretrained(
self.model_path,
device_map='cuda',
attn_implementation=attn_implementation,
torch_dtype=torch.float16
)
else:
self.processor = MLlavaProcessor.from_pretrained(self.model_path)
model = LlavaForConditionalGeneration.from_pretrained(
self.model_path,
device_map='cuda',
attn_implementation=attn_implementation,
torch_dtype=torch.float16
)
else:
self.processor = AutoProcessor.from_pretrained(self.model_path)
model = AutoModelForVision2Seq.from_pretrained(
self.model_path,
device_map='cuda',
torch_dtype=torch.float16
)
model = model.eval()
self.model = model.cuda()
kwargs_default = dict(do_sample=False, temperature=0, max_new_tokens=1024, top_p=None, num_beams=1)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
self.tokenizer = self.processor.tokenizer
self.default_conv = default_conv
self.conv_templates = conv_templates
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += (
'\n请直接回答选项字母。' if cn_string(prompt) else
"\nAnswer with the option's letter from the given choices directly."
)
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
message = [dict(type='image', value=s) for s in tgt_path]
message.append(dict(type='text', value=prompt))
return message
def output_process(self, answer):
if '<s>' in answer:
answer = answer.replace('<s>', '').strip()
if '[/INST]' in answer:
answer = answer.split('[/INST]')[1].strip()
elif 'ASSISTANT:' in answer:
answer = answer.split('ASSISTANT:')[1].strip()
elif 'assistant\n' in answer:
answer = answer.split('assistant\n')[1].strip()
elif '<|end_header_id|>\n\n' in answer:
answer = answer.split('<|end_header_id|>\n\n')[2].strip()
if '</s>' in answer:
answer = answer.split('</s>')[0].strip()
elif '<|im_end|>' in answer:
answer = answer.split('<|im_end|>')[0].strip()
elif '<|eot_id|>' in answer:
answer = answer.split('<|eot_id|>')[0].strip()
elif '<end_of_utterance>' in answer:
answer = answer.split('<end_of_utterance>')[0].strip()
elif '|ENDOFTEXT|' in answer:
answer = answer.split('|ENDOFTEXT|')[0].strip()
return answer
def generate_inner(self, message, dataset=None):
content, images = '', []
ide_content, question = [], ''
for msg in message:
if msg['type'] == 'text':
content += msg['value']
question += msg['value']
else:
images.append(Image.open(msg['value']).convert('RGB'))
content += (self.DEFAULT_IMAGE_TOKEN + '\n')
ide_content.append({'type': 'image'})
if self._is_idefics:
# Follow the idefics implementation:
ide_content.append({'type': 'text', 'text': question})
prompt = [{'role': 'user', 'content': ide_content}]
prompt = self.processor.apply_chat_template(prompt, add_generation_prompt=True)
else:
# Follow the Mantis code base to make sure they are consistent:
# https://github.com/TIGER-AI-Lab/Mantis/blob/main/mantis/models/mllava/utils.py#L33
# Users don't need to define chat template as it is done here
if 'llama-3' in self.model.language_model.name_or_path.lower():
conv = self.conv_templates['llama_3']
terminators = [
self.processor.tokenizer.eos_token_id,
self.processor.tokenizer.convert_tokens_to_ids('<|eot_id|>')
]
else:
conv = self.default_conv
terminators = [self.processor.tokenizer.eos_token_id]
# Using EOT because end of *text* is more accurate for what we're doing than end of *sentence*
if 'eos_token_id' not in self.kwargs:
self.kwargs['eos_token_id'] = terminators
conv = conv.copy()
conv.append_message(conv.roles[0], content)
conv.append_message(conv.roles[1], '')
assert conv.messages[-1][0] == conv.roles[1] and conv.messages[-1][1] == '', 'Format check'
prompt = conv.get_prompt()
inputs = self.processor(prompt, images, return_tensors='pt', truncation=True)
# FIXME: Fuyu model would return a list instead of a pytorch tensor. This weird behavior needs fixing.
if 'image_patches' in inputs.keys():
inputs['image_patches'] = inputs['image_patches'][0]
inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
output = self.model.generate(**inputs, **self.kwargs)
output = output[0]
generated_ids = output[inputs['input_ids'].shape[-1]:]
answer = self.processor.decode(generated_ids, skip_special_tokens=True)
answer = self.output_process(answer)
return answer
import sys
import torch
import os.path as osp
import os
import warnings
from .base import BaseModel
from ..smp import *
from PIL import Image
'''
Please follow the instructions to download ckpt.
https://github.com/dvlab-research/MGM?tab=readme-ov-file#pretrained-weights
'''
class Mini_Gemini(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, model_path, root=None, conv_mode='llava_v1', **kwargs):
if root is None:
warnings.warn('Please set `root` to the Mini_Gemini code directory, '
'which is cloned from "https://github.com/dvlab-research/MGM?tab=readme-ov-file".')
raise ValueError
warnings.warn('Please follow the instructions of Mini_Gemini to put the ckpt file in the right place, '
'which can be found at https://github.com/dvlab-research/MGM?tab=readme-ov-file#structure')
assert model_path == 'YanweiLi/MGM-7B-HD', 'We only support MGM-7B-HD for now'
self.model_path = model_path
sys.path.append(root)
try:
from mgm.model.builder import load_pretrained_model
from mgm.mm_utils import get_model_name_from_path
except Exception as e:
logging.critical(
'Please first install Mini_Gemini and set the root path to use Mini_Gemini, '
'which is cloned from here: "https://github.com/dvlab-research/MGM?tab=readme-ov-file" '
)
raise e
VLMEvalKit_path = os.getcwd()
os.chdir(root)
warnings.warn('Please set `root` to the Mini_Gemini code directory, '
'which is cloned from "https://github.com/dvlab-research/MGM?tab=readme-ov-file".')
model_path = osp.join(root, 'work_dirs', 'MGM', 'MGM-7B-HD')
try:
model_name = get_model_name_from_path(model_path)
except Exception as e:
logging.critical(
'Please follow the instructions of Mini_Gemini to put the ckpt file in the right place, '
'which can be found at https://github.com/dvlab-research/MGM?tab=readme-ov-file#structure'
)
raise e
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name)
os.chdir(VLMEvalKit_path)
self.model = model
self.tokenizer = tokenizer
self.image_processor = image_processor
self.conv_mode = conv_mode
kwargs_default = dict(temperature=float(0), num_beams=1, top_p=None, max_new_tokens=1024, use_cache=True)
kwargs_default.update(kwargs)
do_sample = kwargs_default['temperature'] > 0
kwargs_default.update({'do_sample': do_sample})
self.kwargs = kwargs_default
def generate_inner(self, message, dataset=None):
try:
from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, \
DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from mgm.conversation import conv_templates
from mgm.mm_utils import tokenizer_image_token, process_images
except Exception as e:
logging.critical(
'Please first install Mini_Gemini and set the root path to use Mini_Gemini, '
'which is cloned from here: "https://github.com/dvlab-research/MGM?tab=readme-ov-file" '
)
raise e
prompt, image = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image)
prompt = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt
conv = conv_templates[self.conv_mode].copy()
conv.append_message(conv.roles[0], prompt)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
input_ids = input_ids.unsqueeze(0).cuda()
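# When the config defines `image_size_aux`, the image is processed once at the auxiliary
# (high) resolution; that tensor feeds the auxiliary branch (images_aux), while a
# bilinear-resized copy at the base resolution (image_size_raw * image_grid) feeds the
# regular low-resolution branch.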
if hasattr(self.model.config, 'image_size_aux'):
if not hasattr(self.image_processor, 'image_size_raw'):
self.image_processor.image_size_raw = self.image_processor.crop_size.copy()
self.image_processor.crop_size['height'] = self.model.config.image_size_aux
self.image_processor.crop_size['width'] = self.model.config.image_size_aux
self.image_processor.size['shortest_edge'] = self.model.config.image_size_aux
image_tensor = process_images([image], self.image_processor, self.model.config)[0]
image_grid = getattr(self.model.config, 'image_grid', 1)
if hasattr(self.model.config, 'image_size_aux'):
raw_shape = [
self.image_processor.image_size_raw['height'] * image_grid,
self.image_processor.image_size_raw['width'] * image_grid
]
image_tensor_aux = image_tensor
image_tensor = torch.nn.functional.interpolate(
image_tensor[None],
size=raw_shape,
mode='bilinear',
align_corners=False
)[0]
else:
image_tensor_aux = []
if image_grid >= 2:
raw_image = image_tensor.reshape(
3, image_grid, self.image_processor.image_size_raw['height'],
image_grid, self.image_processor.image_size_raw['width']
)
raw_image = raw_image.permute(1, 3, 0, 2, 4)
raw_image = raw_image.reshape(
-1, 3, self.image_processor.image_size_raw['height'], self.image_processor.image_size_raw['width']
)
if getattr(self.model.config, 'image_global', False):
global_image = image_tensor
if len(global_image.shape) == 3:
global_image = global_image[None]
global_image = torch.nn.functional.interpolate(
global_image,
size=[
self.image_processor.image_size_raw['height'],
self.image_processor.image_size_raw['width']
],
mode='bilinear',
align_corners=False
)
# [image_crops, image_global]
raw_image = torch.cat([raw_image, global_image], dim=0)
image_tensor = raw_image.contiguous()
images = image_tensor[None].to(dtype=self.model.dtype, device='cuda', non_blocking=True)
if len(image_tensor_aux) > 0:
images_aux = image_tensor_aux[None].to(dtype=self.model.dtype, device='cuda', non_blocking=True)
else:
images_aux = None
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
images=images,
images_aux=images_aux,
# no_repeat_ngram_size=3,
bos_token_id=self.tokenizer.bos_token_id, # Begin of sequence token
eos_token_id=self.tokenizer.eos_token_id, # End of sequence token
pad_token_id=self.tokenizer.pad_token_id, # Pad token
**self.kwargs
)
outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
return outputs
import math
import torch
import random
import numpy as np
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE, DATASET_MODALITY
class MiniCPM_V(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='openbmb/MiniCPM-V', **kwargs):
assert model_path is not None
self.model_path = model_path
print(f'load from {self.model_path}')
self.model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
self.model = self.model.to(dtype=torch.bfloat16)
self.model.eval().cuda()
self.kwargs = kwargs
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
torch.cuda.empty_cache()
self.num_beams = 1 if self.model_path == 'openbmb/MiniCPM-V' else 3
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
# For multi-turn datasets we don't have a custom prompt
return False
return True
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'{question}\n'
if len(options):
prompt += options_prompt
prompt = ('Study the image carefully and pick the option associated with the correct answer. '
'Focus solely on selecting the option and avoid including any other content.\n') + prompt
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=p) for p in tgt_path])
return message
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
msgs = [{'role': 'user', 'content': prompt}]
if DATASET_TYPE(dataset) == 'MCQ':
max_new_tokens = 20
elif DATASET_TYPE(dataset) == 'Y/N':
max_new_tokens = 100
else:
max_new_tokens = 1024
default_kwargs = dict(
max_new_tokens=max_new_tokens,
sampling=False,
num_beams=self.num_beams
)
default_kwargs.update(self.kwargs)
res, _, _ = self.model.chat(
image=image,
msgs=msgs,
context=None,
tokenizer=self.tokenizer,
**default_kwargs
)
return res
class MiniCPM_Llama3_V(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='openbmb/MiniCPM-Llama3-V-2_5', **kwargs):
assert model_path is not None
self.model_path = model_path
print(f'load from {self.model_path}')
self.model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
self.model = self.model.to(dtype=torch.float16)
self.model.eval().cuda()
self.kwargs = kwargs
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
torch.cuda.empty_cache()
self.num_beams = 1 if self.model_path == 'openbmb/MiniCPM-V' else 3
self.options_system_prompt = ('Carefully read the following question and select the letter corresponding '
'to the correct answer. Highlight the applicable choices without giving '
'explanations.')
self.wo_options_system_prompt = 'Carefully read the following question. Answer the question directly.'
self.detail_system_prompt = 'Answer this question in detail.'
self.vqa_prompt = 'Answer the question using a single word or phrase.'
def use_custom_prompt(self, dataset):
if listinstr(['MCQ', 'VQA'], DATASET_TYPE(dataset)):
return True
elif dataset is not None and listinstr(['HallusionBench'], dataset):
return True
return False
def build_prompt(self, line, dataset=None):
if isinstance(line, int):
line = self.data.iloc[line]
tgt_path = self.dump_image(line, dataset)
system_prompt = ''
question = line['question']
if DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
system_prompt = self.options_system_prompt + '\nPlease just indicate your choice.'
else:
system_prompt = self.wo_options_system_prompt
if 'MMMU' in dataset: # Corner Case
prompt = system_prompt + '\n' + prompt
system_prompt = ''
elif dataset is not None and listinstr(['HallusionBench'], dataset):
question = line['question'] + ' Yes or No?'
prompt = question
elif dataset is not None and listinstr(['MME'], dataset):
question = line['question'] + ' Yes or No?'
prompt = question
elif dataset is not None and listinstr(['OCRBench'], dataset):
system_prompt = self.vqa_prompt
question = line['question']
prompt = question
elif DATASET_TYPE(dataset) == 'VQA':
if listinstr(['LLaVABench', 'MMLongBench_DOC'], dataset):
system_prompt = ''
prompt = question
elif listinstr(['MMVet'], dataset):
system_prompt = self.detail_system_prompt
prompt = question
else:
system_prompt = self.vqa_prompt
prompt = question
msgs = []
if system_prompt:
msgs.append(dict(type='text', value=system_prompt))
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def generate_inner(self, message, dataset=None):
if DATASET_TYPE(dataset) == 'MCQ':
max_new_tokens = 200
elif DATASET_TYPE(dataset) == 'Y/N':
max_new_tokens = 3
else:
max_new_tokens = 1024
default_kwargs = dict(
max_new_tokens=max_new_tokens,
sampling=False,
num_beams=self.num_beams,
)
default_kwargs.update(self.kwargs)
content = []
for x in message:
if x['type'] == 'text':
content.append(x['value'])
elif x['type'] == 'image':
image = Image.open(x['value']).convert('RGB')
content.append(image)
msgs = [{'role': 'user', 'content': content}]
res = self.model.chat(
msgs=msgs,
context=None,
image=None,
tokenizer=self.tokenizer,
**default_kwargs
)
if isinstance(res, tuple) and len(res) > 0:
res = res[0]
return res
def chat_inner(self, message, dataset=None):
max_new_tokens = 1024
default_kwargs = dict(
max_new_tokens=max_new_tokens,
sampling=False,
num_beams=self.num_beams,
)
default_kwargs.update(self.kwargs)
msgs = []
for msg in message:
content = []
if len(msg['content']) == 1 and msg['content'][0]['type'] == 'text':
msg_new = {'role': msg['role'], 'content': msg['content'][0]['value']}
msgs.append(msg_new)
continue
for x in msg['content']:
if x['type'] == 'text':
content.append(x['value'])
elif x['type'] == 'image':
image = Image.open(x['value']).convert('RGB')
content.append(image)
msg_new = {'role': msg['role'], 'content': content}
msgs.append(msg_new)
res = self.model.chat(
msgs=msgs,
context=None,
image=None,
tokenizer=self.tokenizer,
**default_kwargs)
if isinstance(res, tuple) and len(res) > 0:
res = res[0]
return res
class MiniCPM_V_2_6(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='openbmb/MiniCPM-V', **kwargs):
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)
assert model_path is not None
self.model_path = model_path
print(f'load from path {self.model_path}')
self.model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
self.model = self.model.to(dtype=torch.bfloat16)
self.model.eval().cuda()
self.kwargs = kwargs
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
torch.cuda.empty_cache()
self.num_beams = 1 if self.model_path == 'openbmb/MiniCPM-V' else 3
self.options_suffix_prompt = '''\nAnswer with the option's letter from the given choices directly.'''
self.wo_options_system_prompt = 'Carefully read the following question. Answer the question directly.'
self.detail_system_prompt = 'Answer this question in detail.'
self.vqa_prompt = 'Answer the question using a single word or phrase.'
self.multi_choice_cot_prompt = ('''Carefully read the following multichoice question, solve it step '''
'''by step and finally pick the option associated with the correct '''
'''answer in the format of "Answer: selected option".\n\n''')
self.short_ans_cot_prompt = ('''Read the following question carefully, solve it step by step, and '''
'''then output the final answer in the format of "Answer: single number '''
'''or single word or phrase".\n\n''')
def use_custom_prompt(self, dataset=None):
if dataset is None:
return False
if DATASET_TYPE(dataset) in ['MCQ', 'VQA', 'Y/N']:
return True
return False
def use_cot(self, dataset=None):
if dataset is None:
return False
if listinstr(['MMMU', 'HallusionBench', 'OCRBench', 'ChartQA'], dataset):
return True
elif listinstr(['MathVista', 'MMVet', 'MMBench', 'MMStar', 'AI2D', 'RealWorldQA',
'POPE', 'ScienceQA', 'TextVQA', 'DocVQA'], dataset):
return False
else:
return False
def use_upsize(self, dataset=None):
if dataset is None:
return False
if listinstr(['MMVet', 'MMBench', 'MMStar', 'AI2D', 'OCRBench'], dataset):
return True
else:
return False
def build_prompt(self, line, dataset=None):
if isinstance(line, int):
line = self.data.iloc[line]
tgt_path = self.dump_image(line, dataset)
system_prompt, prompt = '', ''
question = line['question']
if not self.use_cot(dataset):
if DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
prompt += self.options_suffix_prompt
else:
system_prompt = self.wo_options_system_prompt
if 'MMMU' in dataset:
if len(system_prompt) > 0:
prompt = system_prompt + '\n' + prompt
system_prompt = ''
elif dataset is not None and listinstr(['HallusionBench'], dataset):
question += ' Yes or No?'
prompt = question
elif dataset is not None and listinstr(['OCRBench'], dataset):
system_prompt = self.vqa_prompt
prompt = question
elif DATASET_TYPE(dataset) == 'VQA':
if listinstr(['LLaVABench'], dataset):
system_prompt = ''
elif listinstr(['MMVet'], dataset):
system_prompt = self.detail_system_prompt
else:
system_prompt = self.vqa_prompt
prompt = question
else:
prompt = question
else:
has_options = True
if DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'{question}\n'
if len(options):
prompt += options_prompt
else:
has_options = False
if 'MMMU' in dataset:
if len(system_prompt) > 0:
prompt = system_prompt + '\n' + prompt
system_prompt = ''
else:
prompt = question
if DATASET_TYPE(dataset) in ['MCQ', 'Y/N', 'VQA']:
if DATASET_TYPE(dataset) == 'MCQ':
if has_options:
prompt = self.multi_choice_cot_prompt + prompt
else:
prompt = self.short_ans_cot_prompt + prompt
elif DATASET_TYPE(dataset) == 'Y/N':
prompt = self.short_ans_cot_prompt + prompt
else:
prompt = self.short_ans_cot_prompt + prompt
msgs = []
if system_prompt:
msgs.append(dict(type='text', value=system_prompt))
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def generate_inner(self, message, dataset=None):
if DATASET_MODALITY(dataset) == 'VIDEO':
max_slice_nums = 1
use_image_id = False
max_inp_length = 2048 * 10
else:
max_slice_nums = None
use_image_id = True
max_inp_length = 8192
max_new_tokens = 2048
default_kwargs = dict(
max_new_tokens=max_new_tokens,
sampling=False,
num_beams=self.num_beams,
)
default_kwargs.update(self.kwargs)
content = []
for x in message:
if x['type'] == 'text':
content.append(x['value'])
elif x['type'] == 'image':
image = Image.open(x['value']).convert('RGB')
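# For benchmarks flagged by use_upsize(), images smaller than 1344 x 1344 pixels are
# randomly upscaled (keeping the aspect ratio) toward that pixel budget before being
# handed to the model.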
if not self.use_upsize(dataset):
content.append(image)
else:
img_width, img_height = image.width, image.height
if (img_width * img_height) >= (1344 * 1344):
content.append(image)
else:
ratio = math.sqrt((1344 * 1344) / (img_width * img_height))
max_img_width = int(img_width * ratio)
new_img_width = random.randint(img_width, max_img_width)
new_img_height = int(new_img_width / img_width * img_height)
resized_image = image.resize((new_img_width, new_img_height))
content.append(resized_image)
msgs = [{'role': 'user', 'content': content}]
res = self.model.chat(
image=None,
msgs=msgs,
context=None,
tokenizer=self.tokenizer,
max_inp_length=max_inp_length,
use_image_id=use_image_id,
max_slice_nums=max_slice_nums,
**default_kwargs
)
if isinstance(res, tuple) and len(res) > 0:
res = res[0]
return res
import torch
import sys
import os.path as osp
import warnings
from transformers import StoppingCriteriaList
from .base import BaseModel
class MiniGPT4(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self,
mode='v2',
root='/mnt/petrelfs/share_data/duanhaodong/MiniGPT-4/',
temperature=1,
max_out_len=512):
if root is None:
warnings.warn(
'Please set root to the directory of MiniGPT-4, which is cloned from here: '
'https://github.com/Vision-CAIR/MiniGPT-4. '
)
if mode == 'v2':
cfg = 'minigptv2_eval.yaml'
elif mode == 'v1_7b':
cfg = 'minigpt4_7b_eval.yaml'
elif mode == 'v1_13b':
cfg = 'minigpt4_13b_eval.yaml'
else:
raise NotImplementedError
self.mode = mode
self.temperature = temperature
self.max_out_len = max_out_len
self.root = root
this_dir = osp.dirname(__file__)
self.cfg = osp.join(this_dir, 'misc', cfg)
sys.path.append(self.root)
from omegaconf import OmegaConf
from minigpt4.common.registry import registry
from minigpt4.conversation.conversation import StoppingCriteriaSub, CONV_VISION_Vicuna0, CONV_VISION_minigptv2
device = torch.cuda.current_device()
self.device = device
cfg_path = self.cfg
cfg = OmegaConf.load(cfg_path)
model_cfg = cfg.model
model_cfg.device_8bit = device
model_cls = registry.get_model_class(model_cfg.arch)
model = model_cls.from_config(model_cfg)
model = model.to(device)
model.eval()
vis_processor_cfg = cfg.datasets.cc_sbu_align.vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
self.model = model
self.vis_processor = vis_processor
self.CONV_VISION = CONV_VISION_minigptv2 if self.mode == 'v2' else CONV_VISION_Vicuna0
stop_words_ids = [[835], [2277, 29937]]
stop_words_ids = [torch.tensor(ids).to(device) for ids in stop_words_ids]
self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
def generate_inner(self, message, dataset=None):
from minigpt4.conversation.conversation import Chat
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if self.mode == 'v2':
chat = Chat(self.model, self.vis_processor, device=self.device)
else:
chat = Chat(self.model, self.vis_processor, device=self.device, stopping_criteria=self.stopping_criteria)
chat_state = self.CONV_VISION.copy()
img_list = []
_ = chat.upload_img(image_path, chat_state, img_list)
chat.encode_img(img_list)
chat.ask(prompt, chat_state)
with torch.inference_mode():
msg = chat.answer(conv=chat_state, img_list=img_list)[0]
return msg
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModel, CLIPImageProcessor
import warnings
from PIL import Image
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import pandas as pd
import string
import torch.distributed as dist
import torchvision.transforms as T
import transformers
from torchvision.transforms.functional import InterpolationMode
import re
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
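# Illustrative note: applying the transform returned above to any PIL image yields a float tensor
# of shape (3, input_size, input_size), resized with bicubic interpolation and normalized with the
# ImageNet mean/std defined at the top of this file.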
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
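# Worked example (illustrative): for an 800x600 image (aspect ratio ~1.33), image_size=448 and
# candidate grids [(1, 1), (1, 2), (2, 1), (2, 2)], both (1, 1) and (2, 2) have ratio 1.0 and the
# same difference of ~0.33; the tie-break prefers (2, 2) because 800*600 exceeds half the area of
# a 2x2 grid of 448x448 tiles, so the image is split more finely.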
def dynamic_preprocess(image, min_num=5, max_num=6, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
    # enumerate candidate tiling grids (i x j) with min_num <= i * j <= max_num
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images, target_aspect_ratio
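# Worked example (illustrative): a 1344x896 image with image_size=448, min_num=5, max_num=6 and
# use_thumbnail=True selects the (3, 2) grid (six 448x448 tiles covering a 1344x896 resize) and
# appends one 448x448 thumbnail, so the function returns 7 crops together with the (3, 2) ratio.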
def dynamic_preprocess2(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False, prior_aspect_ratio=None):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
    # enumerate candidate tiling grids (i x j) with min_num <= i * j <= max_num
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
new_target_ratios = []
if prior_aspect_ratio is not None:
for i in target_ratios:
if prior_aspect_ratio[0] % i[0] != 0 or prior_aspect_ratio[1] % i[1] != 0:
new_target_ratios.append(i)
else:
continue
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, new_target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
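# Note (descriptive): unlike dynamic_preprocess, this variant filters out grids whose row and
# column counts both divide the prior grid, so the second pass crops the image at a scale that is
# complementary to the first pass rather than a strict subdivision of it.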
def load_image(image_file, input_size=448, min_num=1, max_num=6):
image = Image.open(image_file).convert('RGB')
transform = build_transform(input_size=input_size)
images, target_aspect_ratio = dynamic_preprocess(
image, image_size=input_size, use_thumbnail=True, min_num=min_num, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values, target_aspect_ratio
def load_image2(image_file, input_size=448, target_aspect_ratio=(1, 1), min_num=1, max_num=6):
image = Image.open(image_file).convert('RGB')
transform = build_transform(input_size=input_size)
images = dynamic_preprocess2(
image,
image_size=input_size,
prior_aspect_ratio=target_aspect_ratio,
use_thumbnail=True,
min_num=min_num,
max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
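# Illustrative usage sketch (not executed here; 'demo.jpg' is a hypothetical path): the two loaders
# are intended to be combined, as generate_v2 below does, e.g.
#   pv1, ar = load_image('demo.jpg', min_num=4, max_num=12)
#   pv2 = load_image2('demo.jpg', target_aspect_ratio=ar, min_num=3, max_num=7)
#   pixel_values = torch.cat((pv1[:-1], pv2[:-1], pv1[-1:]), 0)
# which interleaves the coarse and fine crops and keeps the single thumbnail as the last tile.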
# Build a manual device_map that shards InternVL2-Llama3-76B (and similarly sized checkpoints) across GPUs
def split_model(model_name):
import math
device_map = {}
num_gpus = torch.cuda.device_count()
rank, world_size = get_rank_and_world_size()
num_gpus = num_gpus // world_size
num_layers = {'InternVL2-8B': 32, 'InternVL2-26B': 48,
'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]
# Since the first GPU will be used for ViT, treat it as 0.8 GPU.
num_layers_per_gpu = math.ceil(num_layers / (num_gpus - 0.2))
num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.8)
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'language_model.model.layers.{layer_cnt}'] = rank + world_size * i
layer_cnt += 1
device_map['vision_model'] = rank
device_map['mlp1'] = rank
device_map['language_model.model.tok_embeddings'] = rank
device_map['language_model.model.embed_tokens'] = rank
device_map['language_model.output'] = rank
device_map['language_model.model.norm'] = rank
device_map['language_model.lm_head'] = rank
device_map[f'language_model.model.layers.{num_layers - 1}'] = rank
return device_map
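# Illustrative trace (assuming 8 visible GPUs, world_size=1, rank=0) for 'InternVL2-Llama3-76B':
# 80 layers over 7.8 "effective" GPUs gives ceil(80 / 7.8) = 11 layers per GPU, reduced to
# ceil(11 * 0.8) = 9 on GPU 0; GPU 0 therefore hosts layers 0-8 plus the vision tower, mlp1,
# embeddings, norm, output head and (via the final override) the last decoder layer, while the
# remaining layers are spread across GPUs 1-7.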
class MiniMonkey(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='mx262/MiniMokney', load_in_8bit=False, **kwargs):
assert model_path is not None
assert version_cmp(transformers.__version__, '4.36.2', 'ge')
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
# Regular expression to match the pattern 'Image' followed by a number, e.g. Image1
self.pattern = r'Image(\d+)'
# Replacement pattern to insert a hyphen between 'Image' and the number, e.g. Image-1
self.replacement = r'Image-\1'
        # Convert the model's response back to the dataset format,
        # e.g. Image-1 -> Image1
# Regular expression to match the pattern 'Image-' followed by a number
self.reverse_pattern = r'Image-(\d+)'
# Replacement pattern to remove the hyphen (Image-1 -> Image1)
self.reverse_replacement = r'Image\1'
if listinstr(['InternVL2-Llama3-76B'], model_path):
device_map = split_model(model_path.split('/')[-1])
self.model = AutoModel.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
load_in_8bit=load_in_8bit,
trust_remote_code=True,
low_cpu_mem_usage=True,
device_map=device_map).eval()
else:
device = torch.cuda.current_device()
self.device = device
self.model = AutoModel.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
trust_remote_code=True,
load_in_8bit=load_in_8bit).eval()
if not load_in_8bit:
self.model = self.model.to(device)
self.image_size = self.model.config.vision_config.image_size
self.kwargs = kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
def use_custom_prompt(self, dataset):
if dataset is None:
return False
if listinstr(['MMDU'], dataset):
            # Multi-turn datasets such as MMDU do not use a custom prompt
return False
else:
return True
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += '\n请直接回答选项字母。' if cn_string(
prompt) else "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
return prompt
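    # Illustrative example: for a line with question 'What is shown?', no hint, and options
    # A='cat', B='dog', build_multi_choice_prompt returns
    #   "What is shown?\nA. cat\nB. dog\nAnswer with the option's letter from the given choices directly."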
def build_video_prompt(self, prompt, dataset=None, max_nframe=64):
for start in range(0, max_nframe, 8):
images_to_remove = ''.join([f'<image-{i}>' for i in range(start + 1, start + 9)])
prompt = prompt.replace(images_to_remove, '')
for i in range(max_nframe):
prompt = prompt.replace(f'<image-{i + 1}>', f'Frame{i + 1}')
if listinstr(['MMBench-Video'], dataset):
prompt = prompt.replace('\nAnswer:', '')
prompt += '\nAnswer the question using a single word or phrase.'
elif listinstr(['Video-MME'], dataset):
prompt = prompt.replace('\nAnswer:', '')
prompt += "\nAnswer with the option's letter from the given choices directly."
return prompt
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
        # reset the generation config to evaluation defaults on every prompt build
        kwargs_default = dict(do_sample=False, max_new_tokens=512, top_p=None, num_beams=1)
        self.kwargs = kwargs_default
if dataset is not None and listinstr(['MME'], dataset):
question = line['question']
prompt = question + ' Answer the question using a single word or phrase.'
elif dataset is not None and listinstr(['HallusionBench'], dataset):
question = line['question']
prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
if listinstr(['MathVista', 'MathVision'], dataset):
prompt = line['question']
elif listinstr(['LLaVABench'], dataset):
question = line['question']
prompt = question + '\nAnswer this question in detail.'
elif listinstr(['MMVet'], dataset):
prompt = line['question']
else:
question = line['question']
prompt = question + '\nAnswer the question using a single word or phrase.'
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
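    # Illustrative example: for an MME line with one dumped image, build_prompt returns
    #   [{'type': 'text', 'value': '<question> Answer the question using a single word or phrase.'},
    #    {'type': 'image', 'value': '<path to the dumped image>'}]
    # i.e. the text entry comes first, followed by one entry per image path.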
def set_max_num(self, dataset):
if dataset is None:
self.max_num = 12
self.max_num2 = 7
self.min_num = 4
self.min_num2 = 3
return
if dataset is not None and listinstr(['ChartQA_TEST'], dataset):
self.max_num = 12
self.max_num2 = 3
elif dataset is not None and listinstr(['DocVQA_VAL', 'DocVQA_TEST', 'TextVQA_VAL'], dataset):
self.max_num = 23
self.max_num2 = 15
self.min_num = 14
self.min_num2 = 5
elif dataset is not None and listinstr(['InfoVQA_VAL', 'InfoVQA_TEST', 'SEEDBench_IMG'], dataset):
self.max_num = 23
self.max_num2 = 5
self.min_num = 15
self.min_num2 = 3
elif dataset is not None and listinstr(['OCRBench', 'POPE'], dataset):
self.max_num = 24
self.max_num2 = 8
self.min_num = 9
self.min_num2 = 5
elif dataset is not None and listinstr(['HallusionBench'], dataset):
self.max_num = 11
self.max_num2 = 6
self.min_num = 4
self.min_num2 = 2
elif dataset is not None and listinstr(['MME'], dataset):
self.max_num = 11
self.max_num2 = 6
self.min_num = 5
self.min_num2 = 2
elif dataset is not None and listinstr(['AI2D_TEST'], dataset):
self.max_num = 12
self.max_num2 = 6
self.min_num = 5
self.min_num2 = 2
elif dataset is not None and listinstr(['CCBench'], dataset):
self.max_num = 24
self.max_num2 = 8
self.min_num = 9
self.min_num2 = 4
elif dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset):
self.max_num = 12
self.max_num2 = 7
self.min_num = 5
self.min_num2 = 3
else:
self.max_num = 12
self.max_num2 = 7
self.min_num = 4
self.min_num2 = 3
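    # Note (descriptive): max_num/min_num bound the coarse tiling grid used by load_image, while
    # max_num2/min_num2 bound the complementary finer grid used by load_image2; the per-dataset
    # values above appear to trade crop resolution against visual sequence length.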
def generate_v2(self, message, dataset=None):
image_num = len([x for x in message if x['type'] == 'image'])
if image_num == 1:
prompt = '<image>\n' + '\n'.join([x['value'] for x in message if x['type'] == 'text'])
else:
prompt, image_idx = '', 1
for x in message:
if x['type'] == 'text':
prompt += x['value']
elif x['type'] == 'image':
prompt += f'<image-{image_idx}>'
image_idx += 1
prompt = ' '.join([f'<image-{i + 1}>: <image>' for i in range(image_num)]) + '\n' + prompt
if dataset is not None and listinstr(['Video'], dataset):
prompt = self.build_video_prompt(prompt, dataset)
if image_num > 1:
image_path = [x['value'] for x in message if x['type'] == 'image']
num_patches_list = []
pixel_values_list = []
for image_idx, file_name in enumerate(image_path):
curr_pixel_values, target_aspect_ratio = load_image(
file_name, min_num=self.min_num, max_num=self.max_num)
curr_pixel_values = curr_pixel_values.cuda().to(torch.bfloat16)
curr_pixel_values2 = load_image2(
file_name, target_aspect_ratio=target_aspect_ratio, min_num=self.min_num2, max_num=self.max_num2)
curr_pixel_values2 = curr_pixel_values2.cuda().to(torch.bfloat16)
curr_pixel_values = torch.cat(
(curr_pixel_values[:-1], curr_pixel_values2[:-1], curr_pixel_values[-1:]), 0)
num_patches_list.append(curr_pixel_values.size(0))
pixel_values_list.append(curr_pixel_values)
pixel_values = torch.cat(pixel_values_list, dim=0)
elif image_num == 1:
image_path = [x['value'] for x in message if x['type'] == 'image'][0]
pixel_values, target_aspect_ratio = load_image(image_path, min_num=self.min_num, max_num=self.max_num)
pixel_values = pixel_values.cuda().to(torch.bfloat16)
pixel_values2 = load_image2(
image_path, target_aspect_ratio=target_aspect_ratio, min_num=self.min_num2, max_num=self.max_num2)
pixel_values2 = pixel_values2.cuda().to(torch.bfloat16)
pixel_values = torch.cat((pixel_values[:-1], pixel_values2[:-1], pixel_values[-1:]), 0)
num_patches_list = [pixel_values.size(0)]
else:
pixel_values = None
num_patches_list = []
with torch.no_grad():
response = self.model.chat(
self.tokenizer,
pixel_values=pixel_values,
target_aspect_ratio=(1, 1),
num_patches_list=num_patches_list,
question=prompt,
generation_config=self.kwargs,
verbose=False
)
return response
def generate_inner(self, message, dataset=None):
self.set_max_num(dataset)
return self.generate_v2(message, dataset)
def build_history(self, message):
        # accumulators shared with the nested helper below (closed over via `nonlocal`)
image_path = []
image_cnt = 0
def concat_tilist(tilist):
nonlocal image_cnt # Declare image_cnt as nonlocal to modify it
prompt = ''
for item in tilist:
# Substitute the pattern in the text
if item['type'] == 'text':
prompt += re.sub(self.pattern, self.replacement, item['value'])
elif item['type'] == 'image':
image_cnt += 1
prompt += '<image>\n'
image_path.append(item['value'])
return prompt
        # `message` holds only previous turns, so it must consist of (user, assistant) pairs
        assert len(message) % 2 == 0
history = []
for i in range(len(message) // 2):
m1, m2 = message[2 * i], message[2 * i + 1]
assert m1['role'] == 'user' and m2['role'] == 'assistant'
history.append((concat_tilist(m1['content']), concat_tilist(m2['content'])))
return history, image_path, image_cnt
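    # Illustrative example: build_history on one prior user/assistant pair
    #   [{'role': 'user', 'content': [{'type': 'image', 'value': 'a.jpg'}, {'type': 'text', 'value': 'Describe Image1'}]},
    #    {'role': 'assistant', 'content': [{'type': 'text', 'value': 'A cat.'}]}]
    # returns history=[('<image>\nDescribe Image-1', 'A cat.')], image_path=['a.jpg'], image_cnt=1.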
def chat_inner_v2(self, message, dataset=None):
image_cnt = 0
if len(message) > 1:
history, image_path, image_cnt = self.build_history(message[:-1])
else:
history, image_path, image_cnt = None, [], 1
current_msg = message[-1]
question = ''
        # The current turn contains only text
        if len(current_msg['content']) == 1 and current_msg['content'][0]['type'] == 'text':
            question = current_msg['content'][0]['value']
            # rewrite ImageN references to the Image-N form that InternVL expects
            question = re.sub(self.pattern, self.replacement, question)
else:
for msg in current_msg['content']:
if msg['type'] == 'text':
question += re.sub(self.pattern, self.replacement, msg['value'])
elif msg['type'] == 'image':
image_cnt += 1
question += '<image>\n'
image_path.append(msg['value'])
if image_cnt > 1:
num_patches_list = []
pixel_values_list = []
for image_idx, file_name in enumerate(image_path):
curr_pixel_values, target_aspect_ratio = load_image(
file_name, min_num=self.min_num, max_num=self.max_num)
curr_pixel_values = curr_pixel_values.cuda().to(torch.bfloat16)
curr_pixel_values2 = load_image2(
file_name, target_aspect_ratio=target_aspect_ratio, min_num=self.min_num2, max_num=self.max_num2)
curr_pixel_values2 = curr_pixel_values2.cuda().to(torch.bfloat16)
curr_pixel_values = torch.cat(
(curr_pixel_values[:-1], curr_pixel_values2[:-1], curr_pixel_values[-1:]), 0)
num_patches_list.append(curr_pixel_values.size(0))
pixel_values_list.append(curr_pixel_values)
pixel_values = torch.cat(pixel_values_list, dim=0)
        elif image_cnt == 1:
            # image_path is a list; unpack its single element before loading
            pixel_values, target_aspect_ratio = load_image(
                image_path[0], min_num=self.min_num, max_num=self.max_num)
            pixel_values = pixel_values.cuda().to(torch.bfloat16)
            pixel_values2 = load_image2(
                image_path[0], target_aspect_ratio=target_aspect_ratio,
                min_num=self.min_num2, max_num=self.max_num2)
            pixel_values2 = pixel_values2.cuda().to(torch.bfloat16)
            pixel_values = torch.cat((pixel_values[:-1], pixel_values2[:-1], pixel_values[-1:]), 0)
            num_patches_list = [pixel_values.size(0)]
        else:
            pixel_values = None
            num_patches_list = []
            # no image in the conversation: fall back to a unit aspect ratio so the
            # chat call below still receives a defined value
            target_aspect_ratio = (1, 1)
response, history = self.model.chat(
self.tokenizer,
pixel_values=pixel_values,
target_aspect_ratio=target_aspect_ratio,
num_patches_list=num_patches_list,
question=question,
generation_config=self.kwargs,
history=history,
return_history=True
)
response = re.sub(self.reverse_pattern, self.reverse_replacement, response)
return response
def chat_inner(self, message, dataset=None):
self.set_max_num(dataset)
kwargs_default = dict(do_sample=False, max_new_tokens=512, top_p=None, num_beams=1)
self.kwargs = kwargs_default
return self.chat_inner_v2(message, dataset)
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: instruct_vicuna13b
load_finetuned: False
load_pretrained: True
pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth"
finetuned: ""
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
# Q-Former
num_query_token: 32
# path to Vicuna checkpoint
llm_model: "Please set the path to your vicuna-13b-v1.1"
# generation configs
prompt: ""
preprocess:
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: instruct_vicuna7b
load_finetuned: False
load_pretrained: True
pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth"
finetuned: ""
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
# Q-Former
num_query_token: 32
# path to Vicuna checkpoint
llm_model: "Please set the path to your vicuna-7b-v1.1"
# generation configs
prompt: ""
preprocess:
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
model:
arch: minigpt4
model_type: pretrain_vicuna_7b
max_txt_len: 160
end_sym: "###"
low_resource: True
prompt_template: '###Human: {} ###Assistant: '
ckpt: "please set this value to the path of pretrained checkpoint"
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
freeze_qformer: True
# Q-Former
num_query_token: 32
# generation configs
prompt: ""
llama_model: "please set this value to the path of vicuna-13b-v0"
datasets:
cc_sbu_align:
vis_processor:
train:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
run:
task: image_text_pretrain