Commit bc5ebf0f authored by luopl

Initial commit

import torch
from transformers import AutoTokenizer, AutoConfig, AutoModel, CLIPImageProcessor
import warnings
from PIL import Image
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE, DATASET_MODALITY
import pandas as pd
import string
import torch.distributed as dist
import torchvision.transforms as T
import transformers
from torchvision.transforms.functional import InterpolationMode
import re
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
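# Added note: find_closest_aspect_ratio below picks the candidate tiling grid whose aspect
# ratio is nearest to the input image's; ties are broken in favour of grids with more tiles
# whenever the original image covers more than half of that grid's pixel budget.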
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def dynamic_preprocess(image, min_num=1, max_num=4, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# enumerate candidate tiling grids (columns x rows) whose tile count lies in [min_num, max_num]
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
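# Worked example (added for illustration, not part of the original code): a 1024x512 image
# with image_size=448 and max_num=4 has aspect ratio 2.0, so the closest grid is (2, 1).
# The image is resized to 896x448 and split into two 448x448 tiles; with use_thumbnail=True
# a 448x448 thumbnail of the whole image is appended, giving 3 crops in total.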
def load_image(image_file, input_size=448, max_num=6, upscale=False):
image = Image.open(image_file).convert('RGB')
if upscale:
image = image.resize((image.width * 2, image.height * 2), Image.BILINEAR)
transform = build_transform(input_size=input_size)
images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
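# Usage sketch (added; 'example.jpg' is a placeholder path): load_image stacks the transformed
# crops into a float tensor of shape (num_tiles, 3, input_size, input_size). For the 1024x512
# example above it would return a tensor of shape (3, 3, 448, 448).
#
#   pixel_values = load_image('example.jpg', input_size=448, max_num=6)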
class VinternChat(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='5CD-AI/Vintern-3B-beta', load_in_8bit=False, **kwargs):
assert model_path is not None
assert version_cmp(transformers.__version__, '4.36.2', 'ge')
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
# Regular expression to match the pattern 'Image' followed by a number, e.g. Image1
self.pattern = r'Image(\d+)'
# Replacement pattern to insert a hyphen between 'Image' and the number, e.g. Image-1
self.replacement = r'Image-\1'
# Convert InternVL2 response to dataset format
# e.g. Image1 -> Image-1
# Regular expression to match the pattern 'Image-' followed by a number
self.reverse_pattern = r'Image-(\d+)'
# Replacement pattern to remove the hyphen (Image-1 -> Image1)
self.reverse_replacement = r'Image\1'
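# Illustration (added): re.sub(self.pattern, self.replacement, 'Image1 and Image2')
# returns 'Image-1 and Image-2'; the reverse pattern/replacement pair maps the model's
# 'Image-1' style output back to the dataset's 'Image1' style.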
device = torch.cuda.current_device()
self.device = device
self.model = AutoModel.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
trust_remote_code=True,
load_in_8bit=load_in_8bit).eval()
if not load_in_8bit:
self.model = self.model.to(device)
self.image_size = self.model.config.vision_config.image_size
kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=3)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Following kwargs received: {self.kwargs}, will be used as the generation config.')
def use_custom_prompt(self, dataset):
if dataset is None:
return False
if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
# For multi-turn datasets we don't have a custom prompt
return False
if DATASET_MODALITY(dataset) == 'VIDEO':
# For video benchmarks we don't have a custom prompt here
return False
else:
return True
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += '\n请直接回答选项字母。' if cn_string(
prompt) else "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
return prompt
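# Example of the final MCQ prompt (added, with illustrative values):
#   "What is shown in the image?\nA. a cat\nB. a dog\nAnswer with the option's letter from the given choices directly."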
def build_video_prompt(self, prompt, dataset=None, max_frames=64):
for start in range(0, max_frames, 8):
images_to_remove = ''.join([f'<Image-{i}>' for i in range(start + 1, start + 9)])
prompt = prompt.replace(images_to_remove, '')
for i in range(max_frames):
prompt = prompt.replace(f'Image-{i + 1}', f'Frame-{i + 1}')
if listinstr(['MMBench-Video'], dataset):
prompt = prompt.replace('\nAnswer:', '')
elif listinstr(['Video-MME'], dataset):
prompt = prompt.replace('\nAnswer:', '')
prompt += "\nAnswer with the option's letter from the given choices directly."
elif listinstr(['MVBench'], dataset):
prompt = prompt.replace('Best option:(', '')
return prompt
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=3)
if listinstr(['MTVQA'], dataset):
kwargs_default["max_new_tokens"] = 256
if listinstr(['MMMU_DEV_VAL','MMMU_TEST'], dataset):
kwargs_default["num_beams"] = 1
self.kwargs = kwargs_default
if dataset is not None and DATASET_TYPE(dataset) == 'Y/N':
question = line['question']
if listinstr(['MME'], dataset):
prompt = question + ' Answer the question using a single word or phrase.'
elif listinstr(['HallusionBench'], dataset):
prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
else:
prompt = line['question']
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
question = line['question']
if listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet', 'MathVerse'], dataset):
prompt = question
elif listinstr(['LLaVABench'], dataset):
prompt = question + '\nAnswer this question in detail.'
else:
prompt = question + '\nAnswer the question using a single word or phrase.'
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def set_max_num(self, dataset):
if dataset is None:
self.max_num = 1
return
# res_1_datasets = ['MMBench-Video', 'Video-MME', 'MVBench', 'Video']
res_12_datasets = ['ChartQA_TEST', 'MMMU_DEV_VAL', 'MMMU_TEST', 'MME-RealWorld',
'MME-RealWorld-CN', 'VCR_EN', 'VCR_ZH']
res_18_datasets = ['DocVQA_VAL', 'DocVQA_TEST']
res_24_datasets = ['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench', 'HRBench4K', 'HRBench8K']
if DATASET_MODALITY(dataset) == 'VIDEO':
self.max_num = 1
elif listinstr(res_12_datasets, dataset):
self.max_num = 6 # 12
elif listinstr(res_18_datasets, dataset):
self.max_num = 6 # 18
elif listinstr(res_24_datasets, dataset):
self.max_num = 6 # 24
elif listinstr(["MME"], dataset):
self.max_num = 6 # 24
else:
self.max_num = 6 # 6
def generate_v2(self, message, dataset=None):
image_num = len([x for x in message if x['type'] == 'image'])
if image_num == 1:
prompt = '<image>\n' + '\n'.join([x['value'] for x in message if x['type'] == 'text'])
else:
prompt, image_idx = '', 1
for x in message:
if x['type'] == 'text':
prompt += x['value']
elif x['type'] == 'image':
prompt += f'<Image-{image_idx}>'
image_idx += 1
prompt = '\n'.join([f'Image-{i + 1}: <image>' for i in range(image_num)]) + '\n' + prompt
if dataset is not None and DATASET_MODALITY(dataset) == 'VIDEO':
prompt = self.build_video_prompt(prompt, dataset)
if image_num > 1:
image_path = [x['value'] for x in message if x['type'] == 'image']
num_patches_list = []
pixel_values_list = []
for image_idx, file_name in enumerate(image_path):
upscale_flag = image_idx == 0 and dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset)
curr_pixel_values = load_image(
file_name, max_num=self.max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
num_patches_list.append(curr_pixel_values.size(0))
pixel_values_list.append(curr_pixel_values)
pixel_values = torch.cat(pixel_values_list, dim=0)
elif image_num == 1:
image_path = [x['value'] for x in message if x['type'] == 'image'][0]
upscale_flag = dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset)
pixel_values = load_image(
image_path, max_num=self.max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
num_patches_list = [pixel_values.size(0)]
else:
pixel_values = None
num_patches_list = []
with torch.no_grad():
response = self.model.chat(
self.tokenizer,
pixel_values=pixel_values,
num_patches_list=num_patches_list,
question=prompt,
generation_config=self.kwargs,
verbose=False
)
return response
def generate_inner(self, message, dataset=None):
self.set_max_num(dataset)
return self.generate_v2(message, dataset)
def build_history(self, message):
# Global Variables
image_path = []
image_cnt = 0
def concat_tilist(tilist):
nonlocal image_cnt # Declare image_cnt as nonlocal to modify it
prompt = ''
for item in tilist:
# Substitute the pattern in the text
if item['type'] == 'text':
prompt += re.sub(self.pattern, self.replacement, item['value'])
elif item['type'] == 'image':
image_cnt += 1
prompt += '<image>\n'
image_path.append(item['value'])
return prompt
# Only previous messages
assert len(message) % 2 == 0
history = []
for i in range(len(message) // 2):
m1, m2 = message[2 * i], message[2 * i + 1]
assert m1['role'] == 'user' and m2['role'] == 'assistant'
history.append((concat_tilist(m1['content']), concat_tilist(m2['content'])))
return history, image_path, image_cnt
def chat_inner_v2(self, message, dataset=None):
image_cnt = 0
if len(message) > 1:
history, image_path, image_cnt = self.build_history(message[:-1])
else:
history, image_path, image_cnt = None, [], 1
current_msg = message[-1]
question = ''
# If message is just text in the conversation
if len(current_msg['content']) == 1 and current_msg['content'][0]['type'] == 'text':
question = current_msg['content'][0]['value']
question = re.sub(self.pattern, self.replacement, question) # Fix pattern as per InternVL
else:
for msg in current_msg['content']:
if msg['type'] == 'text':
question += re.sub(self.pattern, self.replacement, msg['value'])
elif msg['type'] == 'image':
image_cnt += 1
question += '<image>\n'
image_path.append(msg['value'])
if image_cnt > 1:
num_patches_list = []
pixel_values_list = []
for image_idx, file_name in enumerate(image_path):
upscale_flag = image_idx == 0 and dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset)
curr_pixel_values = load_image(
file_name, max_num=1, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
num_patches_list.append(curr_pixel_values.size(0))
pixel_values_list.append(curr_pixel_values)
pixel_values = torch.cat(pixel_values_list, dim=0)
elif image_cnt == 1:
upscale_flag = dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset)
# image_path holds a single entry here; index it before loading
pixel_values = load_image(
image_path[0], max_num=self.max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
num_patches_list = [pixel_values.size(0)]
else:
pixel_values = None
num_patches_list = []
response, history = self.model.chat(
self.tokenizer,
pixel_values=pixel_values,
num_patches_list=num_patches_list,
question=question,
generation_config=self.kwargs,
history=history,
return_history=True
)
response = re.sub(self.reverse_pattern, self.reverse_replacement, response)
return response
def chat_inner(self, message, dataset=None):
self.set_max_num(dataset)
kwargs_default = dict(do_sample=False, max_new_tokens=512, top_p=None, num_beams=3)
self.kwargs = kwargs_default
return self.chat_inner_v2(message, dataset)
import warnings
from .base import BaseModel
from ..smp import *
class VisualGLM(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='THUDM/visualglm-6b', **kwargs):
try:
import sat
except Exception as err:
logging.critical('Please install SwissArmyTransformer to use VisualGLM')
raise err
assert model_path is not None
self.model_path = model_path
from transformers import AutoModel
from transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()
self.model = model
self.kwargs = kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will be used as the generation config.')
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
output, _ = self.model.chat(
image_path=image_path,
tokenizer=self.tokenizer,
query=prompt,
history=[],
**self.kwargs
)
return output
import torch
import sys
import os.path as osp
import warnings
from .base import BaseModel
from transformers import StoppingCriteriaList
from PIL import Image
from huggingface_hub import snapshot_download
from vlmeval.smp import *
model_cfgs = {
'XVERSE-V-13B': {
'arch': 'vxverse',
'model_type': 'pretrain_xverse13b-chat',
'max_txt_len': 512,
'end_sym': '<|endoftext|>',
'low_resource': False,
'prompt_template': 'Human: {}\nAssistant: ',
'ckpt': 'xverse/XVERSE-V-13B',
'lora_r': 128,
'lora_alpha': 256,
'lora_dropout': 0.05,
'lora_target_modules': 'all_linear',
'has_qformer': False,
'n_proj_layers': 2,
'vit_model': 'openai/clip-vit-large-patch14',
'vit_path': 'openai/clip-vit-large-patch14',
'image_size': 224,
'drop_path_rate': 0,
'vit_precision': 'fp16',
'llama_model': 'xverse/XVERSE-13B-Chat',
}
}
class VXVERSE(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, model_name='XVERSE-V-13B', root=None, **kwargs):
from omegaconf import OmegaConf
if root is None:
warnings.warn('Please set root to the directory of vxverse.')
if model_name == 'XVERSE-V-13B':
cfg = model_cfgs['XVERSE-V-13B']
else:
raise NotImplementedError
ckpt_dir = cfg['ckpt']
if not osp.isdir(ckpt_dir):
cache_path = get_cache_path(ckpt_dir)
if cache_path is not None:
ckpt_dir = cache_path
else:
ckpt_dir = snapshot_download(repo_id=ckpt_dir)
assert osp.exists(ckpt_dir) and osp.isdir(ckpt_dir)
ckpt = osp.join(ckpt_dir, 'adapter_and_lora.bin')
cfg['ckpt'] = ckpt
model_cfg = OmegaConf.create(cfg)
self.model_name = model_name
self.root = root
sys.path.append(self.root)
from vxverse.common.registry import registry
from vxverse.conversation.conversation import CONV_VISION_XVERSE
device = torch.cuda.current_device()
self.device = device
model_cls = registry.get_model_class(model_cfg.arch)
model = model_cls.from_config(model_cfg)
model = model.to(device)
model.eval()
vis_processor_cfg = OmegaConf.create(dict(name='hd_image_train', image_size=224))
vis_processor = registry.get_processor_class(
vis_processor_cfg.name
).from_config(vis_processor_cfg)
self.model = model
self.vis_processor = vis_processor
self.vis_processor_cfg = vis_processor_cfg
self.CONV_VISION = CONV_VISION_XVERSE
self.CONV_VISION.system = ''
stop_words_ids = [[835], [2277, 29937]]
self.stop_words_ids = stop_words_ids
default_kwargs = dict(max_new_tokens=512)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
if self.vis_processor_cfg.name == 'hd_image_train':
patches_per_image = [[image.shape[0]]]
image = [image]
else:
patches_per_image = None
image = image.unsqueeze(0)
chat_state = self.CONV_VISION.copy()
texts = self.prepare_texts([prompt], chat_state)
texts = [text.lstrip() for text in texts]
answers = self.model.generate(
image,
texts,
patches_per_images=patches_per_image,
do_sample=False,
stop_words_ids=self.stop_words_ids,
**self.kwargs
)
return answers[0]
def prepare_texts(self, texts, conv_temp):
convs = [conv_temp.copy() for _ in range(len(texts))]
[
conv.append_message(conv.roles[0], '<ImageHere>\n{}'.format(text))
for conv, text in zip(convs, texts)
]
[conv.append_message(conv.roles[1], None) for conv in convs]
texts = [conv.get_prompt() for conv in convs]
return texts
import torch
from PIL import Image
import sys
from ..smp import *
from .base import BaseModel
from ..dataset import DATASET_TYPE
from transformers import AutoModel, GenerationConfig
class WeMM(BaseModel):
def __init__(self, model_path='feipengma/WeMM', **kwargs):
self.wemm = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True)
self.wemm.cuda()
self.wemm.eval()
torch.cuda.empty_cache()
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += (
'\n请直接回答选项字母。' if cn_string(prompt) else
"\nAnswer with the option's letter from the given choices directly."
)
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=p) for p in tgt_path])
return message
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if dataset == 'HallusionBench':
prompt = prompt + ' Please answer yes or no. Answer the question using a single word or phrase.'
gen_config = None
if dataset == 'MMVet':
gen_config = GenerationConfig(
max_new_tokens=512,
do_sample=True,
temperature=0.7,
num_beams=3,
eos_token_id=self.wemm.tokenizer.eos_token_id,
pad_token_id=self.wemm.tokenizer.pad_token_id
if self.wemm.tokenizer.pad_token_id is not None else self.wemm.tokenizer.eos_token_id,
)
pred = self.wemm.mm_generate(image_path, prompt, gen_config)
return pred
from .sharecaptioner import ShareCaptioner
from .xcomposer import XComposer
from .xcomposer2 import XComposer2
from .xcomposer2_4KHD import XComposer2_4KHD
from .xcomposer2d5 import XComposer2d5
__all__ = ['ShareCaptioner', 'XComposer', 'XComposer2', 'XComposer2_4KHD', 'XComposer2d5']
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
class ShareCaptioner(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='Lin-Chen/ShareCaptioner', **kwargs):
assert model_path is not None
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(
model_path, device_map='cuda', trust_remote_code=True).eval()
self.model.tokenizer = tokenizer
self.model.cuda()
self.model.half()
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
option_candidate = string.ascii_uppercase
options = {
cand: line[cand]
for cand in option_candidate
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if not cn_string(prompt):
prompt = prompt + '\n' + "Answer with the option's letter from the given choices directly."
else:
prompt = prompt + '\n' + '请直接回答选项字母。'
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
seg1 = '<|User|>:'
seg2 = f'{prompt}{self.model.eoh}\n<|Bot|>:'
self.seg_emb1 = self.model.encode_text(seg1, add_special_tokens=True)
self.seg_emb2 = self.model.encode_text(seg2, add_special_tokens=False)
image = Image.open(image_path).convert('RGB')
image = self.model.vis_processor(image).unsqueeze(0)
image = image.to(self.model.device)
tmp_bs = image.shape[0]
tmp_seg_emb1 = self.seg_emb1.repeat(tmp_bs, 1, 1)
tmp_seg_emb2 = self.seg_emb2.repeat(tmp_bs, 1, 1)
with torch.cuda.amp.autocast():
with torch.no_grad():
image = self.model.encode_img(image)
input_emb = torch.cat(
[tmp_seg_emb1, image, tmp_seg_emb2], dim=1)
out_embeds = self.model.internlm_model.generate(
inputs_embeds=input_emb,
max_length=500,
num_beams=3,
min_length=1,
do_sample=True,
repetition_penalty=1.5,
length_penalty=1.0,
temperature=1.,
eos_token_id=self.model.tokenizer.eos_token_id,
num_return_sequences=1)
for j, out in enumerate(out_embeds):
out[out == -1] = 2
response = self.model.decode_text([out])
return response
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import StoppingCriteria, StoppingCriteriaList
from PIL import Image
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
class StoppingCriteriaSub(StoppingCriteria):
def __init__(self, stops=[], encounters=1):
super().__init__()
self.stops = stops
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
for stop in self.stops:
if torch.all((stop == input_ids[0][-len(stop):])).item():
return True
return False
class XComposer(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='internlm/internlm-xcomposer-vl-7b', **kwargs):
assert model_path is not None
self.model_path = model_path
model = AutoModel.from_pretrained(self.model_path, device_map='cpu', trust_remote_code=True).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
model.tokenizer = tokenizer
self.model = model
self.device = self.model.internlm_model.model.embed_tokens.weight.device
self.eoh = '<TOKENS_UNUSED_0>'
self.eoa = '<TOKENS_UNUSED_1>'
stop_words_ids = [
torch.tensor([103027]).to(self.device), # end of human
torch.tensor([103028]).to(self.device), # end of bot
]
default_kwargs = {
'max_new_tokens': 512, 'num_beams': 5, 'do_sample': False,
'min_length': 1, 'repetition_penalty': 1.5, 'length_penalty': 1.0
}
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
def generate_inner(self, message, dataset=None):
if len(message) == 2:
if message[0]['type'] == 'text' and message[1]['type'] == 'image':
message = [message[1], message[0]]
kwargs = cp.deepcopy(self.kwargs)
if dataset is not None:
if DATASET_TYPE(dataset) == 'MCQ':
kwargs['max_new_tokens'] = 5
kwargs['num_beams'] = 5
with torch.cuda.amp.autocast():
with torch.no_grad():
prompt_embs = self.message_to_prompt_embs(message, dataset)
outputs = self.model.internlm_model.generate(
inputs_embeds=prompt_embs,
stopping_criteria=self.stopping_criteria,
**kwargs
)
output_token = outputs[0]
if output_token[0] == 0:
output_token = output_token[1:]
if output_token[0] == 1:
output_token = output_token[1:]
output_text = self.model.tokenizer.decode(output_token, add_special_tokens=False)
output_text = output_text.split(self.model.eoa)[0]
output_text = output_text.split('<|Bot|>')[-1].strip()
return output_text
def message_to_prompt_embs(self, message, dataset=None):
assert isinstance(message, list)
img_embeds = []
prompt_full = '<|User|>: '
for msg in message:
if msg['type'] == 'text':
prompt_full += msg['value']
elif msg['type'] == 'image':
image = Image.open(msg['value']).convert('RGB')
image = self.model.vis_processor(image).unsqueeze(0).to(self.device)
img_embeds.append(self.model.encode_img(image))
prompt_full += '<ImageHere>'
prompt_full += self.model.eoh + ' <|Bot|>: '
if dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
prompt_full += 'Answer: The answer is '
elif dataset is not None and DATASET_TYPE(dataset) in ['VQA', 'QA', 'Y/N']:
prompt_full += 'Answer: '
prompt_segs = prompt_full.split('<ImageHere>')
assert len(prompt_segs) == len(img_embeds) + 1
prompt_seg_tokens = [
self.model.tokenizer(seg, return_tensors='pt', add_special_tokens=(i == 0)).to(self.device).input_ids.long()
for i, seg in enumerate(prompt_segs)
]
prompt_seg_embs = [self.model.internlm_model.model.embed_tokens(seg) for seg in prompt_seg_tokens]
all_embeddings = []
for i in range(len(img_embeds)):
all_embeddings.extend([prompt_seg_embs[i], img_embeds[i]])
all_embeddings.append(prompt_seg_embs[-1])
prompt_embs = torch.cat(all_embeddings, dim=1)
return prompt_embs
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
context = 'N/A' if hint is None else hint
mid_prompt = 'Context: ' + context + '\nQuestion: ' + question
if len(options_prompt):
mid_prompt += '\nOptions: ' + options_prompt
if len(options):
txt_prompt = 'Please answer this question by choosing the correct choice.'
else:
txt_prompt = 'Please answer this question directly. '
prompt = txt_prompt + mid_prompt
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
import torch
import torchvision
from transformers import AutoModel, AutoTokenizer
from PIL import Image
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
import re
pattern = re.compile(r'[A-Z]')
def __padding__(image):
width, height = image.size
tar = max(width, height)
top_padding = int((tar - height) / 2)
bottom_padding = tar - height - top_padding
left_padding = int((tar - width) / 2)
right_padding = tar - width - left_padding
image = torchvision.transforms.functional.pad(image, [left_padding, top_padding, right_padding, bottom_padding])
return image
meta_instruction = """
You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).
- InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language model that is developed by
Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language
chosen by the user such as English and 中文.
- InternLM-XComposer (浦语·灵笔) is capable of comprehending and articulating responses effectively
based on the provided image.
"""
def model_gen(model, text, images, need_bos=True, padding=False, beams=3, max_token=500):
pt1 = 0
embeds = []
im_mask = []
images = [images]
images_loc = [0]
for i, pts in enumerate(images_loc + [len(text)]):
subtext = text[pt1:pts]
if need_bos or len(subtext) > 0:
text_embeds = model.encode_text(subtext, add_special_tokens=need_bos)
embeds.append(text_embeds)
im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
need_bos = False
if i < len(images):
try:
image = Image.open(images[i]).convert('RGB')
except:
image = images[i].convert('RGB')
if padding:
image = __padding__(image)
image = model.vis_processor(image).unsqueeze(0).cuda()
image_embeds = model.encode_img(image)
embeds.append(image_embeds)
im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
pt1 = pts
embeds = torch.cat(embeds, dim=1)
im_mask = torch.cat(im_mask, dim=1)
im_mask = im_mask.bool()
outputs = model.generate(
inputs_embeds=embeds,
im_mask=im_mask,
temperature=1.0,
max_new_tokens=max_token,
num_beams=beams,
do_sample=False,
repetition_penalty=1.0)
output_token = outputs[0]
if output_token[0] == 0 or output_token[0] == 1:
output_token = output_token[1:]
output_text = model.tokenizer.decode(output_token, add_special_tokens=False)
output_text = output_text.split('[UNUSED_TOKEN_145]')[0].strip()
return output_text
class XComposer2(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='internlm/internlm-xcomposer2-vl-7b', **kwargs):
assert model_path is not None
self.model_path = model_path
model = AutoModel.from_pretrained(self.model_path, device_map='cpu', trust_remote_code=True).cuda().eval()
model.half()
tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
model.tokenizer = tokenizer
self.model = model
self.device = self.model.model.tok_embeddings.weight.device
def generate_mme(self, image_path, text):
text = text.split('Please answer')[0].strip()
text = f'{text} Answer this question briefly'
text = f'[UNUSED_TOKEN_146]user\n{text}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
return model_gen(self.model, text, image_path, need_bos=True, padding=True, beams=5)
def generate_multichoice(self, image_path, text, dataset):
out = model_gen(self.model, text, image_path, need_bos=True, padding=False, beams=5, max_token=5)
if 'mmmu' in dataset.lower():
return out
res = pattern.findall(out)
if len(res) == 0:
print('Error:', out)
res = 'Z'
return res[0]
def generate_vqa(self, image_path, text):
out = model_gen(self.model, text, image_path, need_bos=True)
return out
def generate_vanilla(self, image_path, text):
text = (
'[UNUSED_TOKEN_146]system\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]user\n{}'
'Answer this question in detail.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
).format(meta_instruction, text)
out = model_gen(self.model, text, image_path, need_bos=True, max_token=500)
return out
def generate_brief(self, image_path, text):
text = (
'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{}'
'[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
).format(text)
out = model_gen(self.model, text, image_path, need_bos=True, max_token=10)
return out
def generate_driectly(self, image_path, text):
text = '[UNUSED_TOKEN_146]user\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'.format(text)
out = model_gen(self.model, text, image_path, need_bos=True, max_token=500)
return out
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
with torch.cuda.amp.autocast():
if dataset is None:
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if dataset == 'MME':
return self.generate_mme(image_path, prompt)
elif listinstr(['hallu'], dataset.lower()):
return self.generate_brief(image_path, prompt)
elif listinstr(['llava'], dataset.lower()):
return self.generate_vanilla(image_path, prompt)
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
return self.generate_multichoice(image_path, prompt, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
return self.generate_vqa(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'VQA':
return True
return False
def build_mcqa(self, line):
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
img_prompt = '[UNUSED_TOKEN_146]user\n'
if len(options):
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item} '
options_prompt = options_prompt.strip()
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
context = 'N/A' if hint is None else hint
mid_prompt = 'Question: ' + question + '\nContext: ' + context + '\nOptions: ' + options_prompt
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\nThe answer is'
prompt = img_prompt + mid_prompt + ans_prompt
else:
mid_prompt = f'Answer the question using a single word or phrase.{question}'
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
prompt = img_prompt + mid_prompt + ans_prompt
return prompt
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_mcqa(line)
elif DATASET_TYPE(dataset) == 'VQA':
if 'mathvista' in dataset.lower():
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
else:
q = line['question']
prompt = (
f'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{q}'
'[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
)
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
import numpy as np
import torchvision.transforms as transforms
import re
pattern = re.compile(r'[A-Z]')
def padding_336(b):
width, height = b.size
tar = int(np.ceil(height / 336) * 336)
top_padding = int((tar - height) / 2)
bottom_padding = tar - height - top_padding
left_padding = 0
right_padding = 0
b = transforms.functional.pad(b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255, 255, 255])
return b
def HD_transform(img, im_num=16):
width, height = img.size
trans = False
if width < height:
img = img.transpose(Image.TRANSPOSE)
trans = True
width, height = img.size
ratio = (width / height)
scale = 1
while scale * np.ceil(scale / ratio) <= im_num:
scale += 1
scale -= 1
new_w = int(scale * 336)
new_h = int(new_w / ratio)
img = transforms.functional.resize(img, [new_h, new_w],)
img = padding_336(img)
width, height = img.size
assert width * height <= im_num * 336 * 336
if trans:
img = img.transpose(Image.TRANSPOSE)
return img
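# Walk-through (added for illustration): with im_num=16, a 1280x720 image has ratio ~1.78,
# so the largest scale with scale * ceil(scale / ratio) <= 16 is 5. The image is resized to
# 1680x945 and padded to 1680x1008, i.e. a 5x3 grid of 336-pixel patches (portrait images
# are transposed first and transposed back at the end).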
meta_instruction = """You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).
- InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language model that is developed\
by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language chosen by\
the user such as English and 中文.
- InternLM-XComposer (浦语·灵笔) is capable of comprehending and articulating responses\
effectively based on the provided image."""
def model_gen(model, text, images, need_bos=True, padding=False, beams=3, max_token=500):
pt1 = 0
embeds = []
im_mask = []
images = [images]
images_loc = [0]
for i, pts in enumerate(images_loc + [len(text)]):
subtext = text[pt1:pts]
if need_bos or len(subtext) > 0:
text_embeds = model.encode_text(subtext, add_special_tokens=need_bos)
embeds.append(text_embeds)
im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
need_bos = False
if i < len(images):
try:
image = Image.open(images[i]).convert('RGB')
except:
image = images[i].convert('RGB')
image = HD_transform(image, im_num=model.hd_num)
image = model.vis_processor(image).unsqueeze(0).cuda()
image_embeds = model.encode_img(image)
embeds.append(image_embeds)
im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
pt1 = pts
embeds = torch.cat(embeds, dim=1)
im_mask = torch.cat(im_mask, dim=1)
im_mask = im_mask.bool()
outputs = model.generate(inputs_embeds=embeds, im_mask=im_mask,
temperature=1.0, max_new_tokens=max_token, num_beams=beams,
do_sample=False, repetition_penalty=1.0)
output_token = outputs[0]
if output_token[0] == 0 or output_token[0] == 1:
output_token = output_token[1:]
output_text = model.tokenizer.decode(output_token, add_special_tokens=False)
output_text = output_text.split('[UNUSED_TOKEN_145]')[0].strip()
return output_text
class XComposer2_4KHD(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='internlm/internlm-xcomposer2-4khd-7b', **kwargs):
assert model_path is not None
self.model_path = model_path
model = AutoModel.from_pretrained(self.model_path, device_map='cpu', trust_remote_code=True).cuda().eval()
model.half()
tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
model.tokenizer = tokenizer
self.model = model
self.device = self.model.model.tok_embeddings.weight.device
self.model.hd_num = 25
def generate_mme(self, image_path, text):
text = text.split('Please answer')[0].strip()
text = f'{text} Answer this question briefly'
text = f'[UNUSED_TOKEN_146]user\n{text}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
return model_gen(self.model, text, image_path, need_bos=True, padding=True, beams=5)
def generate_multichoice(self, image_path, text, dataset):
out = model_gen(self.model, text, image_path, need_bos=True, padding=False, beams=5, max_token=5)
if 'mmmu' in dataset.lower():
return out
res = pattern.findall(out)
if len(res) == 0:
print('Error:', out)
res = 'Z'
return res[0]
def generate_vqa(self, image_path, text):
out = model_gen(self.model, text, image_path, need_bos=True, max_token=100)
return out
def generate_vanilla(self, image_path, text):
out = model_gen(self.model, text, image_path, need_bos=True, max_token=500)
return out
def generate_brief(self, image_path, text):
text = '[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{}\
[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'.format(text)
out = model_gen(self.model, text, image_path, need_bos=True, max_token=10)
return out
def generate(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if listinstr(['docvqa_test', 'infovqa_test'], dataset.lower()):
self.model.hd_num = 65
elif listinstr(['docvqa_val', 'infovqa_val', 'ocrbench'], dataset.lower()):
self.model.hd_num = 55
elif listinstr(['mmlongbench_doc'], dataset.lower()):
self.model.hd_num = 45
elif listinstr(['mmmu', 'mmbench', 'mmvet'], dataset.lower()):
self.model.hd_num = 16
else:
self.model.hd_num = 25
with torch.cuda.amp.autocast():
if dataset is None:
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if dataset == 'MME':
return self.generate_mme(image_path, prompt)
elif listinstr(['hallu'], dataset.lower()):
return self.generate_brief(image_path, prompt)
elif listinstr(['llava', 'mmvet'], dataset.lower()):
return self.generate_vanilla(image_path, prompt)
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
return self.generate_multichoice(image_path, prompt, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
return self.generate_vqa(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'VQA':
return True
return False
def build_mcqa(self, line):
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
img_prompt = '[UNUSED_TOKEN_146]user\n'
if len(options):
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item} '
options_prompt = options_prompt.strip()
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
context = 'N/A' if hint is None else hint
mid_prompt = 'Question: ' + question + '\nContext: ' + context + '\nOptions: ' + options_prompt
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\nThe answer is'
prompt = img_prompt + mid_prompt + ans_prompt
else:
mid_prompt = f'Answer the question using a single word or phrase.{question}'
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
prompt = img_prompt + mid_prompt + ans_prompt
return prompt
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_mcqa(line)
elif DATASET_TYPE(dataset) == 'VQA':
if 'mathvista' in dataset.lower():
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
elif listinstr(['llava', 'mmvet'], dataset.lower()):
q = line['question']
prompt = '[UNUSED_TOKEN_146]system\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]user\n{}\
Answer this question in detail.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]\
assistant\n'.format(meta_instruction, q)
elif listinstr(['mmlongbench_doc'], dataset.lower()):
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
else:
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.\
{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
ret = [dict(type='text', value=prompt)]
ret.extend([dict(type='image', value=s) for s in tgt_path])
return ret
import re
import numpy as np
import torch
import torchvision.transforms as transforms
from PIL import Image, ImageDraw, ImageFont
from transformers import AutoModel, AutoTokenizer
from ...dataset import DATASET_TYPE
from ...smp import *
from ..base import BaseModel
pattern = re.compile(r'[A-Z]')
conv_pattern = '\\[UNUSED_TOKEN_146\\]user\\\n|\\[UNUSED_TOKEN_146\\]assistant\\\n|\\[UNUSED_TOKEN_145\\]'
def get_font():
try:
truetype_url = "http://opencompass.openxlab.space/utils/Fonts/SimHei.ttf"
ff = urlopen(truetype_url)
# ff = '/fs-computility/mllm/shared/dongxiaoyi/share_data/SimHei.ttf'
font = ImageFont.truetype(ff, size=40)
except Exception as e:
logging.warning(f'{type(e)}: {e}')
logging.warning("Fail to download the font. Use the default one.")
font = ImageFont.load_default(size=40)
return font
def padding_560(b):
width, height = b.size
tar = int(np.ceil(height / 560) * 560)
top_padding = int((tar - height) / 2)
bottom_padding = tar - height - top_padding
left_padding = 0
right_padding = 0
b = transforms.functional.pad(
b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255, 255, 255])
return b
def Identity_transform(img, hd_num=25):
width, height = img.size
trans = False
if width < height:
img = img.transpose(Image.TRANSPOSE)
trans = True
width, height = img.size
ratio = (width / height)
scale = 1
new_h = int(scale * 560)
new_w = int(new_h * ratio)
# print (new_h, new_w)
img = transforms.functional.resize(img, [new_h, new_w],)
img = img.transpose(Image.TRANSPOSE)
img = padding_560(img)
width, height = img.size
if not trans:
img = img.transpose(Image.TRANSPOSE)
return img
def HD_transform(img, im_num=36, id_scale=1.5):
width, height = img.size
trans = False
if width < height:
img = img.transpose(Image.TRANSPOSE)
trans = True
width, height = img.size
ratio = (width / height)
scale = 1
while scale * np.ceil(scale / ratio) <= im_num:
scale += 1
scale -= 1
scale = min(np.ceil(width * id_scale / 560), scale)
new_w = int(scale * 560)
new_h = int(new_w / ratio)
img = transforms.functional.resize(img, [new_h, new_w],)
img = padding_560(img)
width, height = img.size
assert width * height <= im_num * 560 * 560
if trans:
img = img.transpose(Image.TRANSPOSE)
return img
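# Added note: img_process below stitches a list of video frames into one canvas. Each frame
# is resized so its longer side is 1120 px, the frames are stacked vertically (landscape
# input) or horizontally (portrait input), and every frame is labelled '<IMAGE idx>' with a
# separator line, so a whole clip can be passed to the model as a single image.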
def img_process(imgs):
new_imgs = []
for img in imgs:
w, h = img.size
scale = w / h
if w > h:
new_w = 560 * 2
new_h = int(560 * 2 / scale)
else:
new_w = int(560 * 2 * scale)
new_h = 560 * 2
img = transforms.functional.resize(img, [new_h, new_w],)
new_imgs.append(img)
imgs = new_imgs
new_w = 0
new_h = 0
pad = 40
if w > h:
for im in imgs:
w,h = im.size
new_w = max(new_w, w)
new_h += h + 10 + pad
font = get_font()
new_img = Image.new('RGB', (new_w, new_h), 'white')
draw = ImageDraw.Draw(new_img)
curr_h = 0
for idx, im in enumerate(imgs):
w,h = im.size
new_img.paste(im, (0, pad + curr_h))
draw.text((0, curr_h), f'<IMAGE {idx}>', font=font, fill='black')
if idx + 1 < len(imgs):
draw.line([(0, pad + curr_h + h + 5), (new_w, pad + curr_h + h + 5)], fill='black', width=2)
curr_h += h + 10 + pad
# print (new_w, new_h)
else:
for im in imgs:
w,h = im.size
new_w += w + 10
new_h = max(new_h, h)
new_h += pad
font = get_font()
new_img = Image.new('RGB', (new_w, new_h), 'white')
draw = ImageDraw.Draw(new_img)
curr_w = 0
for idx, im in enumerate(imgs):
w,h = im.size
new_img.paste(im, (curr_w, pad))
draw.text((curr_w, 0), f'<IMAGE {idx}>', font=font, fill='black')
if idx + 1 < len(imgs):
draw.line([(curr_w + w + 5, 0), (curr_w + w + 5, new_h)], fill='black', width=2)
curr_w += w + 10
return new_img
meta_instruction = """You are an AI assistant whose name is InternLM (书生·浦语).\n" + "- InternLM (书生·浦语) \
is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室).
It is designed to be helpful, honest, and harmless.\n"+"- InternLM (书生·浦语) \
can understand and communicate fluently in the language chosen by the user such as English and 中文."""
def model_gen(model, text, images, need_bos=True, padding=False, beams=3, max_token=500, video_input=False):
embeds = []
im_mask = []
# print(text)
im_idx = 0
sub_q = text.split('<IM_POS>')
add_im = len(sub_q) - 1
for subtext in sub_q:
if need_bos or len(subtext) > 0:
text_embeds = model.encode_text(
subtext, add_special_tokens=need_bos)
embeds.append(text_embeds)
im_mask.append(torch.zeros(text_embeds.shape[:2]).to(model.device))
need_bos = False
if im_idx < len(images) and add_im:
image = images[im_idx]
if video_input:
image = Identity_transform(image)
else:
if len(images) > 1:
image = HD_transform(image, im_num=model.hd_num // len(images), id_scale=model.id_scale)
else:
image = HD_transform(
image, im_num=model.hd_num, id_scale=model.id_scale)
# print(image.size)
image = model.vis_processor(image).unsqueeze(0).to(model.device)
image_embeds = model.encode_img(image)
im_idx += 1
add_im -= 1
embeds.append(image_embeds)
im_mask.append(torch.ones(
image_embeds.shape[:2], dtype=torch.long).to(model.device))
embeds = torch.cat(embeds, dim=1)
im_mask = torch.cat(im_mask, dim=1)
im_mask = im_mask.bool()
outputs = model.generate(inputs_embeds=embeds, im_mask=im_mask,
temperature=1.0, max_new_tokens=max_token, num_beams=beams,
do_sample=False, repetition_penalty=1.0)
output_token = outputs[0]
if output_token[0] == 0 or output_token[0] == 1:
output_token = output_token[1:]
output_text = model.tokenizer.decode(output_token, add_special_tokens=False)
output_text = output_text.split('[UNUSED_TOKEN_145]')[0].strip().split('<|im_end|>')[0].strip().split('The answer is')[-1].strip() # noqa
# print(output_text)
return output_text
class XComposer2d5(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='internlm/internlm-xcomposer2d5-7b', id_scale=1.5, beam=3, **kwargs):
assert model_path is not None
self.model_path = model_path
self.id_scale = id_scale
self.beam = beam
model = AutoModel.from_pretrained(
self.model_path, device_map='cpu', trust_remote_code=True, local_files_only=True).cuda().eval()
model.half()
tokenizer = AutoTokenizer.from_pretrained(
self.model_path, trust_remote_code=True)
model.tokenizer = tokenizer
self.model = model
self.device = self.model.model.tok_embeddings.weight.device
self.model.hd_num = 36
self.model.id_scale = self.id_scale
def message_to_promptimg(self, message, dataset=None, video_input=False):
num_images = len([x for x in message if x['type'] == 'image'])
if num_images == 0:
prompt = '\n'.join([x['value']
for x in message if x['type'] == 'text'])
image = None
else:
image = [Image.open(x['value']).convert('RGB') for x in message if x['type'] == 'image']
if video_input:
im_prompt = '<IM_POS>Here are some frames of a video.'
if len(image) > 64:
step = len(image) / 64
image = [image[int(i * step)] for i in range(64)]
image = [img_process(image)]
else:
if len(image) > 1:
im_prompt = ' '.join([
f'Image{im_idx+1}: <IM_POS>;' for im_idx in range(len(image))])
else:
im_prompt = '<IM_POS>'
prompt = ''
for x in message:
if x['type'] == 'text' and x.get('role', '') != 'system':
prompt += x['value']
sp = [i for i in re.split(conv_pattern, prompt) if i != '' and i != '\n']
assert len(sp) <= 2
q = sp[0]
prompt = f'[UNUSED_TOKEN_146]user\n{im_prompt}{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
for idx in range(10):
idx = chr(65 + idx)
prompt = prompt.replace(f'({idx})', f'{idx}.')
return prompt, image
def generate_mme(self, image_path, text):
text = text.split('Please answer')[0].strip()
text = f'{text} Answer this question briefly'
text = f'[UNUSED_TOKEN_146]user\n{text}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
return model_gen(self.model, text, image_path, need_bos=True, padding=True, beams=self.beam)
def generate_multichoice(self, image_path, text, dataset):
out = model_gen(self.model, text, image_path,
need_bos=True, padding=False, beams=self.beam, max_token=5)
if 'mmmu' in dataset.lower():
return out
res = pattern.findall(out)
if len(res) == 0:
print('Error:', out)
res = 'Z'
return res[0]
def generate_vqa(self, image_path, text):
out = model_gen(self.model, text, image_path, beams=self.beam,
need_bos=True, max_token=100)
return out
def generate_vanilla(self, image_path, text):
out = model_gen(self.model, text, image_path, beams=self.beam,
need_bos=True, max_token=500)
return out
def generate_brief(self, image_path, text):
text = '[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{}\
[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'.format(text)
out = model_gen(self.model, text, image_path, beams=self.beam,
need_bos=True, max_token=10)
return out
def generate_video(self, image_path, text):
out = model_gen(
self.model, text, image_path, beams=1, # self.beam,
need_bos=True, max_token=100, video_input=True)
return out
def set_max_num(self, dataset):
if dataset is not None and listinstr(['MME-RealWorld', 'MME-RealWorld-CN'], dataset):
self.model.hd_num = 25
def generate_inner(self, message, dataset=None):
self.set_max_num(dataset)
with torch.cuda.amp.autocast():
if dataset is None:
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if listinstr(['video', 'mvbench'], dataset.lower()):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset, video_input=True)
return self.generate_video(image_path, prompt)
else:
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if dataset == 'MME':
return self.generate_mme(image_path, prompt)
elif listinstr(['hallu', 'pope'], dataset.lower()):
return self.generate_brief(image_path, prompt)
elif listinstr(['llava', 'mmvet'], dataset.lower()):
return self.generate_vanilla(image_path, prompt)
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
return self.generate_multichoice(image_path, prompt, dataset)
elif listinstr(['MME-RealWorld', 'MME-RealWorld-CN'], dataset):
return self.generate_multichoice(image_path, prompt, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
return self.generate_vqa(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'VQA':
return True
return False
def build_mcqa(self, line):
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
img_prompt = '[UNUSED_TOKEN_146]user\n'
if len(options):
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item} '
options_prompt = options_prompt.strip()
hint = line['hint'] if (
'hint' in line and not pd.isna(line['hint'])) else None
context = 'N/A' if hint is None else hint
mid_prompt = 'Question: ' + question + '\nContext: ' + \
context + '\nOptions: ' + options_prompt
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\nThe answer is'
prompt = img_prompt + mid_prompt + ans_prompt
else:
mid_prompt = f'Answer the question using a single word or phrase.{question}'
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
prompt = img_prompt + mid_prompt + ans_prompt
return prompt
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_mcqa(line)
elif DATASET_TYPE(dataset) == 'VQA':
if 'mathvista' in dataset.lower():
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
elif listinstr(['llava', 'mmvet'], dataset.lower()):
q = line['question']
prompt = '[UNUSED_TOKEN_146]system\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]user\n{}\
Answer this question in detail.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]\
assistant\n'.format(meta_instruction, q)
elif listinstr(['mmlongbench_doc', 'dude', 'slidevqa'], dataset.lower()):
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
else:
q = line['question']
prefix = 'Answer the question using a single word or phrase.'
prompt = f'[UNUSED_TOKEN_146]user\n{prefix}{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
ret = [dict(type='text', value=prompt)]
ret.extend([dict(type='image', value=s) for s in tgt_path])
return ret
from PIL import Image
import torch
from .base import BaseModel
from ..smp import *
class XGenMM(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5', **kwargs):
try:
from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor
except Exception as err:
logging.critical('Please install the latest version of transformers.')
raise err
model = AutoModelForVision2Seq.from_pretrained(
model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto'
).eval()
tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True, use_fast=False, legacy=False
)
tokenizer = model.update_special_tokens(tokenizer)
tokenizer.eos_token = '<|end|>'
tokenizer.padding_side = 'left'
image_processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code=True)
self.model = model
self.image_processor = image_processor
self.tokenizer = tokenizer
self.kwargs = kwargs
def apply_prompt_template(self, query):
s = (
'<|system|>\nA chat between a curious user and an artificial intelligence assistant. '
"The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n"
f'<|user|>\n{query}<|end|>\n<|assistant|>\n'
)
return s
def generate_inner(self, message, dataset=None):
content, images, image_sizes = '', [], []
for msg in message:
if msg['type'] == 'text':
content += msg['value']
elif msg['type'] == 'image':
image = Image.open(msg['value']).convert('RGB')
images.append(self.image_processor([image], image_aspect_ratio='anyres')['pixel_values'].to('cuda'))
image_sizes.append(image.size)
content += '<image> '
inputs = {'pixel_values': [images]}
prompt = self.apply_prompt_template(content)
language_inputs = self.tokenizer([prompt], return_tensors='pt').to('cuda')
inputs.update(language_inputs)
generation_args = {
'max_new_tokens': 1024,
'temperature': 0.0,
'do_sample': False,
'top_p': None,
'num_beams': 1
}
generation_args.update(self.kwargs)
generate_ids = self.model.generate(
**inputs, image_size=[image_sizes],
pad_token_id=self.tokenizer.pad_token_id,
eos_token_id=self.tokenizer.eos_token_id,
**generation_args
)
# decode the generated ids and truncate at the end-of-turn marker
response = self.tokenizer.decode(generate_ids[0], skip_special_tokens=True).split('<|end|>')[0]
return response
import torch
import sys
import os.path as osp
import warnings
from PIL import Image
from vlmeval.smp import get_cache_path, load, dump, splitlen
from huggingface_hub import snapshot_download
from .base import BaseModel
"""
You can run inference with Yi-VL through the following steps:
1. clone the repo https://github.com/01-ai/Yi to path-to-Yi
2. set up the environment and install the required packages in path-to-Yi/VL/requirements.txt
3. set Yi_ROOT in vlmeval/config.py
Yi_ROOT = path-to-Yi
You are all set now! To run a demo for Yi-VL:
```python
from vlmeval import *
model = supported_VLM['Yi_VL_6B']()
model.generate('apple.jpg', 'What is in this image?')
```
To run evaluation for Yi-VL, use `python run.py --model Yi_VL_6B --data {dataset_list}`
"""
def edit_config(repo_id):
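# Rewrite the relative './vit/...' mm_vision_tower entry in config.json to an absolute path under the local checkpoint directory.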
if not osp.exists(repo_id):
root = get_cache_path(repo_id)
else:
root = repo_id
assert root is not None and osp.exists(root)
cfg = osp.join(root, 'config.json')
data = load(cfg)
mm_vision_tower = data['mm_vision_tower']
if mm_vision_tower.startswith('./vit/'):
data['mm_vision_tower'] = osp.join(root, mm_vision_tower)
assert osp.exists(data['mm_vision_tower'])
dump(data, cfg)
def disable_torch_init():
"""
Disable the redundant torch default initialization to accelerate model creation.
"""
import torch
setattr(torch.nn.Linear, 'reset_parameters', lambda self: None)
setattr(torch.nn.LayerNorm, 'reset_parameters', lambda self: None)
class Yi_VL(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self,
model_path='01-ai/Yi-VL-6B',
root=None,
**kwargs):
if root is None:
raise ValueError(
'Please set `root` to the local directory of the Yi repository, '
'cloned from https://github.com/01-ai/Yi.'
)
self.root = osp.join(root, 'VL')
sys.path.append(self.root)
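# Download the checkpoint from the Hugging Face Hub if it is not cached locally, then patch its config.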
if splitlen(model_path, '/') == 2 and not osp.exists(model_path):
if get_cache_path(model_path) is None:
snapshot_download(repo_id=model_path)
edit_config(model_path)
elif osp.exists(model_path):
edit_config(model_path)
from llava.mm_utils import get_model_name_from_path, load_pretrained_model
from llava.model.constants import key_info
disable_torch_init()
key_info['model_path'] = model_path
get_model_name_from_path(model_path)
self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
model_path,
device_map='cpu')
self.model = self.model.cuda()
self.conv_mode = 'mm_default'
kwargs_default = dict(temperature=0.2,
num_beams=1,
do_sample=False,
max_new_tokens=1024,
top_p=None)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
def generate_inner(self, message, dataset=None):
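# Build a LLaVA-style conversation prompt with the default image token, then generate an answer for the single image.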
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
from llava.conversation import conv_templates
from llava.model.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from llava.mm_utils import KeywordsStoppingCriteria, expand2square, tokenizer_image_token
qs = DEFAULT_IMAGE_TOKEN + '\n' + prompt
conv = conv_templates[self.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = (
tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
.unsqueeze(0)
.cuda()
)
image = Image.open(image_path)
if getattr(self.model.config, 'image_aspect_ratio', None) == 'pad':
if image.mode == 'L':
background_color = int(sum([int(x * 255) for x in self.image_processor.image_mean]) / 3)
else:
background_color = tuple(int(x * 255) for x in self.image_processor.image_mean)
image = expand2square(image, background_color)
image_tensor = self.image_processor.preprocess(image, return_tensors='pt')[
'pixel_values'
][0]
stop_str = conv.sep
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)
self.model = self.model.to(dtype=torch.bfloat16)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
images=image_tensor.unsqueeze(0).to(dtype=torch.bfloat16).cuda(),
stopping_criteria=[stopping_criteria],
use_cache=True,
**self.kwargs)
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(
f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids'
)
outputs = self.tokenizer.batch_decode(
output_ids[:, input_token_len:], skip_special_tokens=True
)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[: -len(stop_str)]
outputs = outputs.strip()
return outputs
from transformers import AutoProcessor
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
MODEL_PATH = 'Qwen/Qwen2-VL-7B-Instruct'
def main():
# Configure multi-GPU inference
llm = LLM(
model=MODEL_PATH,
limit_mm_per_prompt={"image": 10, "video": 10},
tensor_parallel_size=4,  # set this to the number of GPUs you want to use
trust_remote_code=True,
gpu_memory_utilization=0.95,
dtype="float16",
enforce_eager=True
)
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
max_tokens=256,
stop_token_ids=[],
)
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": [
{
"type": "image",
"image": "https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png",
"min_pixels": 224 * 224,
"max_pixels": 1280 * 28 * 28,
},
{"type": "text", "text": "What is the text in the illustrate?"},
],
},
]
processor = AutoProcessor.from_pretrained(MODEL_PATH)
prompt = processor.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
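# process_vision_info loads the image/video referenced in the messages (applying the min_pixels/max_pixels constraints) so it can be passed to vLLM as multi-modal data.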
image_inputs, video_inputs = process_vision_info(messages)
mm_data = {}
if image_inputs is not None:
mm_data["image"] = image_inputs
if video_inputs is not None:
mm_data["video"] = video_inputs
llm_inputs = {
"prompt": prompt,
"multi_modal_data": mm_data,
}
# Run multi-GPU inference
outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
generated_text = outputs[0].outputs[0].text
print(generated_text)
if __name__ == '__main__':
main()
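# Single-process variant of the example above: same message construction and vision
# preprocessing, but default device placement and near-greedy sampling parameters.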
from transformers import AutoProcessor
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct"
llm = LLM(
model=MODEL_PATH,
limit_mm_per_prompt={"image": 10, "video": 10},
)
sampling_params = SamplingParams(
temperature=0.1,
top_p=0.001,
repetition_penalty=1.05,
max_tokens=256,
stop_token_ids=[],
)
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": [
{
"type": "image",
"image": "https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png",
"min_pixels": 224 * 224,
"max_pixels": 1280 * 28 * 28,
},
{"type": "text", "text": "What is the text in the illustrate?"},
],
},
]
# For video input, you can pass the following values instead:
# "type": "video",
# "video": "<video URL>",
processor = AutoProcessor.from_pretrained(MODEL_PATH)
prompt = processor.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
image_inputs, video_inputs = process_vision_info(messages)
mm_data = {}
if image_inputs is not None:
mm_data["image"] = image_inputs
if video_inputs is not None:
mm_data["video"] = video_inputs
llm_inputs = {
"prompt": prompt,
"multi_modal_data": mm_data,
}
outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
generated_text = outputs[0].outputs[0].text
print(generated_text)
# Unique model identifier
modelCode=1199
# Model name
modelName=qwen2_vl_pytorch
# Model description
modelDescription=Qwen2-VL is built on Qwen2. A major architectural improvement is full support for native dynamic resolution, so unlike the previous generation it can handle image inputs of arbitrary resolution.
# Application scenarios
appScenario=Inference,Training,Conversational QA,Research,Education,Government,Finance
# Framework type
frameType=Pytorch