import torch
from PIL import Image
from abc import abstractproperty
import sys
import warnings
import os.path as osp
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import copy
class VILA(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def __init__(self,
model_path='Efficient-Large-Model/Llama-3-VILA1.5-8b',
**kwargs):
try:
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path
from llava.mm_utils import process_images, tokenizer_image_token, KeywordsStoppingCriteria
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN # noqa E501
from llava.conversation import conv_templates, SeparatorStyle
except Exception:
warnings.warn('Please install VILA before using VILA')
warnings.warn('Please install VILA from https://github.com/NVlabs/VILA')
warnings.warn('Please install VLMEvalKit after installing VILA')
warnings.warn('VILA is supported only with transformers==4.36.2')
sys.exit(-1)
warnings.warn('Please install the latest version of VILA from GitHub before you evaluate the VILA model.')
assert osp.exists(model_path) or len(model_path.split('/')) == 2
model_name = get_model_name_from_path(model_path)
try:
self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
model_path=model_path,
model_base=None,
model_name=model_name,
device='cpu',
device_map='cpu'
)
except Exception as e:
warnings.warn(f'Error loading VILA model: {e}')
sys.exit(-1)
self.model = self.model.cuda()
if '3b' in model_path:
self.conv_mode = 'vicuna_v1'
elif '8b' in model_path:
self.conv_mode = 'llama_3'
elif '13b' in model_path:
self.conv_mode = 'vicuna_v1'
elif '40b' in model_path:
self.conv_mode = 'hermes-2'
kwargs_default = dict(do_sample=False, temperature=0, max_new_tokens=512, top_p=None, num_beams=1, use_cache=True) # noqa E501
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Using the following kwargs for generation config: {self.kwargs}')
self.conv_templates = conv_templates
self.process_images = process_images
self.tokenizer_image_token = tokenizer_image_token
self.DEFAULT_IMAGE_TOKEN = DEFAULT_IMAGE_TOKEN
self.SeparatorStyle = SeparatorStyle
self.IMAGE_TOKEN_INDEX = IMAGE_TOKEN_INDEX
self.KeywordsStoppingCriteria = KeywordsStoppingCriteria
def use_custom_prompt(self, dataset):
assert dataset is not None
# TODO see if custom prompt needed
return False
def generate_inner(self, message, dataset=None):
content, images = '', []
for msg in message:
if msg['type'] == 'text':
content += msg['value']
elif msg['type'] == 'image':
image = Image.open(msg['value']).convert('RGB')
images.append(image)
content += (self.DEFAULT_IMAGE_TOKEN + '\n')
image_tensor = self.process_images(
images, self.image_processor,
self.model.config).to(self.model.device, dtype=torch.float16)
# Support interleave text and image
conv = self.conv_templates[self.conv_mode].copy()
conv.append_message(conv.roles[0], content)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = self.tokenizer_image_token(prompt, self.tokenizer, self.IMAGE_TOKEN_INDEX,
return_tensors='pt').unsqueeze(0).cuda()
stop_str = conv.sep if conv.sep_style != self.SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = self.KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids, images=image_tensor, stopping_criteria=[stopping_criteria], **self.kwargs)
output = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
return output
import warnings
from .base import BaseModel
from ..smp import *
class VisualGLM(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='THUDM/visualglm-6b', **kwargs):
try:
import sat
except Exception:
warnings.warn('Please install SwissArmyTransformer to use VisualGLM')
assert model_path is not None
self.model_path = model_path
from transformers import AutoModel
from transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()
self.model = model
self.kwargs = kwargs
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
output, _ = self.model.chat(
image_path=image_path,
tokenizer=self.tokenizer,
query=prompt,
history=[],
**self.kwargs
)
return output
import torch
import sys
import os.path as osp
import warnings
from .base import BaseModel
from transformers import StoppingCriteriaList
from omegaconf import OmegaConf
from PIL import Image
from huggingface_hub import snapshot_download
from vlmeval.smp import *
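# Config for XVERSE-V-13B: a CLIP ViT-L/14 vision encoder on top of the XVERSE-13B-Chat LLM.
# 'ckpt' initially names the HF repo and is later replaced with the local path of adapter_and_lora.bin.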
model_cfgs = {
'XVERSE-V-13B': {
'arch': 'vxverse',
'model_type': 'pretrain_xverse13b-chat',
'max_txt_len': 512,
'end_sym': '<|endoftext|>',
'low_resource': False,
'prompt_template': 'Human: {}\nAssistant: ',
'ckpt': 'xverse/XVERSE-V-13B',
'lora_r': 128,
'lora_alpha': 256,
'lora_dropout': 0.05,
'lora_target_modules': 'all_linear',
'has_qformer': False,
'n_proj_layers': 2,
'vit_model': 'openai/clip-vit-large-patch14',
'vit_path': 'openai/clip-vit-large-patch14',
'image_size': 224,
'drop_path_rate': 0,
'vit_precision': 'fp16',
'llama_model': 'xverse/XVERSE-13B-Chat',
}
}
class VXVERSE(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, model_name='XVERSE-V-13B', root=None, **kwargs):
if root is None:
warnings.warn('Please set root to the directory of vxverse.')
if model_name == 'XVERSE-V-13B':
cfg = model_cfgs['XVERSE-V-13B']
else:
raise NotImplementedError
ckpt_dir = cfg['ckpt']
if not osp.isdir(ckpt_dir):
cache_path = get_cache_path(ckpt_dir)
if cache_path is not None:
ckpt_dir = cache_path
else:
ckpt_dir = snapshot_download(repo_id=ckpt_dir)
assert osp.exists(ckpt_dir) and osp.isdir(ckpt_dir)
ckpt = osp.join(ckpt_dir, 'adapter_and_lora.bin')
cfg['ckpt'] = ckpt
model_cfg = OmegaConf.create(cfg)
self.model_name = model_name
self.root = root
sys.path.append(self.root)
from vxverse.common.registry import registry
from vxverse.conversation.conversation import CONV_VISION_XVERSE
device = torch.cuda.current_device()
self.device = device
model_cls = registry.get_model_class(model_cfg.arch)
model = model_cls.from_config(model_cfg)
model = model.to(device)
model.eval()
vis_processor_cfg = OmegaConf.create(dict(name='hd_image_train', image_size=224))
vis_processor = registry.get_processor_class(
vis_processor_cfg.name
).from_config(vis_processor_cfg)
self.model = model
self.vis_processor = vis_processor
self.vis_processor_cfg = vis_processor_cfg
self.CONV_VISION = CONV_VISION_XVERSE
self.CONV_VISION.system = ''
stop_words_ids = [[835], [2277, 29937]]
self.stop_words_ids = stop_words_ids
default_kwargs = dict(max_new_tokens=512)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
if self.vis_processor_cfg.name == 'hd_image_train':
patches_per_image = [[image.shape[0]]]
image = [image]
else:
patches_per_image = None
image = image.unsqueeze(0)
chat_state = self.CONV_VISION.copy()
texts = self.prepare_texts([prompt], chat_state)
texts = [text.lstrip() for text in texts]
answers = self.model.generate(
image,
texts,
patches_per_images=patches_per_image,
do_sample=False,
stop_words_ids=self.stop_words_ids,
**self.kwargs
)
return answers[0]
def prepare_texts(self, texts, conv_temp):
convs = [conv_temp.copy() for _ in range(len(texts))]
[
conv.append_message(conv.roles[0], '<ImageHere>\n{}'.format(text))
for conv, text in zip(convs, texts)
]
[conv.append_message(conv.roles[1], None) for conv in convs]
texts = [conv.get_prompt() for conv in convs]
return texts
import torch
from PIL import Image
import sys
from ..smp import *
from .base import BaseModel
from ..dataset import DATASET_TYPE
from transformers import AutoModel, GenerationConfig
class WeMM(BaseModel):
def __init__(self, model_path='feipengma/WeMM', **kwargs):
self.wemm = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True)
self.wemm.cuda()
self.wemm.eval()
torch.cuda.empty_cache()
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += (
'\n请直接回答选项字母。' if cn_string(prompt) else
"\nAnswer with the option's letter from the given choices directly."
)
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=p) for p in tgt_path])
return message
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if dataset == 'HallusionBench':
prompt = prompt + ' Please answer yes or no. Answer the question using a single word or phrase.'
gen_config = None
if dataset == 'MMVet':
gen_config = GenerationConfig(
max_new_tokens=512,
do_sample=True,
temperature=0.7,
num_beams=3,
eos_token_id=self.wemm.tokenizer.eos_token_id,
pad_token_id=self.wemm.tokenizer.pad_token_id
if self.wemm.tokenizer.pad_token_id is not None else self.wemm.tokenizer.eos_token_id,
)
pred = self.wemm.mm_generate(image_path, prompt, gen_config)
return pred
from .sharecaptioner import ShareCaptioner
from .xcomposer import XComposer
from .xcomposer2 import XComposer2
from .xcomposer2_4KHD import XComposer2_4KHD
from .xcomposer2d5 import XComposer2d5
__all__ = ['ShareCaptioner', 'XComposer', 'XComposer2', 'XComposer2_4KHD', 'XComposer2d5']
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
class ShareCaptioner(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='Lin-Chen/ShareCaptioner', **kwargs):
assert model_path is not None
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(
model_path, device_map='cuda', trust_remote_code=True).eval()
self.model.tokenizer = tokenizer
self.model.cuda()
self.model.half()
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
option_candidate = string.ascii_uppercase
options = {
cand: line[cand]
for cand in option_candidate
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if not cn_string(prompt):
prompt = prompt + '\n' + "Answer with the option's letter from the given choices directly."
else:
prompt = prompt + '\n' + '请直接回答选项字母。'
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
seg1 = '<|User|>:'
seg2 = f'{prompt}{self.model.eoh}\n<|Bot|>:'
self.seg_emb1 = self.model.encode_text(seg1, add_special_tokens=True)
self.seg_emb2 = self.model.encode_text(seg2, add_special_tokens=False)
image = Image.open(image_path).convert('RGB')
image = self.model.vis_processor(image).unsqueeze(0)
image = image.to(self.model.device)
tmp_bs = image.shape[0]
tmp_seg_emb1 = self.seg_emb1.repeat(tmp_bs, 1, 1)
tmp_seg_emb2 = self.seg_emb2.repeat(tmp_bs, 1, 1)
with torch.cuda.amp.autocast():
with torch.no_grad():
image = self.model.encode_img(image)
input_emb = torch.cat(
[tmp_seg_emb1, image, tmp_seg_emb2], dim=1)
out_embeds = self.model.internlm_model.generate(
inputs_embeds=input_emb,
max_length=500,
num_beams=3,
min_length=1,
do_sample=True,
repetition_penalty=1.5,
length_penalty=1.0,
temperature=1.,
eos_token_id=self.model.tokenizer.eos_token_id,
num_return_sequences=1)
for j, out in enumerate(out_embeds):
out[out == -1] = 2
response = self.model.decode_text([out])
return response
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import StoppingCriteria, StoppingCriteriaList
from PIL import Image
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
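# Stops generation once the tail of the generated sequence matches any of the provided stop token sequences.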
class StoppingCriteriaSub(StoppingCriteria):
def __init__(self, stops=None, encounters=1):
super().__init__()
self.stops = stops if stops is not None else []
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
for stop in self.stops:
if torch.all((stop == input_ids[0][-len(stop):])).item():
return True
return False
class XComposer(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='internlm/internlm-xcomposer-vl-7b', **kwargs):
assert model_path is not None
self.model_path = model_path
model = AutoModel.from_pretrained(self.model_path, device_map='cpu', trust_remote_code=True).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
model.tokenizer = tokenizer
self.model = model
self.device = self.model.internlm_model.model.embed_tokens.weight.device
self.eoh = '<TOKENS_UNUSED_0>'
self.eoa = '<TOKENS_UNUSED_1>'
stop_words_ids = [
torch.tensor([103027]).to(self.device), # end of human
torch.tensor([103028]).to(self.device), # end of bot
]
default_kwargs = {
'max_new_tokens': 512, 'num_beams': 5, 'do_sample': False,
'min_length': 1, 'repetition_penalty': 1.5, 'length_penalty': 1.0
}
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
def generate_inner(self, message, dataset=None):
if len(message) == 2:
if message[0]['type'] == 'text' and message[1]['type'] == 'image':
message = [message[1], message[0]]
kwargs = cp.deepcopy(self.kwargs)
if dataset is not None:
if DATASET_TYPE(dataset) == 'MCQ':
kwargs['max_new_tokens'] = 5
kwargs['num_beams'] = 5
with torch.cuda.amp.autocast():
with torch.no_grad():
prompt_embs = self.message_to_prompt_embs(message, dataset)
outputs = self.model.internlm_model.generate(
inputs_embeds=prompt_embs,
stopping_criteria=self.stopping_criteria,
**kwargs
)
output_token = outputs[0]
if output_token[0] == 0:
output_token = output_token[1:]
if output_token[0] == 1:
output_token = output_token[1:]
output_text = self.model.tokenizer.decode(output_token, add_special_tokens=False)
output_text = output_text.split(self.model.eoa)[0]
output_text = output_text.split('<|Bot|>')[-1].strip()
return output_text
def message_to_prompt_embs(self, message, dataset=None):
assert isinstance(message, list)
img_embeds = []
prompt_full = '<|User|>: '
for msg in message:
if msg['type'] == 'text':
prompt_full += msg['value']
elif msg['type'] == 'image':
image = Image.open(msg['value']).convert('RGB')
image = self.model.vis_processor(image).unsqueeze(0).to(self.device)
img_embeds.append(self.model.encode_img(image))
prompt_full += '<ImageHere>'
prompt_full += self.model.eoh + ' <|Bot|>: '
if dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
prompt_full += 'Answer: The answer is '
elif dataset is not None and DATASET_TYPE(dataset) in ['VQA', 'QA', 'Y/N']:
prompt_full += 'Answer: '
prompt_segs = prompt_full.split('<ImageHere>')
assert len(prompt_segs) == len(img_embeds) + 1
prompt_seg_tokens = [
self.model.tokenizer(seg, return_tensors='pt', add_special_tokens=(i == 0)).to(self.device).input_ids.long()
for i, seg in enumerate(prompt_segs)
]
prompt_seg_embs = [self.model.internlm_model.model.embed_tokens(seg) for seg in prompt_seg_tokens]
all_embeddings = []
for i in range(len(img_embeds)):
all_embeddings.extend([prompt_seg_embs[i], img_embeds[i]])
all_embeddings.append(prompt_seg_embs[-1])
prompt_embs = torch.cat(all_embeddings, dim=1)
return prompt_embs
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
context = 'N/A' if hint is None else hint
mid_prompt = 'Context: ' + context + '\nQuestion: ' + question
if len(options_prompt):
mid_prompt += '\nOptions: ' + options_prompt
if len(options):
txt_prompt = 'Please answer this question by choosing the correct choice.'
else:
txt_prompt = 'Please answer this question directly. '
prompt = txt_prompt + mid_prompt
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
import torch
import torchvision
from transformers import AutoModel, AutoTokenizer
from PIL import Image
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
import re
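# Regex used to pull the predicted option letter (A-Z) out of the raw multiple-choice answer.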
pattern = re.compile(r'[A-Z]')
def __padding__(image):
width, height = image.size
tar = max(width, height)
top_padding = int((tar - height) / 2)
bottom_padding = tar - height - top_padding
left_padding = int((tar - width) / 2)
right_padding = tar - width - left_padding
image = torchvision.transforms.functional.pad(image, [left_padding, top_padding, right_padding, bottom_padding])
return image
meta_instruction = """
You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).
- InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language model that is developed by
Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language
chosen by the user such as English and 中文.
- InternLM-XComposer (浦语·灵笔) is capable of comprehending and articulating responses effectively
based on the provided image.
"""
def model_gen(model, text, images, need_bos=True, padding=False, beams=3, max_token=500):
pt1 = 0
embeds = []
im_mask = []
images = [images]
images_loc = [0]
for i, pts in enumerate(images_loc + [len(text)]):
subtext = text[pt1:pts]
if need_bos or len(subtext) > 0:
text_embeds = model.encode_text(subtext, add_special_tokens=need_bos)
embeds.append(text_embeds)
im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
need_bos = False
if i < len(images):
try:
image = Image.open(images[i]).convert('RGB')
except Exception:
image = images[i].convert('RGB')
if padding:
image = __padding__(image)
image = model.vis_processor(image).unsqueeze(0).cuda()
image_embeds = model.encode_img(image)
embeds.append(image_embeds)
im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
pt1 = pts
embeds = torch.cat(embeds, dim=1)
im_mask = torch.cat(im_mask, dim=1)
im_mask = im_mask.bool()
outputs = model.generate(
inputs_embeds=embeds,
im_mask=im_mask,
temperature=1.0,
max_new_tokens=max_token,
num_beams=beams,
do_sample=False,
repetition_penalty=1.0)
output_token = outputs[0]
if output_token[0] == 0 or output_token[0] == 1:
output_token = output_token[1:]
output_text = model.tokenizer.decode(output_token, add_special_tokens=False)
output_text = output_text.split('[UNUSED_TOKEN_145]')[0].strip()
return output_text
class XComposer2(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='internlm/internlm-xcomposer2-vl-7b', **kwargs):
assert model_path is not None
self.model_path = model_path
model = AutoModel.from_pretrained(self.model_path, device_map='cpu', trust_remote_code=True).cuda().eval()
model.half()
tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
model.tokenizer = tokenizer
self.model = model
self.device = self.model.model.tok_embeddings.weight.device
def generate_mme(self, image_path, text):
text = text.split('Please answer')[0].strip()
text = f'{text} Answer this question briefly'
text = f'[UNUSED_TOKEN_146]user\n{text}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
return model_gen(self.model, text, image_path, need_bos=True, padding=True, beams=5)
def generate_multichoice(self, image_path, text, dataset):
out = model_gen(self.model, text, image_path, need_bos=True, padding=False, beams=5, max_token=5)
if 'mmmu' in dataset.lower():
return out
res = pattern.findall(out)
if len(res) == 0:
print('Error:', out)
res = 'Z'
return res[0]
def generate_vqa(self, image_path, text):
out = model_gen(self.model, text, image_path, need_bos=True)
return out
def generate_vanilla(self, image_path, text):
text = (
'[UNUSED_TOKEN_146]system\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]user\n{}'
'Answer this question in detail.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
).format(meta_instruction, text)
out = model_gen(self.model, text, image_path, need_bos=True, max_token=500)
return out
def generate_brief(self, image_path, text):
text = (
'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{}'
'[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
).format(text)
out = model_gen(self.model, text, image_path, need_bos=True, max_token=10)
return out
def generate_directly(self, image_path, text):
text = '[UNUSED_TOKEN_146]user\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'.format(text)
out = model_gen(self.model, text, image_path, need_bos=True, max_token=500)
return out
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
with torch.cuda.amp.autocast():
if dataset is None:
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if dataset == 'MME':
return self.generate_mme(image_path, prompt)
elif listinstr(['hallu'], dataset.lower()):
return self.generate_brief(image_path, prompt)
elif listinstr(['llava'], dataset.lower()):
return self.generate_vanilla(image_path, prompt)
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
return self.generate_multichoice(image_path, prompt, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
return self.generate_vqa(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'VQA':
return True
return False
def build_mcqa(self, line):
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
img_prompt = '[UNUSED_TOKEN_146]user\n'
if len(options):
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item} '
options_prompt = options_prompt.strip()
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
context = 'N/A' if hint is None else hint
mid_prompt = 'Question: ' + question + '\nContext: ' + context + '\nOptions: ' + options_prompt
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\nThe answer is'
prompt = img_prompt + mid_prompt + ans_prompt
else:
mid_prompt = f'Answer the question using a single word or phrase.{question}'
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
prompt = img_prompt + mid_prompt + ans_prompt
return prompt
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_mcqa(line)
elif DATASET_TYPE(dataset) == 'VQA':
if 'mathvista' in dataset.lower():
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
else:
q = line['question']
prompt = (
f'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{q}'
'[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
)
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
import numpy as np
import torchvision.transforms as transforms
import re
pattern = re.compile(r'[A-Z]')
def padding_336(b):
width, height = b.size
tar = int(np.ceil(height / 336) * 336)
top_padding = int((tar - height) / 2)
bottom_padding = tar - height - top_padding
left_padding = 0
right_padding = 0
b = transforms.functional.pad(b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255, 255, 255])
return b
def HD_transform(img, im_num=16):
width, height = img.size
trans = False
if width < height:
img = img.transpose(Image.TRANSPOSE)
trans = True
width, height = img.size
ratio = (width / height)
scale = 1
while scale * np.ceil(scale / ratio) <= im_num:
scale += 1
scale -= 1
new_w = int(scale * 336)
new_h = int(new_w / ratio)
img = transforms.functional.resize(img, [new_h, new_w],)
img = padding_336(img)
width, height = img.size
assert width * height <= im_num * 336 * 336
if trans:
img = img.transpose(Image.TRANSPOSE)
return img
meta_instruction = """You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).
- InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language model that is developed\
by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language chosen by\
the user such as English and 中文.
- InternLM-XComposer (浦语·灵笔) is capable of comprehending and articulating responses\
effectively based on the provided image."""
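# Same interleaved text/image generation helper as in XComposer2 above, except each image is first
# tiled with HD_transform using the model's configurable hd_num budget.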
def model_gen(model, text, images, need_bos=True, padding=False, beams=3, max_token=500):
pt1 = 0
embeds = []
im_mask = []
images = [images]
images_loc = [0]
for i, pts in enumerate(images_loc + [len(text)]):
subtext = text[pt1:pts]
if need_bos or len(subtext) > 0:
text_embeds = model.encode_text(subtext, add_special_tokens=need_bos)
embeds.append(text_embeds)
im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
need_bos = False
if i < len(images):
try:
image = Image.open(images[i]).convert('RGB')
except Exception:
image = images[i].convert('RGB')
image = HD_transform(image, im_num=model.hd_num)
image = model.vis_processor(image).unsqueeze(0).cuda()
image_embeds = model.encode_img(image)
embeds.append(image_embeds)
im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
pt1 = pts
embeds = torch.cat(embeds, dim=1)
im_mask = torch.cat(im_mask, dim=1)
im_mask = im_mask.bool()
outputs = model.generate(inputs_embeds=embeds, im_mask=im_mask,
temperature=1.0, max_new_tokens=max_token, num_beams=beams,
do_sample=False, repetition_penalty=1.0)
output_token = outputs[0]
if output_token[0] == 0 or output_token[0] == 1:
output_token = output_token[1:]
output_text = model.tokenizer.decode(output_token, add_special_tokens=False)
output_text = output_text.split('[UNUSED_TOKEN_145]')[0].strip()
return output_text
class XComposer2_4KHD(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='internlm/internlm-xcomposer2-4khd-7b', **kwargs):
assert model_path is not None
self.model_path = model_path
model = AutoModel.from_pretrained(self.model_path, device_map='cpu', trust_remote_code=True).cuda().eval()
model.half()
tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
model.tokenizer = tokenizer
self.model = model
self.device = self.model.model.tok_embeddings.weight.device
self.model.hd_num = 25
def generate_mme(self, image_path, text):
text = text.split('Please answer')[0].strip()
text = f'{text} Answer this question briefly'
text = f'[UNUSED_TOKEN_146]user\n{text}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
return model_gen(self.model, text, image_path, need_bos=True, padding=True, beams=5)
def generate_multichoice(self, image_path, text, dataset):
out = model_gen(self.model, text, image_path, need_bos=True, padding=False, beams=5, max_token=5)
if 'mmmu' in dataset.lower():
return out
res = pattern.findall(out)
if len(res) == 0:
print('Error:', out)
res = 'Z'
return res[0]
def generate_vqa(self, image_path, text):
out = model_gen(self.model, text, image_path, need_bos=True, max_token=100)
return out
def generate_vanilla(self, image_path, text):
out = model_gen(self.model, text, image_path, need_bos=True, max_token=500)
return out
def generate_brief(self, image_path, text):
text = (
'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{}'
'[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
).format(text)
out = model_gen(self.model, text, image_path, need_bos=True, max_token=10)
return out
def generate(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if dataset is None:
self.model.hd_num = 25
elif listinstr(['docvqa_test', 'infovqa_test'], dataset.lower()):
self.model.hd_num = 65
elif listinstr(['docvqa_val', 'infovqa_val', 'ocrbench'], dataset.lower()):
self.model.hd_num = 55
elif listinstr(['mmlongbench_doc'], dataset.lower()):
self.model.hd_num = 45
elif listinstr(['mmmu', 'mmbench', 'mmvet'], dataset.lower()):
self.model.hd_num = 16
else:
self.model.hd_num = 25
with torch.cuda.amp.autocast():
if dataset is None:
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if dataset == 'MME':
return self.generate_mme(image_path, prompt)
elif listinstr(['hallu'], dataset.lower()):
return self.generate_brief(image_path, prompt)
elif listinstr(['llava', 'mmvet'], dataset.lower()):
return self.generate_vanilla(image_path, prompt)
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
return self.generate_multichoice(image_path, prompt, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
return self.generate_vqa(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'VQA':
return True
return False
def build_mcqa(self, line):
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
img_prompt = '[UNUSED_TOKEN_146]user\n'
if len(options):
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item} '
options_prompt = options_prompt.strip()
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
context = 'N/A' if hint is None else hint
mid_prompt = 'Question: ' + question + '\nContext: ' + context + '\nOptions: ' + options_prompt
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\nThe answer is'
prompt = img_prompt + mid_prompt + ans_prompt
else:
mid_prompt = f'Answer the question using a single word or phrase.{question}'
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
prompt = img_prompt + mid_prompt + ans_prompt
return prompt
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_mcqa(line)
elif DATASET_TYPE(dataset) == 'VQA':
if 'mathvista' in dataset.lower():
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
elif listinstr(['llava', 'mmvet'], dataset.lower()):
q = line['question']
prompt = (
'[UNUSED_TOKEN_146]system\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]user\n{}'
'Answer this question in detail.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
).format(meta_instruction, q)
elif listinstr(['mmlongbench_doc'], dataset.lower()):
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
else:
q = line['question']
prompt = (
'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.'
f'{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
)
ret = [dict(type='text', value=prompt)]
ret.extend([dict(type='image', value=s) for s in tgt_path])
return ret
import re
import numpy as np
import torch
import torchvision.transforms as transforms
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from ...dataset import DATASET_TYPE
from ...smp import *
from ..base import BaseModel
pattern = re.compile(r'[A-Z]')
def padding_560(b):
width, height = b.size
tar = int(np.ceil(height / 560) * 560)
top_padding = int((tar - height) / 2)
bottom_padding = tar - height - top_padding
left_padding = 0
right_padding = 0
b = transforms.functional.pad(
b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255, 255, 255])
return b
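# 560-pixel variant of HD_transform used by XComposer2d5; id_scale additionally caps the scale factor
# relative to the original image width.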
def HD_transform(img, im_num=36, id_scale=1.5):
width, height = img.size
trans = False
if width < height:
img = img.transpose(Image.TRANSPOSE)
trans = True
width, height = img.size
ratio = (width / height)
scale = 1
while scale * np.ceil(scale / ratio) <= im_num:
scale += 1
scale -= 1
scale = min(np.ceil(width * id_scale / 560), scale)
new_w = int(scale * 560)
new_h = int(new_w / ratio)
img = transforms.functional.resize(img, [new_h, new_w],)
img = padding_560(img)
width, height = img.size
assert width * height <= im_num * 560 * 560
if trans:
img = img.transpose(Image.TRANSPOSE)
return img
meta_instruction = """You are an AI assistant whose name is InternLM (书生·浦语).\n" + "- InternLM (书生·浦语) \
is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室).
It is designed to be helpful, honest, and harmless.\n"+"- InternLM (书生·浦语) \
can understand and communicate fluently in the language chosen by the user such as English and 中文."""
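# XComposer2d5 generation helper: the prompt is split on '<IM_POS>' placeholders so multiple images can
# be inserted at explicit positions; when several images are present, each one gets a proportional share
# of the hd_num tile budget.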
def model_gen(model, text, images, need_bos=True, padding=False, beams=3, max_token=500):
embeds = []
im_mask = []
im_idx = 0
sub_q = text.split('<IM_POS>')
add_im = len(sub_q) - 1
for subtext in sub_q:
if need_bos or len(subtext) > 0:
text_embeds = model.encode_text(
subtext, add_special_tokens=need_bos)
embeds.append(text_embeds)
im_mask.append(torch.zeros(text_embeds.shape[:2]).to(model.device))
need_bos = False
if im_idx < len(images) and add_im:
try:
image = Image.open(images[im_idx]).convert('RGB')
except Exception:
image = images[im_idx].convert('RGB')
if len(images) > 1:
image = HD_transform(image, im_num=model.hd_num // len(images), id_scale=model.id_scale)
else:
image = HD_transform(
image, im_num=model.hd_num, id_scale=model.id_scale)
image = model.vis_processor(image).unsqueeze(0).to(model.device)
image_embeds = model.encode_img(image)
im_idx += 1
add_im -= 1
embeds.append(image_embeds)
im_mask.append(torch.ones(
image_embeds.shape[:2], dtype=torch.long).to(model.device))
embeds = torch.cat(embeds, dim=1)
im_mask = torch.cat(im_mask, dim=1)
im_mask = im_mask.bool()
outputs = model.generate(inputs_embeds=embeds, im_mask=im_mask,
temperature=1.0, max_new_tokens=max_token, num_beams=beams,
do_sample=False, repetition_penalty=1.0)
output_token = outputs[0]
if output_token[0] == 0 or output_token[0] == 1:
output_token = output_token[1:]
output_text = model.tokenizer.decode(
output_token, add_special_tokens=False)
output_text = output_text.split('[UNUSED_TOKEN_145]')[0].strip()
return output_text
class XComposer2d5(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='internlm/internlm-xcomposer2d5-7b', id_scale=1.5, beam=3, **kwargs):
assert model_path is not None
self.model_path = model_path
self.id_scale = id_scale
self.beam = beam
model = AutoModel.from_pretrained(
self.model_path, device_map='cpu', trust_remote_code=True, local_files_only=True).cuda().eval()
model.half()
tokenizer = AutoTokenizer.from_pretrained(
self.model_path, trust_remote_code=True)
model.tokenizer = tokenizer
self.model = model
self.device = self.model.model.tok_embeddings.weight.device
self.model.hd_num = 36
self.model.id_scale = self.id_scale
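# Overrides BaseModel.message_to_promptimg to keep every image path; the text is rewritten around
# '<IM_POS>' placeholders (one per image for the multi-image document datasets, a single leading one
# otherwise) that model_gen later fills with image embeddings.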
def message_to_promptimg(self, message, dataset=None):
num_images = len([x for x in message if x['type'] == 'image'])
if num_images == 0:
prompt = '\n'.join([x['value']
for x in message if x['type'] == 'text'])
image = None
else:
image = [x['value'] for x in message if x['type'] == 'image']
if len(image) == 1:
prompt = ''.join([x['value']
for x in message if x['type'] == 'text'])
im_prompt = '<IM_POS>'
prompt = prompt.replace('<image 1>', '')
prompt = im_prompt + prompt
else:
prompt = ''
im_prompt = [
f'Image{im_idx+1}: <IM_POS>;' for im_idx in range(len(image))]
add_im = len(im_prompt)
im_idx = 0
for x in message:
if x['type'] == 'text':
prompt += x['value']
if add_im > im_idx:
prompt += f'Image{im_idx + 1}'
im_idx += 1
im_prompt = ' '.join(im_prompt)
for i in range(len(image)):
prompt = prompt.replace(f'<image {i+1}>', f'Image{i+1} ')
if dataset is not None and listinstr(['mmlongbench', 'dude', 'slidevqa'], dataset.lower()):  # fix bug for multi-image prompt
prompt = '[UNUSED_TOKEN_146]user\n' + im_prompt + re.sub(
re.escape('[UNUSED_TOKEN_146]user\n'), '', prompt
)
prompt = re.sub('Image1$', '', prompt)
return prompt, image
def generate_mme(self, image_path, text):
text = text.split('Please answer')[0].strip()
text = f'{text} Answer this question briefly'
text = f'[UNUSED_TOKEN_146]user\n{text}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
return model_gen(self.model, text, image_path, need_bos=True, padding=True, beams=self.beam)
def generate_multichoice(self, image_path, text, dataset):
out = model_gen(self.model, text, image_path,
need_bos=True, padding=False, beams=self.beam, max_token=5)
if 'mmmu' in dataset.lower():
return out
res = pattern.findall(out)
if len(res) == 0:
print('Error:', out)
res = 'Z'
return res[0]
def generate_vqa(self, image_path, text):
out = model_gen(self.model, text, image_path, beams=self.beam,
need_bos=True, max_token=100)
return out
def generate_vanilla(self, image_path, text):
out = model_gen(self.model, text, image_path, beams=self.beam,
need_bos=True, max_token=500)
return out
def generate_brief(self, image_path, text):
text = (
'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{}'
'[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
).format(text)
out = model_gen(self.model, text, image_path, beams=self.beam,
need_bos=True, max_token=10)
return out
def set_max_num(self, dataset):
if dataset is not None and listinstr(['MME-RealWorld', 'MME-RealWorld-CN'], dataset):
self.model.hd_num = 25
def generate_inner(self, message, dataset=None):
self.set_max_num(dataset)
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
with torch.cuda.amp.autocast():
if dataset is None:
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if dataset == 'MME':
return self.generate_mme(image_path, prompt)
elif listinstr(['hallu', 'pope'], dataset.lower()):
return self.generate_brief(image_path, prompt)
elif listinstr(['llava', 'mmvet'], dataset.lower()):
return self.generate_vanilla(image_path, prompt)
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
return self.generate_multichoice(image_path, prompt, dataset)
elif listinstr(['MME-RealWorld', 'MME-RealWorld-CN'], dataset):
return self.generate_multichoice(image_path, prompt, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
return self.generate_vqa(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'VQA':
return True
return False
def build_mcqa(self, line):
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
img_prompt = '[UNUSED_TOKEN_146]user\n'
if len(options):
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item} '
options_prompt = options_prompt.strip()
hint = line['hint'] if (
'hint' in line and not pd.isna(line['hint'])) else None
context = 'N/A' if hint is None else hint
mid_prompt = 'Question: ' + question + '\nContext: ' + \
context + '\nOptions: ' + options_prompt
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\nThe answer is'
prompt = img_prompt + mid_prompt + ans_prompt
else:
mid_prompt = f'Answer the question using a single word or phrase.{question}'
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
prompt = img_prompt + mid_prompt + ans_prompt
return prompt
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_mcqa(line)
elif DATASET_TYPE(dataset) == 'VQA':
if 'mathvista' in dataset.lower():
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
elif listinstr(['llava', 'mmvet'], dataset.lower()):
q = line['question']
prompt = (
'[UNUSED_TOKEN_146]system\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]user\n{}'
'Answer this question in detail.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
).format(meta_instruction, q)
elif listinstr(['mmlongbench_doc', 'dude', 'slidevqa'], dataset.lower()):
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
else:
q = line['question']
prompt = (
'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.'
f'{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
)
ret = [dict(type='text', value=prompt)]
ret.extend([dict(type='image', value=s) for s in tgt_path])
return ret
from PIL import Image
import torch
from .base import BaseModel
from ..smp import *
class XGenMM(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5', **kwargs):
try:
from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor
except Exception:
warnings.warn('Please install the latest version of transformers.')
sys.exit(-1)
model = AutoModelForVision2Seq.from_pretrained(
model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto'
).eval()
tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True, use_fast=False, legacy=False
)
tokenizer = model.update_special_tokens(tokenizer)
tokenizer.eos_token = '<|end|>'
tokenizer.padding_side = 'left'
image_processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code=True)
self.model = model
self.image_processor = image_processor
self.tokenizer = tokenizer
self.kwargs = kwargs
def apply_prompt_template(self, query):
s = (
'<|system|>\nA chat between a curious user and an artificial intelligence assistant. '
"The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n"
f'<|user|>\n{query}<|end|>\n<|assistant|>\n'
)
return s
def generate_inner(self, message, dataset=None):
content, images, image_sizes = '', [], []
for msg in message:
if msg['type'] == 'text':
content += msg['value']
elif msg['type'] == 'image':
image = Image.open(msg['value']).convert('RGB')
images.append(self.image_processor([image], image_aspect_ratio='anyres')['pixel_values'].to('cuda'))
image_sizes.append(image.size)
content += '<image> '
inputs = {'pixel_values': [images]}
prompt = self.apply_prompt_template(content)
language_inputs = self.tokenizer([prompt], return_tensors='pt').to('cuda')
inputs.update(language_inputs)
generation_args = {
'max_new_tokens': 1024,
'temperature': 0.0,
'do_sample': False,
'top_p': None,
'num_beams': 1
}
generation_args.update(self.kwargs)
generate_ids = self.model.generate(
**inputs, image_size=[image_sizes],
pad_token_id=self.tokenizer.pad_token_id,
eos_token_id=self.tokenizer.eos_token_id,
**generation_args
)
# remove input tokens
response = self.tokenizer.decode(generate_ids[0], skip_special_tokens=True).split('<|end|>')[0]
return response
import torch
import sys
import os.path as osp
import warnings
from PIL import Image
from vlmeval.smp import get_cache_path, load, dump, splitlen
from huggingface_hub import snapshot_download
from .base import BaseModel
"""
You can perform inference of Yi-VL through the following steps:
1. clone the repo https://github.com/01-ai/Yi to path-to-Yi
2. set up the environment and install the required packages in path-to-Yi/VL/requirements.txt
3. set Yi_ROOT in vlmeval/config.py
Yi_ROOT = path-to-Yi
You are all set now! To run a demo for Yi-VL:
```python
from vlmeval import *
model = supported_VLM['Yi_VL_6B']()
model.generate('apple.jpg', 'What is in this image?')
```
To run evaluation for Yi-VL, use `python run.py --model Yi_VL_6B --data {dataset_list}`
"""
def edit_config(repo_id):
if not osp.exists(repo_id):
root = get_cache_path(repo_id)
else:
root = repo_id
assert root is not None and osp.exists(root)
cfg = osp.join(root, 'config.json')
data = load(cfg)
mm_vision_tower = data['mm_vision_tower']
if mm_vision_tower.startswith('./vit/'):
data['mm_vision_tower'] = osp.join(root, mm_vision_tower)
assert osp.exists(data['mm_vision_tower'])
dump(data, cfg)
def disable_torch_init():
"""
Disable the redundant torch default initialization to accelerate model creation.
"""
import torch
setattr(torch.nn.Linear, 'reset_parameters', lambda self: None)
setattr(torch.nn.LayerNorm, 'reset_parameters', lambda self: None)
class Yi_VL(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self,
model_path='01-ai/Yi-VL-6B',
root=None,
**kwargs):
if root is None:
warnings.warn(
'Please set root to the directory of Yi, '
'which is cloned from here: https://github.com/01-ai/Yi.'
)
self.root = osp.join(root, 'VL')
sys.path.append(self.root)
if splitlen(model_path, '/') == 2 and not osp.exists(model_path):
if get_cache_path(model_path) is None:
snapshot_download(repo_id=model_path)
edit_config(model_path)
elif osp.exists(model_path):
edit_config(model_path)
from llava.mm_utils import get_model_name_from_path, load_pretrained_model
from llava.model.constants import key_info
disable_torch_init()
key_info['model_path'] = model_path
get_model_name_from_path(model_path)
self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
model_path,
device_map='cpu')
self.model = self.model.cuda()
self.conv_mode = 'mm_default'
kwargs_default = dict(temperature=0.2,
num_beams=1,
do_sample=False,
max_new_tokens=1024,
top_p=None)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
from llava.conversation import conv_templates
from llava.model.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from llava.mm_utils import KeywordsStoppingCriteria, expand2square, tokenizer_image_token
qs = DEFAULT_IMAGE_TOKEN + '\n' + prompt
conv = conv_templates[self.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = (
tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
.unsqueeze(0)
.cuda()
)
image = Image.open(image_path)
if getattr(self.model.config, 'image_aspect_ratio', None) == 'pad':
if image.mode == 'L':
background_color = int(sum([int(x * 255) for x in self.image_processor.image_mean]) / 3)
else:
background_color = tuple(int(x * 255) for x in self.image_processor.image_mean)
image = expand2square(image, background_color)
image_tensor = self.image_processor.preprocess(image, return_tensors='pt')[
'pixel_values'
][0]
stop_str = conv.sep
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)
self.model = self.model.to(dtype=torch.bfloat16)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
images=image_tensor.unsqueeze(0).to(dtype=torch.bfloat16).cuda(),
stopping_criteria=[stopping_criteria],
use_cache=True,
**self.kwargs)
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(
f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids'
)
outputs = self.tokenizer.batch_decode(
output_ids[:, input_token_len:], skip_special_tokens=True
)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[: -len(stop_str)]
outputs = outputs.strip()
return outputs
Gemma Terms of Use
Last modified: April 1, 2024
By using, reproducing, modifying, distributing, performing or displaying any portion or element of Gemma, Model Derivatives including via any Hosted Service, (each as defined below) (collectively, the "Gemma Services") or otherwise accepting the terms of this Agreement, you agree to be bound by this Agreement.
Section 1: DEFINITIONS
1.1 Definitions
(a) "Agreement" or "Gemma Terms of Use" means these terms and conditions that govern the use, reproduction, Distribution or modification of the Gemma Services and any terms and conditions incorporated by reference.
(b) "Distribution" or "Distribute" means any transmission, publication, or other sharing of Gemma or Model Derivatives to a third party, including by providing or making Gemma or its functionality available as a hosted service via API, web access, or any other electronic or remote means ("Hosted Service").
(c) "Gemma" means the set of machine learning language models, trained model weights and parameters identified at ai.google.dev/gemma, regardless of the source that you obtained it from.
(d) "Google" means Google LLC.
(e) "Model Derivatives" means all (i) modifications to Gemma, (ii) works based on Gemma, or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Gemma, to that model in order to cause that model to perform similarly to Gemma, including distillation methods that use intermediate data representations or methods based on the generation of synthetic data Outputs by Gemma for training that model. For clarity, Outputs are not deemed Model Derivatives.
(f) "Output" means the information content output of Gemma or a Model Derivative that results from operating or otherwise using Gemma or the Model Derivative, including via a Hosted Service.
1.2
As used in this Agreement, "including" means "including without limitation".
Section 2: ELIGIBILITY AND USAGE
2.1 Eligibility
You represent and warrant that you have the legal capacity to enter into this Agreement (including being of sufficient age of consent). If you are accessing or using any of the Gemma Services for or on behalf of a legal entity, (a) you are entering into this Agreement on behalf of yourself and that legal entity, (b) you represent and warrant that you have the authority to act on behalf of and bind that entity to this Agreement and (c) references to "you" or "your" in the remainder of this Agreement refers to both you (as an individual) and that entity.
2.2 Use
You may use, reproduce, modify, Distribute, perform or display any of the Gemma Services only in accordance with the terms of this Agreement, and must not violate (or encourage or permit anyone else to violate) any term of this Agreement.
Section 3: DISTRIBUTION AND RESTRICTIONS
3.1 Distribution and Redistribution
You may reproduce or Distribute copies of Gemma or Model Derivatives if you meet all of the following conditions:
You must include the use restrictions referenced in Section 3.2 as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Gemma or Model Derivatives and you must provide notice to subsequent users you Distribute to that Gemma or Model Derivatives are subject to the use restrictions in Section 3.2.
You must provide all third party recipients of Gemma or Model Derivatives a copy of this Agreement.
You must cause any modified files to carry prominent notices stating that you modified the files.
All Distributions (other than through a Hosted Service) must be accompanied by a "Notice" text file that contains the following notice: "Gemma is provided under and subject to the Gemma Terms of Use found at ai.google.dev/gemma/terms".
You may add your own intellectual property statement to your modifications and, except as set forth in this Section, may provide additional or different terms and conditions for use, reproduction, or Distribution of your modifications, or for any such Model Derivatives as a whole, provided your use, reproduction, modification, Distribution, performance, and display of Gemma otherwise complies with the terms and conditions of this Agreement. Any additional or different terms and conditions you impose must not conflict with the terms of this Agreement.
3.2 Use Restrictions
You must not use any of the Gemma Services:
for the restricted uses set forth in the Gemma Prohibited Use Policy at ai.google.dev/gemma/prohibited_use_policy ("Prohibited Use Policy"), which is hereby incorporated by reference into this Agreement; or
in violation of applicable laws and regulations.
To the maximum extent permitted by law, Google reserves the right to restrict (remotely or otherwise) usage of any of the Gemma Services that Google reasonably believes are in violation of this Agreement.
3.3 Generated Output
Google claims no rights in Outputs you generate using Gemma. You and your users are solely responsible for Outputs and their subsequent uses.
Section 4: ADDITIONAL PROVISIONS
4.1 Updates
Google may update Gemma from time to time.
4.2 Trademarks
Nothing in this Agreement grants you any rights to use Google's trademarks, trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between you and Google. Google reserves any rights not expressly granted herein.
4.3 DISCLAIMER OF WARRANTY
UNLESS REQUIRED BY APPLICABLE LAW, THE GEMMA SERVICES, AND OUTPUTS, ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE GEMMA SERVICES OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR USE OR DISTRIBUTION OF ANY OF THE GEMMA SERVICES OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
4.4 LIMITATION OF LIABILITY
TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY, CONTRACT, OR OTHERWISE, UNLESS REQUIRED BY APPLICABLE LAW, SHALL GOOGLE OR ITS AFFILIATES BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL, OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO, ANY OF THE GEMMA SERVICES OR OUTPUTS EVEN IF GOOGLE OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
4.5 Term, Termination, and Survival
The term of this Agreement will commence upon your acceptance of this Agreement (including acceptance by your use, modification, or Distribution, reproduction, performance or display of any portion or element of the Gemma Services) and will continue in full force and effect until terminated in accordance with the terms of this Agreement. Google may terminate this Agreement if you are in breach of any term of this Agreement. Upon termination of this Agreement, you must delete and cease use and Distribution of all copies of Gemma and Model Derivatives in your possession or control. Sections 1, 2.1, 3.3, 4.2 to 4.9 shall survive the termination of this Agreement.
4.6 Governing Law and Jurisdiction
This Agreement will be governed by the laws of the State of California without regard to choice of law principles. The UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The state and federal courts of Santa Clara County, California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
4.7 Severability
If any provision of this Agreement is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.
4.8 Entire Agreement
This Agreement states all the terms agreed between the parties and supersedes all other agreements between the parties as of the date of acceptance relating to its subject matter.
4.9 No Waiver
Google will not be treated as having waived any rights by not exercising (or delaying the exercise of) any rights under this Agreement.
META LLAMA 3 COMMUNITY LICENSE AGREEMENT
Meta Llama 3 Version Release Date: April 18, 2024
“Agreement” means the terms and conditions for use, reproduction, distribution and modification of the Llama Materials set forth herein.
“Documentation” means the specifications, manuals and documentation accompanying Meta Llama 3 distributed by Meta at https://llama.meta.com/get-started/.
“Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
“Meta Llama 3” means the foundational large language models and software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Meta at https://llama.meta.com/llama-downloads.
“Llama Materials” means, collectively, Meta’s proprietary Meta Llama 3 and Documentation (and any portion thereof) made available under this Agreement.
“Meta” or “we” means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).
By clicking “I Accept” below or by using or distributing any portion or element of the Llama Materials, you agree to be bound by this Agreement.
1. License Rights and Redistribution.
a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the Llama Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Llama Materials.
b. Redistribution and Use.
i. If you distribute or make available the Llama Materials (or any derivative works thereof), or a product or service that uses any of them, including another AI model, you shall (A) provide a copy of this Agreement with any such Llama Materials; and (B) prominently display “Built with Meta Llama 3” on a related website, user interface, blogpost, about page, or product documentation. If you use the Llama Materials to create, train, fine tune, or otherwise improve an AI model, which is distributed or made available, you shall also include “Llama 3” at the beginning of any such AI model name.
ii. If you receive Llama Materials, or any derivative works thereof, from a Licensee as part of an integrated end user product, then Section 2 of this Agreement will not apply to you.
iii. You must retain in all copies of the Llama Materials that you distribute the following attribution notice within a “Notice” text file distributed as a part of such copies: “Meta Llama 3 is licensed under the Meta Llama 3 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved.”
iv. Your use of the Llama Materials must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Llama Materials (available at https://llama.meta.com/llama3/use-policy), which is hereby incorporated by reference into this Agreement.
v. You will not use the Llama Materials or any output or results of the Llama Materials to improve any other large language model (excluding Meta Llama 3 or derivative works thereof).
2. Additional Commercial Terms. If, on the Meta Llama 3 version release date, the monthly active users of the products or services made available by or for Licensee, or Licensee’s affiliates, is greater than 700 million monthly active users in the preceding calendar month, you must request a license from Meta, which Meta may grant to you in its sole discretion, and you are not authorized to exercise any of the rights under this Agreement unless or until Meta otherwise expressly grants you such rights.
3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE LLAMA MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS.
4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
5. Intellectual Property.
a. No trademark licenses are granted under this Agreement, and in connection with the Llama Materials, neither Meta nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Llama Materials or as set forth in this Section 5(a). Meta hereby grants you a license to use “Llama 3” (the “Mark”) solely as required to comply with the last sentence of Section 1.b.i. You will comply with Meta’s brand guidelines (currently accessible at https://about.meta.com/brand/resources/meta/company-brand/ ). All goodwill arising out of your use of the Mark will inure to the benefit of Meta.
b. Subject to Meta’s ownership of Llama Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Llama Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
c. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Llama Materials or Meta Llama 3 outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Llama Materials.
6. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Llama Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Llama Materials. Sections 3, 4 and 7 shall survive the termination of this Agreement.
7. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
Meta Llama 3 Acceptable Use Policy
Meta is committed to promoting safe and fair use of its tools and features, including Meta Llama 3. If you access or use Meta Llama 3, you agree to this Acceptable Use Policy (“Policy”). The most recent copy of this policy can be found at https://llama.meta.com/llama3/use-policy
Prohibited Uses
We want everyone to use Meta Llama 3 safely and responsibly. You agree you will not use, or allow others to use, Meta Llama 3 to:
1. Violate the law or others’ rights, including to:
a. Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal or unlawful activity or content, such as:
i. Violence or terrorism
ii. Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content or failure to report Child Sexual Abuse Material
iii. Human trafficking, exploitation, and sexual violence
iv. The illegal distribution of information or materials to minors, including obscene materials, or failure to employ legally required age-gating in connection with such information or materials.
v. Sexual solicitation
vi. Any other criminal activity
b. Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or bullying of individuals or groups of individuals
c. Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services
d. Engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or related professional practices
e. Collect, process, disclose, generate, or infer health, demographic, or other sensitive personal or private information about individuals without rights and consents required by applicable laws
f. Engage in or facilitate any action or generate any content that infringes, misappropriates, or otherwise violates any third-party rights, including the outputs or results of any products or services using the Llama Materials
g. Create, generate, or facilitate the creation of malicious code, malware, computer viruses or do anything else that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system
2. Engage in, promote, incite, facilitate, or assist in the planning or development of activities that present a risk of death or bodily harm to individuals, including use of Meta Llama 3 related to the following:
a. Military, warfare, nuclear industries or applications, espionage, use for materials or activities that are subject to the International Traffic Arms Regulations (ITAR) maintained by the United States Department of State
b. Guns and illegal weapons (including weapon development)
c. Illegal drugs and regulated/controlled substances
d. Operation of critical infrastructure, transportation technologies, or heavy machinery
e. Self-harm or harm to others, including suicide, cutting, and eating disorders
f. Any content intended to incite or promote violence, abuse, or any infliction of bodily harm to an individual
3. Intentionally deceive or mislead others, including use of Meta Llama 3 related to the following:
a. Generating, promoting, or furthering fraud or the creation or promotion of disinformation
b. Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content
c. Generating, promoting, or further distributing spam
d. Impersonating another individual without consent, authorization, or legal right
e. Representing that the use of Meta Llama 3 or outputs are human-generated
f. Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement
g. Fail to appropriately disclose to end users any known dangers of your AI system
Please report any violation of this Policy, software “bug,” or other problems that could lead to a violation of this Policy through one of the following means:
* Reporting issues with the model: https://github.com/meta-llama/llama3
* Reporting risky content generated by the model: developers.facebook.com/llama_output_feedback
* Reporting bugs and security concerns: facebook.com/whitehat/info
* Reporting violations of the Acceptable Use Policy or unlicensed uses of Meta Llama 3: LlamaUseReport@meta.com
Tongyi Qianwen LICENSE AGREEMENT
Tongyi Qianwen Release Date: August 3, 2023
By clicking to agree or by using or distributing any portion or element of the Tongyi Qianwen Materials, you will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
1. Definitions
a. This Tongyi Qianwen LICENSE AGREEMENT (this "Agreement") shall mean the terms and conditions for use, reproduction, distribution and modification of the Materials as defined by this Agreement.
b. "We"(or "Us") shall mean Alibaba Cloud.
c. "You" (or "Your") shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Materials for any purpose and in any field of use.
d. "Third Parties" shall mean individuals or legal entities that are not under common control with Us or You.
e. "Tongyi Qianwen" shall mean the large language models (including Qwen model and Qwen-Chat model), and software and algorithms, consisting of trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Us.
f. "Materials" shall mean, collectively, Alibaba Cloud's proprietary Tongyi Qianwen and Documentation (and any portion thereof) made available under this Agreement.
g. "Source" form shall mean the preferred form for making modifications, including but not limited to model source code, documentation source, and configuration files.
h. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation,
and conversions to other media types.
2. Grant of Rights
You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Alibaba Cloud's intellectual property or other rights owned by Us embodied in the Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Materials.
3. Redistribution
You may reproduce and distribute copies of the Materials or derivative works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
a. You shall give any other recipients of the Materials or derivative works a copy of this Agreement;
b. You shall cause any modified files to carry prominent notices stating that You changed the files;
c. You shall retain in all copies of the Materials that You distribute the following attribution notices within a "Notice" text file distributed as a part of such copies: "Tongyi Qianwen is licensed under the Tongyi Qianwen LICENSE AGREEMENT, Copyright (c) Alibaba Cloud. All Rights Reserved."; and
d. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such derivative works as a whole, provided Your use, reproduction, and distribution of the work otherwise complies with the terms and conditions of this Agreement.
4. Restrictions
If you are commercially using the Materials, and your product or service has more than 100 million monthly active users, You shall request a license from Us. You cannot exercise your rights under this Agreement without our express authorization.
5. Rules of use
a. The Materials may be subject to export controls or restrictions in China, the United States or other countries or regions. You shall comply with applicable laws and regulations in your use of the Materials.
b. You can not use the Materials or any output therefrom to improve any other large language model (excluding Tongyi Qianwen or derivative works thereof).
6. Intellectual Property
a. We retain ownership of all intellectual property rights in and to the Materials and derivatives made by or for Us. Conditioned upon compliance with the terms and conditions of this Agreement, with respect to any derivative works and modifications of the Materials that are made by you, you are and will be the owner of such derivative works and modifications.
b. No trademark license is granted to use the trade names, trademarks, service marks, or product names of Us, except as required to fulfill notice requirements under this Agreement or as required for reasonable and customary use in describing and redistributing the Materials.
c. If you commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any entity alleging that the Materials or any output therefrom, or any part of the foregoing, infringe any intellectual property or other right owned or licensable by you, then all licences granted to you under this Agreement shall terminate as of the date such lawsuit or other proceeding is commenced or brought.
7. Disclaimer of Warranty and Limitation of Liability
a. We are not obligated to support, update, provide training for, or develop any further version of the Tongyi Qianwen Materials or to grant any license thereto.
b. THE MATERIALS ARE PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. WE MAKE NO WARRANTY AND ASSUME NO RESPONSIBILITY FOR THE SAFETY OR STABILITY OF THE MATERIALS AND ANY OUTPUT THEREFROM.
c. IN NO EVENT SHALL WE BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE MATERIALS OR ANY OUTPUT OF IT, NO MATTER HOW IT’S CAUSED.
d. You will defend, indemnify and hold harmless Us from and against any claim by any third party arising out of or related to your use or distribution of the Materials.
8. Survival and Termination.
a. The term of this Agreement shall commence upon your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
b. We may terminate this Agreement if you breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, you must delete and cease use of the Materials. Sections 7 and 9 shall survive the termination of this Agreement.
9. Governing Law and Jurisdiction.
a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
b. The People's Courts in Hangzhou City shall have exclusive jurisdiction over any dispute arising out of this Agreement.
icon.png (53.8 KB)
import os

# Disable HuggingFace tokenizers parallelism to silence fork-related warnings
# when the evaluation code spawns worker processes.
os.environ["TOKENIZERS_PARALLELISM"] = "false"