from PIL import Image
import torch
from .base import BaseModel
from ..smp import *
class Phi3Vision(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
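# Harness flags: INSTALL_REQ marks models that need extra packages beyond the base
# requirements, and INTERLEAVE marks models that accept interleaved multi-image inputs
# (rather than a single image per query).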
def __init__(self, model_path='microsoft/Phi-3-vision-128k-instruct', **kwargs):
try:
from transformers import AutoProcessor, AutoModelForCausalLM
except ImportError:
warnings.warn('Please install the latest version of transformers.')
sys.exit(-1)
model = AutoModelForCausalLM.from_pretrained(
model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto').eval()
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
self.model = model
self.processor = processor
self.kwargs = kwargs
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
messages = [
{'role': 'user', 'content': f'<|image_1|>\n{prompt}'}
]
prompt = self.processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(prompt, [image], return_tensors='pt').to('cuda')
generation_args = {
'max_new_tokens': 500,
'temperature': 0.0,
'do_sample': False,
}
generation_args.update(self.kwargs)
generate_ids = self.model.generate(
**inputs,
eos_token_id=self.processor.tokenizer.eos_token_id,
**generation_args
)
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = self.processor.batch_decode(
generate_ids,
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)[0]
return response
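# Illustrative shape of the `message` argument consumed above (a flat list of typed parts):
#   [{'type': 'image', 'value': '/path/to/image.jpg'},
#    {'type': 'text',  'value': 'Describe the image.'}]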
def chat_inner(self, message, dataset=None):
messages = []
image_cnt = 1
image_list = []
for msg in message:
content = ''
# If message is just text in the conversation
if len(msg['content']) == 1 and msg['content'][0]['type'] == 'text':
msg_new = {'role': msg['role'], 'content': msg['content'][0]['value']}
messages.append(msg_new)
continue
# If both image & text are present
for x in msg['content']:
if x['type'] == 'text':
content += x['value']
elif x['type'] == 'image':
image = Image.open(x['value']).convert('RGB')
content += f'<|image_{image_cnt}|>\n'
image_list.append(image)
image_cnt += 1
msg_new = {'role': msg['role'], 'content': content}
messages.append(msg_new)
prompt = self.processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(prompt, image_list, return_tensors='pt').to('cuda')
generation_args = {
'max_new_tokens': 500,
'temperature': 0.0,
'do_sample': False,
}
generation_args.update(self.kwargs)
generate_ids = self.model.generate(
**inputs,
eos_token_id=self.processor.tokenizer.eos_token_id,
**generation_args
)
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = self.processor.batch_decode(
generate_ids,
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)[0]
return response
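# Illustrative multi-turn input for chat_inner: a list of role/content turns, e.g.
#   [{'role': 'user', 'content': [{'type': 'image', 'value': 'cat.jpg'},
#                                 {'type': 'text', 'value': 'What animal is this?'}]},
#    {'role': 'assistant', 'content': [{'type': 'text', 'value': 'A cat.'}]},
#    {'role': 'user', 'content': [{'type': 'text', 'value': 'What color is it?'}]}]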
class Phi3_5Vision(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='microsoft/Phi-3.5-vision-instruct', **kwargs):
try:
from transformers import AutoProcessor, AutoModelForCausalLM
except ImportError:
warnings.warn('Please install the latest version of transformers.')
sys.exit(-1)
model = AutoModelForCausalLM.from_pretrained(
model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto',
_attn_implementation='flash_attention_2').eval()
# For best performance, use num_crops=4 for multi-frame inputs and num_crops=16 for single-frame inputs.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True, num_crops=4)
self.model = model
self.processor = processor
self.kwargs = kwargs
def generate_inner(self, message, dataset=None):
prompt = '\n'.join([msg['value'] for msg in message if msg['type'] == 'text'])
images = [Image.open(msg['value']).convert('RGB') for msg in message if msg['type'] == 'image']
num_images = len(images)
placeholder = ''
for i in range(1, num_images + 1):
placeholder += f'<|image_{i}|>\n'
messages = [
{'role': 'user', 'content': placeholder + prompt}
]
prompt = self.processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(prompt, images, return_tensors='pt').to('cuda')
generation_args = {
'max_new_tokens': 1000,
'temperature': 0.0,
'do_sample': False,
}
generation_args.update(self.kwargs)
generate_ids = self.model.generate(
**inputs,
eos_token_id=self.processor.tokenizer.eos_token_id,
**generation_args
)
# remove input tokens
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = self.processor.batch_decode(
generate_ids,
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)[0]
return response
import torch
from PIL import Image
from .base import BaseModel
from ..smp import *
import warnings
from huggingface_hub import snapshot_download
class Pixtral(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='mistralai/Pixtral-12B-2409', **kwargs):
self.model_path = model_path
try:
from mistral_inference.transformer import Transformer
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
except ImportError as err:
warnings.warn('Please install `mistral-inference` and `mistral_common`')
raise err
if get_cache_path(model_path) is None:
snapshot_download(repo_id=model_path)
cache_path = get_cache_path(self.model_path)
self.tokenizer = MistralTokenizer.from_file(f'{cache_path}/tekken.json')
model = Transformer.from_folder(cache_path, device='cpu')
model.cuda()
self.model = model
self.max_tokens = 512
def generate_inner(self, message, dataset=None):
try:
from mistral_inference.generate import generate
from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageURLChunk
from mistral_common.protocol.instruct.request import ChatCompletionRequest
except ImportError as err:
warnings.warn('Please install `mistral-inference` and `mistral_common`')
raise err
msg_new = []
for msg in message:
tp, val = msg['type'], msg['value']
if tp == 'text':
msg_new.append(TextChunk(text=val))
elif tp == 'image':
b64 = encode_image_file_to_base64(val)
image_url = f'data:image/jpeg;base64,{b64}'
msg_new.append(ImageURLChunk(image_url=image_url))
completion_request = ChatCompletionRequest(messages=[UserMessage(content=msg_new)])
encoded = self.tokenizer.encode_chat_completion(completion_request)
images = encoded.images
tokens = encoded.tokens
out_tokens, _ = generate(
[tokens],
self.model,
images=[images],
max_tokens=self.max_tokens,
temperature=0,
eos_id=self.tokenizer.instruct_tokenizer.tokenizer.eos_id)
result = self.tokenizer.decode(out_tokens[0])
return result
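# Note: Pixtral receives every image as a base64 data URL (ImageURLChunk) inside a single
# UserMessage, so text and image chunks keep their original interleaved order.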
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import warnings
import os.path as osp
from PIL import Image
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
class QH_360VL(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='qihoo360/360VL-70B', **kwargs):
assert model_path is not None
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(model_path,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
device_map='auto',
trust_remote_code=True).eval()
vision_tower = self.model.get_vision_tower()
vision_tower.load_model()
vision_tower.to(device='cuda', dtype=torch.float16)
self.image_processor = vision_tower.image_processor
self.tokenizer.pad_token = self.tokenizer.eos_token
self.kwargs = kwargs
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
torch.cuda.empty_cache()
def generate(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
print(prompt)
image = Image.open(image_path).convert('RGB')
terminators = [
self.tokenizer.convert_tokens_to_ids('<|eot_id|>',)
]
inputs = self.model.build_conversation_input_ids(self.tokenizer,
query=prompt,
image=image,
image_processor=self.image_processor)
input_ids = inputs['input_ids'].to(device='cuda', non_blocking=True)
images = inputs['image'].to(dtype=torch.float16, device='cuda', non_blocking=True)
output_ids = self.model.generate(input_ids=input_ids,
images=images,
do_sample=False,
num_beams=1,
max_new_tokens=512,
eos_token_id=terminators,
use_cache=True)
input_token_len = input_ids.shape[1]
outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
response = outputs.strip()
return response
from .model import Qwen2VLChat
from .prompt import Qwen2VLPromptMixin
from __future__ import annotations
import os
import warnings
import torch
from ..base import BaseModel
from .prompt import Qwen2VLPromptMixin
def ensure_image_url(image: str) -> str:
prefixes = ['http://', 'https://', 'file://', 'data:image;']
if any(image.startswith(prefix) for prefix in prefixes):
return image
if os.path.exists(image):
return 'file://' + image
raise ValueError(f'Invalid image: {image}')
def ensure_video_url(video: str) -> str:
prefixes = ['http://', 'https://', 'file://', 'data:video;']
if any(video.startswith(prefix) for prefix in prefixes):
return video
if os.path.exists(video):
return 'file://' + video
raise ValueError(f'Invalid video: {video}')
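# Illustrative behavior of the two URL helpers above (assuming the local file exists):
#   ensure_image_url('/tmp/cat.jpg')            -> 'file:///tmp/cat.jpg'
#   ensure_image_url('https://x.org/cat.jpg')   -> 'https://x.org/cat.jpg'
#   ensure_video_url('/tmp/clip.mp4')           -> 'file:///tmp/clip.mp4'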
class Qwen2VLChat(Qwen2VLPromptMixin, BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
VIDEO_LLM = True
def __init__(
self,
model_path: str,
min_pixels: int | None = None,
max_pixels: int | None = None,
max_new_tokens=2048,
top_p=0.001,
top_k=1,
temperature=0.01,
repetition_penalty=1.0,
use_custom_prompt: bool = True,
system_prompt: str | None = None,
verbose: bool = True,
):
super().__init__(use_custom_prompt=use_custom_prompt)
self.min_pixels = min_pixels
self.max_pixels = max_pixels
self.generate_kwargs = dict(
max_new_tokens=max_new_tokens,
top_p=top_p,
top_k=top_k,
temperature=temperature,
repetition_penalty=repetition_penalty,
)
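# With top_k=1 and a near-zero temperature, these defaults make decoding effectively
# greedy, so evaluation outputs stay close to deterministic.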
self.system_prompt = system_prompt
self.verbose = verbose
self.fps = 2.0
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor
assert model_path is not None
self.model_path = model_path
self.processor = Qwen2VLProcessor.from_pretrained(model_path)
if '72b' not in self.model_path.lower():
self.model = Qwen2VLForConditionalGeneration.from_pretrained(
model_path, torch_dtype='auto', device_map='cpu', attn_implementation='flash_attention_2'
)
self.model.cuda().eval()
else:
self.model = Qwen2VLForConditionalGeneration.from_pretrained(
model_path, torch_dtype='auto', device_map='auto', attn_implementation='flash_attention_2'
)
self.model.eval()
torch.cuda.empty_cache()
def _prepare_content(self, inputs: list[dict[str, str]], dataset: str | None = None) -> list[dict[str, str]]:
"""
inputs list[dict[str, str]], each dict has keys: ['type', 'value']
"""
content = []
for s in inputs:
if s['type'] == 'image':
item = {'type': 'image', 'image': ensure_image_url(s['value'])}
if dataset == 'OCRBench':
item['min_pixels'] = 10 * 10 * 28 * 28
warnings.warn(f"OCRBench dataset uses custom min_pixels={item['min_pixels']}")
if self.max_pixels is not None:
item['max_pixels'] = self.max_pixels
else:
if self.min_pixels is not None:
item['min_pixels'] = self.min_pixels
if self.max_pixels is not None:
item['max_pixels'] = self.max_pixels
elif s['type'] == 'video':
item = {'type': 'video', 'video': ensure_video_url(s['value'])}
if self.fps is not None:
item['fps'] = self.fps
elif s['type'] == 'text':
item = {'type': 'text', 'text': s['value']}
else:
raise ValueError(f"Invalid message type: {s['type']}, {s}")
content.append(item)
return content
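# Illustrative output of _prepare_content for one image plus a question (the pixel limits
# are added only when min_pixels / max_pixels are configured):
#   [{'type': 'image', 'image': 'file:///path/to/img.jpg'},
#    {'type': 'text', 'text': 'What is shown in the image?'}]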
def generate_inner(self, message, dataset=None):
try:
from qwen_vl_utils import process_vision_info
except ImportError:
warnings.warn("qwen_vl_utils not found, please install it via 'pip install qwen-vl-utils'")
raise
messages = []
if self.system_prompt is not None:
messages.append({'role': 'system', 'content': self.system_prompt})
messages.append({'role': 'user', 'content': self._prepare_content(message, dataset=dataset)})
if self.verbose:
print(f'\033[31m{messages}\033[0m')
text = self.processor.apply_chat_template([messages], tokenize=False, add_generation_prompt=True)
images, videos = process_vision_info([messages])
inputs = self.processor(text=text, images=images, videos=videos, padding=True, return_tensors='pt')
inputs = inputs.to('cuda')
generated_ids = self.model.generate(
**inputs,
**self.generate_kwargs,
)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, generated_ids)
]
out = self.processor.tokenizer.batch_decode(
generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
response = out[0]
if self.verbose:
print(f'\033[32m{response}\033[0m')
return response
from __future__ import annotations
class Qwen2VLPromptMixin:
"""
Mixin class for Qwen2VLChat to build custom prompt for different datasets.
Requires the following methods to be implemented in the subclass:
- dump_image(line, dataset: str) -> str | list[str]
Implements the following methods:
- use_custom_prompt(dataset: str) -> bool
- build_prompt(line, dataset: str) -> list[dict[str, str]]
"""
def __init__(self, *args, use_custom_prompt: bool = True, **kwargs) -> None:
super().__init__(*args, **kwargs)
self._use_custom_prompt = use_custom_prompt
def set_dump_image(self, dump_image_func):
self.dump_image_func = dump_image_func
def dump_image(self, line, dataset):
return self.dump_image_func(line)
def use_custom_prompt(self, dataset: str) -> bool:
from vlmeval.dataset import DATASET_TYPE
dataset_type = DATASET_TYPE(dataset, default=None)
if not self._use_custom_prompt:
return False
if dataset in {'MMMU_DEV_VAL', 'MMMU_TEST'}:
return True
if dataset_type == 'MCQ':
return True
if dataset_type == 'Y/N' and dataset in {'HallusionBench', 'POPE'}: # MME has its own prompt
return True
if dataset_type == 'VQA' and dataset not in {'MMVet'}: # MMVet VQA has its own prompt
return True
return False
def build_prompt(self, line, dataset: str) -> list[dict[str, str]]:
from vlmeval.dataset import DATASET_TYPE
if dataset in {'MMMU_DEV_VAL', 'MMMU_TEST'}:
return self._build_mmmu_prompt(line, dataset)
dataset_type = DATASET_TYPE(dataset, default=None)
if dataset_type == 'MCQ':
return self._build_mcq_prompt(line, dataset)
if dataset_type == 'Y/N':
return self._build_yorn_prompt(line, dataset)
if dataset_type == 'VQA':
return self._build_vqa_prompt(line, dataset)
raise ValueError(f'Unsupported dataset: {dataset}')
def _build_mmmu_prompt(self, line, dataset: str) -> list[dict[str, str]]:
"""change the prompt for MMMU dataset: keep all images at beginning."""
import string
import pandas as pd
tgt_path = self.dump_image(line, dataset)
question = line['question']
options = {cand: line[cand] for cand in string.ascii_uppercase if cand in line and not pd.isna(line[cand])}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
prompt += 'Please select the correct answer from the options above. \n'
prompt = prompt.rstrip()
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def _build_mcq_prompt(self, line, dataset: str) -> list[dict[str, str]]:
"""change the prompt for MCQ dataset: use chinese prompt if the question contains chinese characters."""
MCQ_CN_PROMPT = '请直接回答选项字母。'
MCQ_EN_PROMPT = 'Please select the correct answer from the options above.'
import string
import pandas as pd
def cn_string(s):
import re
if re.search('[\u4e00-\u9fff]', s):
return True
return False
tgt_path = self.dump_image(line, dataset)
question = line['question']
options = {cand: line[cand] for cand in string.ascii_uppercase if cand in line and not pd.isna(line[cand])}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
prompt += MCQ_CN_PROMPT if cn_string(prompt) else MCQ_EN_PROMPT
prompt = prompt.rstrip()
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def _build_yorn_prompt(self, line, dataset: str) -> list[dict[str, str]]:
"""change the prompt for YORN dataset:"""
YORN_PROMPT = ' Please answer yes or no.'
tgt_path = self.dump_image(line, dataset)
question = line['question']
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=question))
assert msgs[-1]['type'] == 'text'
msgs[-1]['value'] += YORN_PROMPT
return msgs
def _build_vqa_prompt(self, line, dataset: str) -> list[dict[str, str]]:
"""change the prompt for VQA dataset:"""
VQA_PROMPT = '\nPlease try to answer the question with short words or phrases if possible.'
tgt_path = self.dump_image(line, dataset)
question = line['question']
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=question))
assert msgs[-1]['type'] == 'text'
msgs[-1]['value'] += VQA_PROMPT
return msgs
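# Rough sketch of how this mixin is used (exact call sites live in the evaluation loop):
# the harness first calls use_custom_prompt(dataset) and, when it returns True, passes the
# list of typed messages returned by build_prompt(line, dataset) to the model instead of
# the dataset's default prompt.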
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import warnings
import copy as cp
from .base import BaseModel
from ..smp import isimg, listinstr
from ..dataset import DATASET_TYPE
class QwenVL(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='Qwen/Qwen-VL', **kwargs):
assert model_path is not None
self.model_path = model_path
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eod_id
self.tokenizer = tokenizer
self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda', trust_remote_code=True).eval()
default_kwargs = dict(
do_sample=False,
num_beams=1,
max_new_tokens=512,
min_new_tokens=1,
num_return_sequences=1,
use_cache=True,
output_hidden_states=True,
pad_token_id=tokenizer.eod_id,
eos_token_id=tokenizer.eod_id)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
torch.cuda.empty_cache()
def adjust_kwargs(self, dataset):
kwargs = cp.deepcopy(self.kwargs)
if DATASET_TYPE(dataset) in ['MCQ', 'Y/N']:
kwargs['max_new_tokens'] = 32
elif DATASET_TYPE(dataset) == 'Caption' and 'COCO' in dataset:
kwargs['max_new_tokens'] = 32
elif DATASET_TYPE(dataset) == 'VQA':
if listinstr(['OCRVQA', 'ChartQA', 'DocVQA'], dataset):
kwargs['max_new_tokens'] = 100
elif listinstr(['TextVQA'], dataset):
kwargs['max_new_tokens'] = 10
return kwargs
def generate_inner(self, message, dataset=None):
if dataset is not None:
kwargs = self.adjust_kwargs(dataset)
else:
kwargs = self.kwargs
prompt = ''
for s in message:
if s['type'] == 'image':
prompt += f'<img>{s["value"]}</img>'
elif s['type'] == 'text':
prompt += s['value']
if dataset is not None and DATASET_TYPE(dataset) == 'VQA':
prompt += ' Answer:'
encoded = self.tokenizer([prompt], return_tensors='pt', padding='longest')
input_ids = encoded.input_ids.to('cuda')
attention_mask = encoded.attention_mask.to('cuda')
pred = self.model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
**kwargs)
answer = self.tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip()
return answer
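# Illustrative prompt assembled above for one image and a VQA question:
#   '<img>/path/to/doc.png</img>What does the sign say? Answer:'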
class QwenVLChat(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='Qwen/Qwen-VL-Chat', **kwargs):
assert model_path is not None
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda', trust_remote_code=True).eval()
torch.cuda.empty_cache()
self.kwargs = kwargs
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
def build_history(self, message):
def concat_tilist(tilist):
image_cnt = 1
prompt = ''
for item in tilist:
if item['type'] == 'text':
prompt += item['value']
elif item['type'] == 'image':
prompt += f"Picture {image_cnt}: <img>{item['value']}</img>\n"
image_cnt += 1
return prompt
assert len(message) % 2 == 0
hist = []
for i in range(len(message) // 2):
m1, m2 = message[2 * i], message[2 * i + 1]
assert m1['role'] == 'user' and m2['role'] == 'assistant'
hist.append((concat_tilist(m1['content']), concat_tilist(m2['content'])))
return hist
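# Illustrative history produced by build_history for one completed turn:
#   [('Picture 1: <img>/path/to/cat.jpg</img>\nWhat animal is this?', 'A cat.')]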
def generate_inner(self, message, dataset=None):
vl_list = [{'image': s['value']} if s['type'] == 'image' else {'text': s['value']} for s in message]
query = self.tokenizer.from_list_format(vl_list)
response, _ = self.model.chat(self.tokenizer, query=query, history=None, **self.kwargs)
return response
def chat_inner(self, message, dataset=None):
assert len(message) % 2 == 1 and message[-1]['role'] == 'user'
history = self.build_history(message[:-1])
vl_list = [
{'image': s['value']} if s['type'] == 'image' else {'text': s['value']}
for s in message[-1]['content']
]
query = self.tokenizer.from_list_format(vl_list)
response, _ = self.model.chat(self.tokenizer, query=query, history=history, **self.kwargs)
return response
import sys
import torch
import os.path as osp
import os
import warnings
from .base import BaseModel
from ..dataset import DATASET_TYPE
from ..smp import *
from PIL import Image
'''
Please follow the instructions to download ckpt.
https://github.com/RBDash-Team/RBDash?tab=readme-ov-file#pretrained-weights
'''
class RBDash(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, model_path, root=None, conv_mode='qwen', **kwargs):
from huggingface_hub import snapshot_download
if root is None:
warnings.warn('Please set `root` to the RBDash code directory, '
'cloned from "https://github.com/RBDash-Team/RBDash?tab=readme-ov-file". ')
sys.exit(-1)
warnings.warn('Please follow the RBDash instructions to put the checkpoint files in the right place, '
'as described at https://github.com/RBDash-Team/RBDash?tab=readme-ov-file#structure')
assert model_path == 'RBDash-Team/RBDash-v1.2-72b', 'We only support RBDash-v1.2-72b for now'
sys.path.append(root)
try:
from rbdash.model.builder import load_pretrained_model
from rbdash.mm_utils import get_model_name_from_path
except ImportError:
raise ImportError(
'Please first install RBDash and set the root path to use RBDash, '
'which is cloned from here: "https://github.com/RBDash-Team/RBDash?tab=readme-ov-file" '
)
VLMEvalKit_path = os.getcwd()
os.chdir(root)
warnings.warn('Please set `root` to the RBDash code directory, '
'cloned from "https://github.com/RBDash-Team/RBDash?tab=readme-ov-file". ')
try:
model_name = get_model_name_from_path(model_path)
except Exception:
raise ImportError(
'Please follow the RBDash instructions to put the checkpoint file in the right place, '
'which can be found at https://github.com/RBDash-Team/RBDash?tab=readme-ov-file#structure'
)
download_model_path = snapshot_download(model_path)
internvit_local_dir = './model_zoo/OpenGVLab/InternViT-6B-448px-V1-5'
os.makedirs(internvit_local_dir, exist_ok=True)
snapshot_download('OpenGVLab/InternViT-6B-448px-V1-5', local_dir=internvit_local_dir)
convnext_local_dir = './model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup'
os.makedirs(convnext_local_dir, exist_ok=True)
snapshot_download('laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup', local_dir=convnext_local_dir)
preprocessor_url = 'https://huggingface.co/openai/clip-vit-large-patch14-336/blob/main/preprocessor_config.json'
download_file_path = osp.join(convnext_local_dir, 'preprocessor_config.json')
if not osp.exists(download_file_path):
print(f'download preprocessor to {download_file_path}')
download_file(preprocessor_url, download_file_path)
tokenizer, model, image_processor, image_processor_aux, context_len = load_pretrained_model(
download_model_path, None, model_name, device_map='auto'
)
os.chdir(VLMEvalKit_path)
self.model = model
self.tokenizer = tokenizer
self.image_processor = image_processor
self.image_processor_aux = image_processor_aux
self.conv_mode = conv_mode
if tokenizer.unk_token is None:
tokenizer.unk_token = '<|endoftext|>'
tokenizer.pad_token = tokenizer.unk_token
kwargs_default = dict(temperature=float(0.2), num_beams=1, top_p=None, max_new_tokens=128, use_cache=True)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
def generate_inner(self, message, dataset=None):
try:
from rbdash.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, \
DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from rbdash.conversation import conv_templates
from rbdash.mm_utils import tokenizer_image_token, process_images
except ImportError:
raise ImportError(
'Please first install RBDash and set the root path to use RBDash, '
'which is cloned from here: "https://github.com/RBDash-Team/RBDash?tab=readme-ov-file" '
)
prompt, image = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image).convert('RGB')
if self.model.config.mm_use_im_start_end:
prompt = (
DEFAULT_IM_START_TOKEN
+ DEFAULT_IMAGE_TOKEN
+ DEFAULT_IM_END_TOKEN
+ '\n'
+ prompt
)
else:
prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt
conv = conv_templates[self.conv_mode].copy()
conv.append_message(conv.roles[0], prompt)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
input_ids = input_ids.unsqueeze(0).cuda()
if hasattr(self.model.config, 'image_size_aux'):
if not hasattr(self.image_processor, 'image_size_raw'):
self.image_processor.image_size_raw = self.image_processor.crop_size.copy()
self.image_processor.crop_size['height'] = self.model.config.image_size_aux
self.image_processor.crop_size['width'] = self.model.config.image_size_aux
self.image_processor.size['shortest_edge'] = self.model.config.image_size_aux
self.image_processor_aux.crop_size['height'] = self.model.config.image_size_aux
self.image_processor_aux.crop_size['width'] = self.model.config.image_size_aux
self.image_processor_aux.size[
'shortest_edge'
] = self.model.config.image_size_aux
image_tensor = process_images([image], self.image_processor, self.model.config)[0]
image_grid = getattr(self.model.config, 'image_grid', 1)
if hasattr(self.model.config, 'image_size_aux'):
raw_shape = [
self.image_processor.image_size_raw['height'] * image_grid,
self.image_processor.image_size_raw['width'] * image_grid
]
if self.image_processor is not self.image_processor_aux:
image_tensor_aux = process_images([image], self.image_processor_aux, self.model.config)[
0
]
else:
image_tensor_aux = image_tensor
image_tensor = torch.nn.functional.interpolate(
image_tensor[None],
size=raw_shape,
mode='bilinear',
align_corners=False
)[0]
else:
image_tensor_aux = []
if image_grid >= 2:
raw_image = image_tensor.reshape(
3, image_grid, self.image_processor.image_size_raw['height'],
image_grid, self.image_processor.image_size_raw['width']
)
raw_image = raw_image.permute(1, 3, 0, 2, 4)
raw_image = raw_image.reshape(
-1, 3, self.image_processor.image_size_raw['height'], self.image_processor.image_size_raw['width']
)
if getattr(self.model.config, 'image_global', False):
global_image = image_tensor
if len(global_image.shape) == 3:
global_image = global_image[None]
global_image = torch.nn.functional.interpolate(
global_image,
size=[
self.image_processor.image_size_raw['height'],
self.image_processor.image_size_raw['width']
],
mode='bilinear',
align_corners=False
)
raw_image = torch.cat([raw_image, global_image], dim=0)
image_tensor = raw_image.contiguous()
images = image_tensor[None].to(dtype=self.model.dtype, device='cuda', non_blocking=True)
if len(image_tensor_aux) > 0:
images_aux = image_tensor_aux[None].to(dtype=self.model.dtype, device='cuda', non_blocking=True)
else:
images_aux = None
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
max_new_tokens=512,
images=images,
images_aux=images_aux,
do_sample=self.kwargs['temperature'] > 0,
temperature=self.kwargs['temperature'],
top_p=self.kwargs['top_p'],
num_beams=self.kwargs['num_beams']
)
outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
return outputs
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
# For multi-turn datasets we do not use a custom prompt
return False
if 'mme' in dataset.lower():
return True
elif 'hallusionbench' in dataset.lower():
return True
elif 'mmmu' in dataset.lower():
return True
elif 'mmbench' in dataset.lower():
return True
return False
def build_mme(self, line):
question = line['question']
prompt = question + ' Answer the question using a single word or phrase.'
return prompt
def build_hallusionbench(self, line):
question = line['question']
prompt = question + '\nAnswer the question using a single word or phrase.'
return prompt
def build_mmbench(self, line):
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
prompt = f'{question}\n'
if len(options):
prompt += options_prompt
prompt += "Answer with the option's letter from the given choices directly."
else:
prompt += 'Answer the question using a single word or phrase.'
return prompt
def build_mmmu(self, line):
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'({key}) {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'{question}\n'
if len(options):
prompt += options_prompt
prompt += "\nAnswer with the option's letter from the given choices directly."
else:
prompt += 'Answer the question using a single word or phrase.'
return prompt
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if 'mme' in dataset.lower():
prompt = self.build_mme(line)
elif 'hallusionbench' in dataset.lower():
prompt = self.build_hallusionbench(line)
elif 'mmmu' in dataset.lower():
prompt = self.build_mmmu(line)
elif 'mmbench' in dataset.lower():
prompt = self.build_mmbench(line)
ret = [dict(type='text', value=prompt)]
ret.extend([dict(type='image', value=s) for s in tgt_path])
return ret
import torch
from PIL import Image
from abc import abstractproperty
import sys
import os.path as osp
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import copy
class SliME(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
DEFAULT_IMAGE_TOKEN = '<image>'
IMAGE_TOKEN_INDEX = -200
def __init__(self, model_path='yifanzhang114/SliME-Llama3-8B', **kwargs):
assert model_path is not None
try:
from llava.model.builder import load_pretrained_model
from llava.conversation import conv_templates
from llava.mm_utils import get_model_name_from_path, tokenizer_image_token
except ImportError:
warnings.warn('Please install the requirements from https://github.com/yfzhang114/SliME before using SliME.')
sys.exit(-1)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, model_name, device_map=None)
model.cuda().eval()
model.tie_weights()
if 'llama3' in model_path.lower():
conv_mode = 'llama3'
elif 'vicuna' in model_path.lower():
conv_mode = 'v1'
self.conv_template = conv_mode
self.conv_templates = conv_templates
self.tokenizer = tokenizer
self.model = model
self.image_processor = image_processor
self.tokenizer_image_token = tokenizer_image_token
def generate_inner(self, message, dataset=None):
content, images = '', []
for msg in message:
if msg['type'] == 'text':
content += msg['value']
else:
images.append(Image.open(msg['value']).convert('RGB'))
content += (self.DEFAULT_IMAGE_TOKEN + '\n')
preprocess = self.image_processor.preprocess
image_tokenizer = self.tokenizer_image_token
image_tensor = [
preprocess(f, return_tensors='pt')['pixel_values'][0].half().cuda() for f in images
]
image_tensor = torch.stack(image_tensor)
conv = copy.deepcopy(self.conv_templates[self.conv_template])
conv.messages = list(conv.messages)
conv.append_message(conv.roles[0], content)
conv.append_message(conv.roles[1], None)
prompt_question = conv.get_prompt()
input_ids = image_tokenizer(prompt_question, self.tokenizer, self.IMAGE_TOKEN_INDEX, return_tensors='pt')
input_ids = input_ids.unsqueeze(0).cuda()
cont = self.model.generate(
input_ids,
images=image_tensor,
do_sample=False,
temperature=0,
max_new_tokens=512,
)
text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
return text_outputs
import sys
import torch
from abc import abstractproperty
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
from transformers import AutoTokenizer, BitsAndBytesConfig
class TransCoreM(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def load_pretrained_model(self, model_path, load_8bit=False, load_4bit=False, revision='main'):
from transcorem.model import TransCoreMQWenForCausalLM
from transcorem.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
import transcorem.config_param as config_param
kwargs = {'revision': revision}
if load_8bit:
kwargs['load_in_8bit'] = True
elif load_4bit:
kwargs['load_in_4bit'] = True
kwargs['quantization_config'] = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type='nf4'
)
else:
kwargs['torch_dtype'] = torch.float16
config_param.model_path = model_path
tokenizer = AutoTokenizer.from_pretrained(
model_path, use_fast=False, revision=revision, trust_remote_code=True)
model = TransCoreMQWenForCausalLM.from_pretrained(
model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs)
image_processor = None
mm_use_im_start_end = getattr(model.config, 'mm_use_im_start_end', False)
mm_use_im_patch_token = getattr(model.config, 'mm_use_im_patch_token', True)
if mm_use_im_patch_token:
tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
if mm_use_im_start_end:
tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
model.resize_token_embeddings(len(tokenizer))
vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
vision_tower.load_model()
vision_tower.to(device='cpu', dtype=torch.float16)
image_processor = vision_tower.image_processor
if hasattr(model.config, 'max_sequence_length'):
context_len = model.config.max_sequence_length
else:
context_len = 2048
return tokenizer, model, image_processor, context_len
def __init__(self,
root=None,
revision='main',
**kwargs):
self.root = root
self.revision = revision
sys.path.append(root)
model_path = 'PCIResearch/TransCore-M'
assert osp.exists(model_path) or splitlen(model_path) == 2
self.tokenizer, self.model, self.image_processor, self.context_len = self.load_pretrained_model(
model_path=model_path, revision=revision)
self.model = self.model.cuda()
print('==============conv_mode: transcorem_v1')
self.conv_mode = 'transcorem_v1'
kwargs_default = dict(do_sample=False, temperature=0.0, max_new_tokens=512, top_p=None, num_beams=1)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += (
'\n请直接回答选项字母。' if cn_string(prompt) else
"\nAnswer with the option's letter from the given choices directly."
)
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=f) for f in tgt_path])
return message
def generate_inner(self, message, dataset=None):
from transcorem.mm_utils import highres_process_images, tokenizer_image_token, KeywordsStoppingCriteria
from transcorem.constants import (
IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN)
from transcorem.conversation import conv_templates, SeparatorStyle
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
args = abstractproperty()
args.image_aspect_ratio = 'pad'
image_patches = highres_process_images(image, self.image_processor, args, base_reso=336)
image_patches = [patch.unsqueeze(0).to('cuda', dtype=torch.float16) for patch in image_patches]
if self.model.config.mm_use_im_start_end:
inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt
else:
inp = DEFAULT_IMAGE_TOKEN + '\n' + prompt
conv = conv_templates[self.conv_mode].copy()
conv.append_message(conv.roles[0], inp)
conv.append_message(conv.roles[1], None)
prompt_conv = conv.get_prompt()
input_ids = tokenizer_image_token(prompt_conv, self.tokenizer, IMAGE_TOKEN_INDEX,
return_tensors='pt').unsqueeze(0).cuda()
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
images=image_patches,
use_cache=True,
stopping_criteria=[stopping_criteria],
**self.kwargs)
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[:-len(stop_str)]
outputs = outputs.strip()
return outputs
from .video_llava import VideoLLaVA, VideoLLaVA_HF
from .videochat2 import VideoChat2_HD
from .chat_uni_vi import Chatunivi
from .video_chatgpt import VideoChatGPT
from .llama_vid import LLaMAVID
from .pllava import PLLaVA
__all__ = ['VideoLLaVA', 'VideoLLaVA_HF', 'Chatunivi', 'VideoChatGPT', 'LLaMAVID', 'VideoChat2_HD', 'PLLaVA']
import torch
import warnings
import copy as cp
import numpy as np
import sys
import os
from ..base import BaseModel
from ...smp import isimg, listinstr
from ...dataset import DATASET_TYPE
from decord import VideoReader, cpu
from PIL import Image
def _get_rawvideo_dec(
video_path,
image_processor,
max_frames=64,
image_resolution=224,
video_framerate=1,
s=None,
e=None,
):
# speed up video decode via decord.
video_mask = np.zeros(max_frames, dtype=np.int64)
max_video_length = 0
# T x 3 x H x W
video = np.zeros((max_frames, 3, image_resolution, image_resolution), dtype=np.float64)
if s is None:
start_time, end_time = None, None
else:
start_time = int(s)
end_time = int(e)
start_time = start_time if start_time >= 0.0 else 0.0
end_time = end_time if end_time >= 0.0 else 0.0
if start_time > end_time:
start_time, end_time = end_time, start_time
elif start_time == end_time:
end_time = start_time + 1
if os.path.exists(video_path):
vreader = VideoReader(video_path, ctx=cpu(0))
else:
print(video_path)
raise FileNotFoundError
fps = vreader.get_avg_fps()
f_start = 0 if start_time is None else int(start_time * fps)
f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
num_frames = f_end - f_start + 1
if num_frames > 0:
# T x 3 x H x W
sample_fps = int(video_framerate)
t_stride = int(round(float(fps) / sample_fps))
all_pos = list(range(f_start, f_end + 1, t_stride))
if len(all_pos) > max_frames:
sample_pos = [all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)]
else:
sample_pos = all_pos
patch_images = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]
patch_images = torch.stack(
[image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0] for img in patch_images]
)
slice_len = patch_images.shape[0]
max_video_length = max_video_length if max_video_length > slice_len else slice_len
if slice_len < 1:
pass
else:
video[:slice_len, ...] = patch_images
return patch_images, slice_len
else:
print('Error reading video path: {}'.format(video_path))
video_mask[:max_video_length] = [1] * max_video_length
return torch.from_numpy(video), video_mask
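# Sampling note: with video_framerate=1 the stride is round(fps / 1), i.e. roughly one frame
# per second of video; if that still yields more than max_frames positions, the positions are
# subsampled uniformly via np.linspace.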
class Chatunivi(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
VIDEO_LLM = True
def __init__(self, model_path='Chat-UniVi/Chat-UniVi', **kwargs):
assert model_path is not None
try:
from ChatUniVi.model.builder import load_pretrained_model
except ImportError:
warnings.warn('Please install Chat-UniVi from https://github.com/PKU-YuanGroup/Chat-UniVi.git.')
sys.exit(-1)
model_name = 'ChatUniVi'
tokenizer, model, processor, context_len = load_pretrained_model(model_path, None, model_name)
self.tokenizer = tokenizer
self.model = model
vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
vision_tower.load_model()
image_processor = vision_tower.image_processor
self.processor = image_processor
self.context_len = context_len
self.kwargs = kwargs
self.nframe = 64
self.resolution = 224
if 'v1.5' in model_path:
self.resolution = 336
def get_model_output(self, model, video_processor, tokenizer, video, qs):
from ChatUniVi.conversation import conv_templates, SeparatorStyle
from ChatUniVi.constants import (
DEFAULT_IMAGE_PATCH_TOKEN,
DEFAULT_IMAGE_TOKEN,
IMAGE_TOKEN_INDEX,
DEFAULT_IM_START_TOKEN,
DEFAULT_IM_END_TOKEN,
MAX_IMAGE_LENGTH,
)
from ChatUniVi.mm_utils import (
tokenizer_image_token,
KeywordsStoppingCriteria,
)
mm_use_im_start_end = getattr(model.config, 'mm_use_im_start_end', False)
mm_use_im_patch_token = getattr(model.config, 'mm_use_im_patch_token', True)
if mm_use_im_patch_token:
tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
if mm_use_im_start_end:
tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
model.resize_token_embeddings(len(tokenizer))
if model.config.config['use_cluster']:
for n, m in model.named_modules():
m = m.to(dtype=torch.bfloat16)
video_frames, slice_len = _get_rawvideo_dec(
video, video_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=self.resolution
)
if model.config.mm_use_im_start_end:
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN * slice_len + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
qs = DEFAULT_IMAGE_TOKEN * slice_len + '\n' + qs
conv = conv_templates['v1'].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(
0).cuda()
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=video_frames.half().cuda(),
do_sample=True,
temperature=0.2,
top_p=None,
num_beams=1,
output_scores=True,
return_dict_in_generate=True,
max_new_tokens=1024,
use_cache=True,
stopping_criteria=[stopping_criteria])
output_ids = output_ids.sequences
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[:-len(stop_str)]
outputs = outputs.strip()
return outputs
def generate_inner(self, message, dataset=None):
question, video = self.message_to_promptvideo(message)
response = self.get_model_output(self.model, self.processor, self.tokenizer, video, question)
return response
{
"_name_or_path": "clip-vit-large-patch14/",
"architectures": [
"CLIPModel"
],
"initializer_factor": 1.0,
"logit_scale_init_value": 2.6592,
"model_type": "clip",
"projection_dim": 768,
"text_config": {
"_name_or_path": "",
"add_cross_attention": false,
"architectures": null,
"attention_dropout": 0.0,
"bad_words_ids": null,
"bos_token_id": 0,
"chunk_size_feed_forward": 0,
"cross_attention_hidden_size": null,
"decoder_start_token_id": null,
"diversity_penalty": 0.0,
"do_sample": false,
"dropout": 0.0,
"early_stopping": false,
"encoder_no_repeat_ngram_size": 0,
"eos_token_id": 2,
"finetuning_task": null,
"forced_bos_token_id": null,
"forced_eos_token_id": null,
"hidden_act": "quick_gelu",
"hidden_size": 768,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"initializer_factor": 1.0,
"initializer_range": 0.02,
"intermediate_size": 3072,
"is_decoder": false,
"is_encoder_decoder": false,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"layer_norm_eps": 1e-05,
"length_penalty": 1.0,
"max_length": 20,
"max_position_embeddings": 77,
"min_length": 0,
"model_type": "clip_text_model",
"no_repeat_ngram_size": 0,
"num_attention_heads": 12,
"num_beam_groups": 1,
"num_beams": 1,
"num_hidden_layers": 12,
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_scores": false,
"pad_token_id": 1,
"prefix": null,
"problem_type": null,
"projection_dim" : 768,
"pruned_heads": {},
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"return_dict": true,
"return_dict_in_generate": false,
"sep_token_id": null,
"task_specific_params": null,
"temperature": 1.0,
"tie_encoder_decoder": false,
"tie_word_embeddings": true,
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": null,
"torchscript": false,
"transformers_version": "4.16.0.dev0",
"use_bfloat16": false,
"vocab_size": 49408
},
"text_config_dict": {
"hidden_size": 768,
"intermediate_size": 3072,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"projection_dim": 768
},
"torch_dtype": "float32",
"transformers_version": null,
"vision_config": {
"_name_or_path": "",
"add_cross_attention": false,
"architectures": null,
"attention_dropout": 0.0,
"bad_words_ids": null,
"bos_token_id": null,
"chunk_size_feed_forward": 0,
"cross_attention_hidden_size": null,
"decoder_start_token_id": null,
"diversity_penalty": 0.0,
"do_sample": false,
"dropout": 0.0,
"early_stopping": false,
"encoder_no_repeat_ngram_size": 0,
"eos_token_id": null,
"finetuning_task": null,
"forced_bos_token_id": null,
"forced_eos_token_id": null,
"hidden_act": "quick_gelu",
"hidden_size": 1024,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"image_size": 224,
"initializer_factor": 1.0,
"initializer_range": 0.02,
"intermediate_size": 4096,
"is_decoder": false,
"is_encoder_decoder": false,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"layer_norm_eps": 1e-05,
"length_penalty": 1.0,
"max_length": 20,
"min_length": 0,
"model_type": "clip_vision_model",
"no_repeat_ngram_size": 0,
"num_attention_heads": 16,
"num_beam_groups": 1,
"num_beams": 1,
"num_hidden_layers": 24,
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_scores": false,
"pad_token_id": null,
"patch_size": 14,
"prefix": null,
"problem_type": null,
"projection_dim" : 768,
"pruned_heads": {},
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"return_dict": true,
"return_dict_in_generate": false,
"sep_token_id": null,
"task_specific_params": null,
"temperature": 1.0,
"tie_encoder_decoder": false,
"tie_word_embeddings": true,
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": null,
"torchscript": false,
"transformers_version": "4.16.0.dev0",
"use_bfloat16": false
},
"vision_config_dict": {
"hidden_size": 1024,
"intermediate_size": 4096,
"num_attention_heads": 16,
"num_hidden_layers": 24,
"patch_size": 14,
"projection_dim": 768
}
}
{
"crop_size": 224,
"do_center_crop": true,
"do_normalize": true,
"do_resize": true,
"feature_extractor_type": "CLIPFeatureExtractor",
"image_mean": [
0.48145466,
0.4578275,
0.40821073
],
"image_std": [
0.26862954,
0.26130258,
0.27577711
],
"resample": 3,
"size": 224
}
{
"model": {
"model_cls": "VideoChat2_it_hd_mistral",
"vit_blip_model_path": "OpenGVLab/videochat2",
"mistral_model_path": "mistralai/Mistral-7B-Instruct-v0.2",
"videochat2_model_path": "OpenGVLab/VideoChat2_stage2_Mistral_7B",
"freeze_vit": false,
"freeze_qformer": false,
"max_txt_len": 512,
"low_resource": false,
"vision_encoder": {
"name": "vit_l14",
"img_size": 224,
"patch_size": 16,
"d_model": 1024,
"encoder_embed_dim": 1024,
"encoder_depth": 24,
"encoder_num_heads": 16,
"drop_path_rate": 0.0,
"num_frames": 8,
"tubelet_size": 1,
"use_checkpoint": true,
"checkpoint_num": 18,
"pretrained": "",
"return_index": -2,
"vit_add_ln": true,
"ckpt_num_frame": 4
},
"num_query_token": 32,
"qformer_hidden_dropout_prob": 0.1,
"qformer_attention_probs_dropout_prob": 0.1,
"qformer_drop_path_rate": 0.2,
"extra_num_query_token": 64,
"qformer_text_input": true,
"system": "",
"start_token": "<Video>",
"end_token": "</Video>",
"add_second_msg": true,
"img_start_token": "<Image>",
"img_end_token": "</Image>",
"random_shuffle": true,
"return_question_instruction": false,
"use_flash_attention": true,
"use_lora": false,
"lora_r": 16,
"lora_alpha": 32,
"lora_dropout": 0.1,
"dynamic_config": {
"local_size": 224,
"hd_num": 6,
"padding": false,
"add_global": true
}
},
"device": "cuda"
}
import torch
import warnings
import copy as cp
import numpy as np
import sys
import os
from ..base import BaseModel
from ...smp import isimg, listinstr, load, dump, download_file
from ...dataset import DATASET_TYPE
from decord import VideoReader, cpu
from huggingface_hub import snapshot_download
def load_video(video_path):
vr = VideoReader(video_path, ctx=cpu(0))
total_frame_num = len(vr)
fps = round(vr.get_avg_fps())
frame_idx = [i for i in range(0, total_frame_num, fps)]
spare_frames = vr.get_batch(frame_idx).asnumpy()
return spare_frames
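# load_video samples roughly one frame per second: frame_idx steps through the video with a
# stride equal to the rounded average FPS, so a 60-second clip yields about 60 frames.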
def change_file(file_path, mm_vision_tower):
org_data = load(file_path)
org_data['image_processor'] = './vlmeval/vlm/video_llm/configs/llama_vid/processor/clip-patch14-224'
org_data['mm_vision_tower'] = mm_vision_tower
dump(org_data, file_path)
class LLaMAVID(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
VIDEO_LLM = True
def __init__(self, model_path='YanweiLi/llama-vid-7b-full-224-video-fps-1', **kwargs):
assert model_path is not None
try:
from llamavid.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path
except ImportError:
warnings.warn('Please install LLaMA-VID from https://github.com/dvlab-research/LLaMA-VID.')
sys.exit(-1)
model_base = None
model_name = get_model_name_from_path(model_path)
eva_vit_g_url = 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/eva_vit_g.pth'
true_model_path = snapshot_download(model_path)
eva_vit_path = os.path.join(true_model_path, 'eva_vit_g.pth')
if not os.path.exists(eva_vit_path):
download_file(eva_vit_g_url, eva_vit_path)
config_path = os.path.join(true_model_path, 'config.json')
change_file(config_path, eva_vit_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(
true_model_path, model_base, model_name, None, device_map='cpu', device='cpu'
)
model.cuda()
self.tokenizer = tokenizer
self.model = model
self.processor = image_processor
self.context_len = context_len
self.kwargs = kwargs
self.nframe = 8
def get_model_output(self, model, video_processor, tokenizer, video, qs):
from llamavid.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llamavid.constants import DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llamavid.conversation import conv_templates, SeparatorStyle
from llava.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria
original_qs = cp.deepcopy(qs)
if model.config.mm_use_im_start_end:
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
conv_mode = 'vicuna_v1'
conv = conv_templates[conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
# Check if the video exists
if os.path.exists(video):
video = load_video(video)
video = video_processor.preprocess(video, return_tensors='pt')['pixel_values'].half().cuda()
video = [video]
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
cur_prompt = original_qs
with torch.inference_mode():
model.update_prompt([[cur_prompt]])
output_ids = model.generate(
input_ids,
images=video,
do_sample=True,
temperature=0.2,
max_new_tokens=1024,
use_cache=True,
stopping_criteria=[stopping_criteria],
)
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[: -len(stop_str)]
outputs = outputs.strip()
return outputs
def generate_inner(self, message, dataset=None):
question, video = self.message_to_promptvideo(message)
response = self.get_model_output(self.model, self.processor, self.tokenizer, video, question)
return response
import torch
import warnings
import copy as cp
import numpy as np
import sys
from PIL import Image
import torchvision
from ..base import BaseModel
from ...smp import isimg, listinstr, get_rank_and_world_size
from ...dataset import DATASET_TYPE
from huggingface_hub import snapshot_download
class PLLaVA(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
VIDEO_LLM = True
def __init__(self, model_path='ermu2001/pllava-13b', dir_root=None, **kwargs):
sys.path.append(dir_root)
try:
from tasks.eval.model_utils import load_pllava
except ImportError:
warnings.warn(
'Please first install requirements and set the root path to use PLLaVA. \
Follow the instructions at https://github.com/magic-research/PLLaVA.'
)
sys.exit(-1)
rank, world_size = get_rank_and_world_size()
self.nframe = 16
self.use_lora = True
self.lora_alpha = 4
self.pooling_shape = (16, 12, 12)
self.RESOLUTION = 672
self.model_path = model_path
# Note: larger checkpoints (30B+) can consume a very large amount of memory and may even bring down nodes.
weight_dir = snapshot_download(model_path)
self.model, self.processor = load_pllava(
model_path, num_frames=self.nframe, use_lora=self.use_lora,
weight_dir=weight_dir, lora_alpha=self.lora_alpha, pooling_shape=self.pooling_shape
)
# position embedding
self.model = self.model.to(torch.device(rank))
self.model = self.model.eval()
def load_video(self, video_path, num_segments=8, resolution=336):
from decord import VideoReader, cpu
transforms = torchvision.transforms.Resize(size=resolution)
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
num_frames = len(vr)
frame_indices = self.get_index(num_frames, num_segments)
images_group = list()
for frame_index in frame_indices:
img = Image.fromarray(vr[frame_index].asnumpy())
images_group.append(transforms(img))
return images_group
def get_index(self, num_frames, num_segments):
seg_size = float(num_frames - 1) / num_segments
start = int(seg_size / 2)
offsets = np.array([
start + int(np.round(seg_size * idx)) for idx in range(num_segments)
])
return offsets
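# Worked example for get_index: num_frames=100, num_segments=8 gives seg_size = 99 / 8 = 12.375,
# start = 6, and offsets [6, 18, 31, 43, 56, 68, 80, 93] (up to rounding), i.e. evenly spaced
# frame indices centered within each of the 8 segments.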
def generate_inner(self, message, dataset=None):
from tasks.eval.model_utils import pllava_answer
from tasks.eval.eval_utils import conv_templates
question, video = self.message_to_promptvideo(message)
img_list = self.load_video(video, num_segments=self.nframe, resolution=self.RESOLUTION)
if self.model_path == 'ermu2001/pllava-34b': # using slightly different conversation mode for 34b model
if dataset in ['Video-MME', 'MVBench', 'MVBench_MP4']: # MCQ dataset
conv_mode = 'eval_mvbench_llavanext'
else: # VQA dataset
conv_mode = 'eval_videoqa_llavanext'
else:
if dataset in ['Video-MME', 'MVBench', 'MVBench_MP4']: # MCQ dataset
conv_mode = 'eval_mvbench'
else: # VQA dataset
conv_mode = 'eval_videoqabench'
conv = conv_templates[conv_mode].copy()
if dataset in ['MVBench', 'MVBench_MP4']:
conv.user_query(message[1]['value'], message[0]['value'], message[-2]['value'], is_mm=True)
conv.assistant_response(message[-1]['value'])
else:
conv.user_query(question, is_mm=True)
llm_response, conv = pllava_answer(
conv=conv, model=self.model, processor=self.processor,
do_sample=False, img_list=img_list, max_new_tokens=512, print_res=False
)
if dataset in ['MVBench', 'MVBench_MP4']:
llm_response = '(' + ''.join(llm_response.split(message[-1]['value'])[1:])
return llm_response
import torch
import os
import warnings
import copy as cp
import numpy as np
import sys
from ..base import BaseModel
from ...smp import isimg, listinstr
from ...dataset import DATASET_TYPE
from huggingface_hub import snapshot_download
class VideoChatGPT(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
VIDEO_LLM = True
def __init__(self, model_path='MBZUAI/Video-ChatGPT-7B', dir_root=None, **kwargs):
assert model_path is not None
sys.path.append(dir_root)
try:
from video_chatgpt.eval.model_utils import initialize_model
except:
            warnings.warn(
                'Please first install the requirements and set `dir_root` to the Video-ChatGPT repository. '
                'Follow the instructions at https://github.com/mbzuai-oryx/Video-ChatGPT.'
            )
sys.exit(-1)
base_model_path = snapshot_download('mmaaz60/LLaVA-7B-Lightening-v1-1')
projection_path = snapshot_download(model_path)
projection_name = 'video_chatgpt-7B.bin'
projection_path = os.path.join(projection_path, projection_name)
model, vision_tower, tokenizer, image_processor, video_token_len = initialize_model(
base_model_path, projection_path
)
self.tokenizer = tokenizer
self.model = model
self.processor = image_processor
self.context_len = video_token_len
self.kwargs = kwargs
self.vision_tower = vision_tower
self.nframe = 8
def get_model_output(self, model, video_processor, tokenizer, video, qs):
from video_chatgpt.eval.model_utils import load_video
from video_chatgpt.inference import video_chatgpt_infer
conv_mode = 'video-chatgpt_v1'
video_frames = load_video(video)
# Run inference on the video and questions
output = video_chatgpt_infer(
video_frames,
qs,
conv_mode,
model,
self.vision_tower,
tokenizer,
video_processor,
self.context_len,
)
return output
def generate_inner(self, message, dataset=None):
question, video = self.message_to_promptvideo(message)
response = self.get_model_output(self.model, self.processor, self.tokenizer, video, question)
return response
import torch
import warnings
import copy as cp
import numpy as np
import sys
from ..base import BaseModel
from ...smp import isimg, listinstr
from ...dataset import DATASET_TYPE
def read_video_pyav(container, indices):
frames = []
container.seek(0)
start_index = indices[0]
end_index = indices[-1]
for i, frame in enumerate(container.decode(video=0)):
if i > end_index:
break
if i >= start_index and i in indices:
frames.append(frame)
return np.stack([x.to_ndarray(format='rgb24') for x in frames])
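# Sketch of how read_video_pyav is used below (illustrative numbers): for a 200-frame video
# with nframe=8, indices = np.arange(0, 200, 25).astype(int) = [0, 25, ..., 175], and the
# function returns a stacked uint8 array of shape (8, H, W, 3) in RGB order.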
class VideoLLaVA_HF(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
VIDEO_LLM = True
def __init__(self, model_path='LanguageBind/Video-LLaVA-7B-hf', **kwargs):
try:
from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration
except:
            warnings.warn('Please install the latest version of transformers. '
                          'You can install it with `pip install transformers==4.42.0` '
                          'or `pip install --upgrade git+https://github.com/huggingface/transformers.git`.')
sys.exit(-1)
assert model_path is not None
self.model_path = model_path
self.model = VideoLlavaForConditionalGeneration.from_pretrained(model_path)
self.model.eval().cuda()
self.processor = VideoLlavaProcessor.from_pretrained(model_path)
self.kwargs = kwargs
self.nframe = 8
torch.cuda.empty_cache()
def generate_inner(self, message, dataset=None):
import av
question, video = self.message_to_promptvideo(message)
container = av.open(video)
        # uniformly sample self.nframe frames from the video
total_frames = container.streams.video[0].frames
indices = np.arange(0, total_frames, total_frames / self.nframe).astype(int)
clip = read_video_pyav(container, indices)
prompt = f'USER: <video>\n{question} ASSISTANT:'
inputs = self.processor(text=prompt, videos=clip, return_tensors='pt').to(self.model.device)
        # default generation args; overridden by any values passed via self.kwargs
generation_args = {
'max_new_tokens': 1024,
'temperature': 0.2,
'do_sample': True,
}
generation_args.update(self.kwargs)
generate_ids = self.model.generate(**inputs, **generation_args)
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = self.processor.batch_decode(
generate_ids,
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)[0]
return response
class VideoLLaVA(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
VIDEO_LLM = True
def __init__(self, model_path='LanguageBind/Video-LLaVA-7B', **kwargs):
assert model_path is not None
try:
from videollava.conversation import conv_templates, SeparatorStyle
from videollava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from videollava.constants import DEFAULT_VID_START_TOKEN, DEFAULT_VID_END_TOKEN
from videollava.mm_utils import get_model_name_from_path, tokenizer_image_token, KeywordsStoppingCriteria
from videollava.model.builder import load_pretrained_model
from videollava.model.language_model.llava_llama import LlavaLlamaForCausalLM
from videollava.train.train import smart_tokenizer_and_embedding_resize
except:
warnings.warn('Please install Video-LLaVA from https://github.com/FangXinyu-0913/Video-LLaVA.')
sys.exit(-1)
model_base = None
model_name = model_path.split('/')[-1]
tokenizer, model, processor, context_len = load_pretrained_model(model_path, model_base, model_name)
self.tokenizer = tokenizer
self.model = model
self.processor = processor
self.context_len = context_len
self.kwargs = kwargs
self.nframe = 8
def get_model_output(self, model, video_processor, tokenizer, video, qs):
from videollava.conversation import conv_templates, SeparatorStyle
from videollava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from videollava.constants import DEFAULT_VID_START_TOKEN, DEFAULT_VID_END_TOKEN
from videollava.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria
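        # Build the multimodal prompt: one <image> placeholder per sampled frame (8 here);
        # if the checkpoint was trained with explicit boundary tokens, the frame tokens are
        # additionally wrapped in DEFAULT_VID_START_TOKEN / DEFAULT_VID_END_TOKEN.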
if model.config.mm_use_im_start_end:
qs = DEFAULT_VID_START_TOKEN + ''.join([DEFAULT_IMAGE_TOKEN] * 8) + DEFAULT_VID_END_TOKEN + '\n' + qs
else:
qs = ''.join([DEFAULT_IMAGE_TOKEN] * 8) + '\n' + qs
conv_mode = 'llava_v1'
device = torch.device('cuda')
conv = conv_templates[conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
video_tensor = video_processor.preprocess(video, return_tensors='pt')['pixel_values'][0].half().to(device)
input_ids = tokenizer_image_token(prompt, tokenizer,
IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(device)
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=[video_tensor],
do_sample=True,
temperature=0.2,
max_new_tokens=1024,
use_cache=True,
stopping_criteria=[stopping_criteria])
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[:-len(stop_str)]
outputs = outputs.strip()
return outputs
def generate_inner(self, message, dataset=None):
question, video = self.message_to_promptvideo(message)
response = self.get_model_output(self.model, self.processor['video'], self.tokenizer, video, question)
return response
import torch
import warnings
import copy as cp
import numpy as np
import sys
import os.path as osp
import os
import requests
import shutil
import huggingface_hub
from transformers import StoppingCriteria, StoppingCriteriaList
from huggingface_hub import snapshot_download
from PIL import Image
from torchvision.transforms import PILToTensor
from torchvision import transforms
from peft import get_peft_model, LoraConfig, TaskType
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
def get_prompt(conv):
ret = conv.system + conv.sep
for role, message in conv.messages:
if message:
ret += role + ' ' + message + ' ' + conv.sep
else:
ret += role
return ret
def get_prompt2(conv):
ret = conv.system + conv.sep
count = 0
for role, message in conv.messages:
count += 1
if count == len(conv.messages):
ret += role + ' ' + message
else:
if message:
ret += role + ' ' + message + ' ' + conv.sep
else:
ret += role
return ret
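# The two helpers differ only in how the final turn is terminated: get_prompt appends conv.sep
# after every completed message, while get_prompt2 leaves the last message (typically an answer
# prefix such as 'Best option:(') without a trailing separator so generation continues directly
# from it. Illustrative output, assuming roles ('[INST]', '[/INST]'), sep '' and a single user
# turn: " [INST] <question> [/INST] Best option:(".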
class StoppingCriteriaSub(StoppingCriteria):
def __init__(self, stops=[], encounters=1):
super().__init__()
self.stops = stops
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
for stop in self.stops:
if torch.all((stop == input_ids[0][-len(stop):])).item():
return True
return False
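# The stop sequences used below ([2] and [29871, 2]) are token-id patterns for the Mistral
# '</s>' EOS token, which the SentencePiece tokenizer can emit either on its own or preceded
# by token 29871 (the leading-space piece); generation halts once either pattern ends the output.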
class VideoChat2_HD(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
VIDEO_LLM = True
def __init__(self, model_path='OpenGVLab/VideoChat2_HD_stage4_Mistral_7B',
root='./Ask-Anything', config_file='./configs/videochat2_hd.json',
**kwargs):
self.config_file = config_file
self.root = root
self.model_path = model_path
if root is None:
            warnings.warn('Please set `root` to the Ask-Anything directory, '
                          'cloned from https://github.com/OpenGVLab/Ask-Anything.')
sys.exit(-1)
sys.path.append(osp.join(root, 'video_chat2'))
try:
from utils.config import Config
from utils.easydict import EasyDict
from models import VideoChat2_it_hd_mistral
from dataset.hd_utils import HD_transform_padding, HD_transform_no_padding
except:
raise ImportError(
                'Please first install the VideoChat2 requirements and set `root` to the Ask-Anything '
                'repository, cloned from https://github.com/OpenGVLab/Ask-Anything.'
)
cfg = Config.from_file(self.config_file)
def download_file(url, pth):
destination_folder = pth
            # ensure the destination folder exists
if not os.path.exists(destination_folder):
os.makedirs(destination_folder)
            # derive the filename from the URL
filename = os.path.basename(url)
destination_path = os.path.join(destination_folder, filename)
if os.path.exists(destination_path):
                print(f'File already exists at {destination_path}; skipping download.')
return
            # download the file
response = requests.get(url, stream=True)
if response.status_code == 200:
with open(destination_path, 'wb') as file:
response.raw.decode_content = True
shutil.copyfileobj(response.raw, file)
print(f'File downloaded and saved to {destination_path}')
else:
print(f'Download failed, status code: {response.status_code}')
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
huggingface_hub.login(hf_token)
videochat2_model_path = snapshot_download(repo_id=cfg.model.videochat2_model_path, repo_type='model')
cfg.model.videochat2_model_path = osp.join(videochat2_model_path, 'videochat2_mistral_7b_stage2.pth')
mistral_model_path = snapshot_download(repo_id=cfg.model.mistral_model_path, repo_type='model')
cfg.model.mistral_model_path = mistral_model_path
vit_blip_model_path = snapshot_download(repo_id=cfg.model.vit_blip_model_path, repo_type='model')
cfg.model.vit_blip_model_path = osp.join(vit_blip_model_path, 'umt_l16_qformer.pth')
model = VideoChat2_it_hd_mistral(config=cfg.model)
peft_config = LoraConfig(
task_type=TaskType.CAUSAL_LM, inference_mode=False,
r=16, lora_alpha=32, lora_dropout=0.,
target_modules=[
'q_proj', 'k_proj', 'v_proj', 'o_proj',
'gate_proj', 'up_proj', 'down_proj', 'lm_head'
]
)
model.mistral_model = get_peft_model(model.mistral_model, peft_config)
stage4_model_path = snapshot_download(repo_id=model_path, repo_type='model')
state_dict = torch.load(osp.join(stage4_model_path, 'videochat2_hd_mistral_7b_stage4.pth'), 'cuda')
if 'model' in state_dict.keys():
model.load_state_dict(state_dict['model'], strict=False)
else:
model.load_state_dict(state_dict, strict=False)
model = model.to(torch.device('cuda'))
model = model.eval()
self.model = model
# position embedding
self.nframe = 16
self.resolution = 224
self.hd_num = 6
new_pos_emb = self.get_sinusoid_encoding_table(
n_position=(self.resolution // 16) ** 2 * self.nframe,
cur_frame=self.nframe
)
self.model.vision_encoder.encoder.pos_embed = new_pos_emb
self.hd_transform = HD_transform_no_padding
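        # ImageNet mean/std statistics; frames are scaled to [0, 1] and then normalised below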
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
self.transform = transforms.Compose([
transforms.Lambda(lambda x: x.float().div(255.0)),
transforms.Normalize(mean, std)
])
def get_sinusoid_encoding_table(self, n_position=784, d_hid=1024,
cur_frame=8, ckpt_num_frame=4,
pre_n_position=784):
''' Sinusoid position encoding table '''
# TODO: make it with torch instead of numpy
def get_position_angle_vec(position):
return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
# generate checkpoint position embedding
sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(pre_n_position)])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
sinusoid_table = torch.tensor(sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)
print(f'n_position: {n_position}')
print(f'pre_n_position: {pre_n_position}')
if n_position != pre_n_position:
T = ckpt_num_frame # checkpoint frame
P = 14 # checkpoint size
C = d_hid
new_P = int((n_position // cur_frame) ** 0.5) # testing size
if new_P != 14:
print(f'Pretraining uses 14x14, but current version is {new_P}x{new_P}')
print('Interpolate the position embedding')
sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
sinusoid_table = sinusoid_table.reshape(-1, P, P, C).permute(0, 3, 1, 2)
sinusoid_table = torch.nn.functional.interpolate(
sinusoid_table, size=(new_P, new_P), mode='bicubic', align_corners=False)
# BT, C, H, W -> BT, H, W, C -> B, T, H, W, C
sinusoid_table = sinusoid_table.permute(0, 2, 3, 1).reshape(-1, T, new_P, new_P, C)
sinusoid_table = sinusoid_table.flatten(1, 3) # B, THW, C
if cur_frame != ckpt_num_frame:
print(f'Pretraining uses 4 frames, but current frame is {cur_frame}')
print('Interpolate the position embedding')
T = ckpt_num_frame # checkpoint frame
new_T = cur_frame # testing frame
# interpolate
P = int((n_position // cur_frame) ** 0.5) # testing size
C = d_hid
sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
sinusoid_table = sinusoid_table.permute(0, 2, 3, 4, 1).reshape(-1, C, T) # BHW, C, T
sinusoid_table = torch.nn.functional.interpolate(sinusoid_table, size=new_T, mode='linear')
sinusoid_table = sinusoid_table.reshape(1, P, P, C, new_T).permute(0, 4, 1, 2, 3) # B, T, H, W, C
sinusoid_table = sinusoid_table.flatten(1, 3) # B, THW, C
return sinusoid_table
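    # Rough arithmetic behind the table sizes with the defaults above: resolution=224 and a
    # 16-pixel ViT patch give 14 * 14 = 196 positions per frame, so n_position = 196 * 16
    # frames = 3136, while the checkpoint table has pre_n_position = 784 = 196 * 4 frames.
    # The spatial grid already matches (14 x 14), so the resize there is effectively a no-op
    # and the meaningful interpolation is along the temporal axis, from 4 to 16 frames.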
def get_index(self, bound, fps, max_frame, first_idx=0):
if bound:
start, end = bound[0], bound[1]
else:
start, end = -100000, 100000
start_idx = max(first_idx, round(start * fps))
end_idx = min(round(end * fps), max_frame)
seg_size = float(end_idx - start_idx) / self.nframe
frame_indices = np.array([
int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
for idx in range(self.nframe)
])
return frame_indices
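    # Illustrative example (assumed values): with no bound, max_frame=320 and self.nframe=16,
    # seg_size = 320 / 16 = 20 and the indices land at segment midpoints: [10, 30, 50, ..., 310].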
def read_video(self, video_path, bound=None):
from decord import VideoReader, cpu
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
max_frame = len(vr) - 1
fps = float(vr.get_avg_fps())
frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
frames = vr.get_batch(frame_indices)
frames = frames.permute(0, 3, 1, 2)
frames = self.hd_transform(frames.float(), image_size=self.resolution, hd_num=self.hd_num)
torch_imgs = self.transform(frames)
return torch_imgs
def ask(self, text, conv):
conv.messages.append([conv.roles[0], text])
def get_context_emb(self, conv, model, img_list, answer_prompt=None, print_res=False):
if answer_prompt:
prompt = get_prompt2(conv)
else:
prompt = get_prompt(conv)
if print_res:
print(prompt)
if '<VideoHere>' in prompt:
prompt_segs = prompt.split('<VideoHere>')
else:
prompt_segs = prompt.split('<ImageHere>')
assert len(prompt_segs) == len(img_list) + 1, 'Unmatched numbers of image placeholders and images.'
with torch.no_grad():
seg_tokens = [
model.mistral_tokenizer(
seg, return_tensors='pt', add_special_tokens=i == 0).to('cuda').input_ids
                # add BOS only to the first segment
for i, seg in enumerate(prompt_segs)
]
seg_embs = [model.mistral_model.base_model.model.model.embed_tokens(seg_t) for seg_t in seg_tokens]
# seg_embs = [model.mistral_model.model.embed_tokens(seg_t) for seg_t in seg_tokens]
mixed_embs = [emb for pair in zip(seg_embs[:-1], img_list) for emb in pair] + [seg_embs[-1]]
mixed_embs = torch.cat(mixed_embs, dim=1)
return mixed_embs
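    # get_context_emb splits the prompt at the '<VideoHere>' / '<ImageHere>' placeholder,
    # embeds each text segment with the LLM's token embedding (BOS only on the first segment),
    # and interleaves those embeddings with the precomputed video embeddings so the whole
    # sequence can be fed to generate() via inputs_embeds.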
def answer(self, conv, model, img_list, do_sample=True, max_new_tokens=500, num_beams=1, min_length=1, top_p=0.9,
repetition_penalty=1.0, length_penalty=1, temperature=1.0, answer_prompt=None, print_res=False):
stop_words_ids = [
torch.tensor([2]).to('cuda'),
torch.tensor([29871, 2]).to('cuda')] # '</s>' can be encoded in two different ways.
stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
conv.messages.append([conv.roles[1], answer_prompt])
embs = self.get_context_emb(conv, model, img_list, answer_prompt=answer_prompt, print_res=print_res)
with torch.no_grad():
outputs = model.mistral_model.generate(
inputs_embeds=embs,
max_new_tokens=max_new_tokens,
stopping_criteria=stopping_criteria,
num_beams=num_beams,
do_sample=do_sample,
min_length=min_length,
top_p=top_p,
repetition_penalty=repetition_penalty,
length_penalty=length_penalty,
temperature=temperature,
)
output_token = outputs[0]
        if output_token[0] == 0:  # the model might output an unknown <unk> token at the beginning; remove it
output_token = output_token[1:]
        if output_token[0] == 1:  # a start token <s> may also appear at the beginning; remove it
output_token = output_token[1:]
output_text = model.mistral_tokenizer.decode(output_token, add_special_tokens=False)
        output_text = output_text.split('</s>')[0]  # remove the trailing stop token </s>
# output_text = output_text.split('[/INST]')[-1].strip()
conv.messages[-1][1] = output_text + '</s>'
return output_text, output_token.cpu().numpy()
def infer_data(
self, data_sample, system=' ',
        question_prompt='',  # appended to the end of the question
        answer_prompt=None,  # prepended to the beginning of the answer
        system_q=False,  # whether to add the question to the QFormer system prompt
print_res=True,
system_llm=False
):
        assert system_q is False, 'system_q is not supported yet'
video = data_sample['video']
T_, C, H, W = video.shape
video = video.reshape(1, T_, C, H, W).to('cuda')
video_list = []
with torch.no_grad():
if system_q:
raise NotImplementedError
else:
video_emb, _, _ = self.model.encode_img(video, system)
video_list.append(video_emb[0])
question = data_sample['question']
from utils.easydict import EasyDict
chat = EasyDict({
'system': system,
'roles': ('[INST]', '[/INST]'),
'messages': [],
'sep': ''
})
if data_sample['subtitle'] != '':
subtitle = f"This video's subtitles are listed below: {data_sample['subtitle']}"
chat.messages.append([chat.roles[0], f'{subtitle}\n<Video><VideoHere></Video> [/INST]'])
else:
chat.messages.append([chat.roles[0], '<Video><VideoHere></Video> [/INST]'])
if system_llm:
prompt = system + question + question_prompt
else:
prompt = question + question_prompt
self.ask(prompt, chat)
llm_message = self.answer(
conv=chat, model=self.model, do_sample=False,
img_list=video_list, max_new_tokens=100,
answer_prompt=answer_prompt, print_res=print_res
)[0]
return llm_message.strip()
def qa_template(self, data):
question = data.split('Answer:')[0].split('\n')[0] + '\n'
question += 'Options:\n'
choices = data.split('Answer:')[0].split('\n')[1:]
        choices = [item for item in choices if item != '']  # drop empty lines
for idx, c in enumerate(choices):
cur_choice, cur_text = c[0], c[3:]
question += f'({cur_choice}) {cur_text}\n'
question = question.rstrip()
return question
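    # Illustrative transformation (assuming options are formatted like 'A. running'):
    #   'What is the person doing?\nA. running\nB. cooking\nAnswer:'
    # becomes
    #   'What is the person doing?\nOptions:\n(A) running\n(B) cooking'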
def split_subtitle(self, data):
if 'This video\'s subtitles are listed below' in data:
            # locate the start and end of the subtitle block
start_marker = 'This video\'s subtitles are listed below:'
end_marker = 'Select the best answer to the following multiple-choice question based on the video.'
start_index = data.find(start_marker) + len(start_marker)
end_index = data.find(end_marker)
            # extract the subtitle text
subtitle = data[start_index:end_index].strip()
return subtitle
else:
return ''
def generate_inner(self, message, dataset=None):
if dataset == 'Video-MME':
_, video = self.message_to_promptvideo(message)
torch_imgs = self.read_video(video)
subtitle = self.split_subtitle(message[-2]['value'])
question = self.qa_template(message[-1]['value'])
example = {
'subtitle': subtitle,
'video': torch_imgs,
'question': question
}
pred_option = self.infer_data(
example,
' ',
question_prompt='\nOnly give the best option.',
answer_prompt='Best option:(',
system_q=False,
print_res=False,
system_llm=True
)
return_message = '(' + pred_option.split('\n')[0]
return return_message
elif dataset == 'MVBench' or dataset == 'MVBench_MP4':
_, video = self.message_to_promptvideo(message)
torch_imgs = self.read_video(video)
example = {
'subtitle': '',
'video': torch_imgs,
'question': message[1]['value']
}
pred_option = self.infer_data(
example,
message[0]['value'],
question_prompt='\nOnly give the best option.',
answer_prompt='Best option:(',
system_q=False,
print_res=False,
system_llm=True
)
return_message = '(' + pred_option.split('\n')[0]
return return_message
else:
question, video = self.message_to_promptvideo(message)
torch_imgs = self.read_video(video)
example = {
'subtitle': '',
'video': torch_imgs,
'question': f'Question:{question}\nAnswer:'
}
pred_result = self.infer_data(
example,
' ',
system_q=False,
print_res=False,
system_llm=False
)
return pred_result