from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
import os
import re
import json
from PIL import Image
import base64
from io import BytesIO
class ChatResponse(dict):
def __getattr__(self, name):
value = self.get(name)
if isinstance(value, dict):
return ChatResponse(value)  # if the value is a dict, recursively wrap it so attribute access keeps working
elif isinstance(value, list):
return [ChatResponse(v) if isinstance(v, dict) else v for v in value]  # if the value is a list, wrap any nested dicts
return value
def __setattr__(self, name, value):
self[name] = value
def __delattr__(self, name):
del self[name]
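# Usage sketch: ChatResponse allows attribute-style access into nested response dicts, e.g.
#   ChatResponse({'choices': [{'message': {'content': 'hi'}}]}).choices[0].message.content  # -> 'hi'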
from ..dataset import DATASET_TYPE
class TaichuVLWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'Taichu-VL-2B',
retry: int = 5,
wait: int = 5,
verbose: bool = True,
temperature: float = 0.0,
system_prompt: str = None,
max_tokens: int = 4096,
key: str = None,
url: str = None,
**kwargs):
self.model = model
self.kwargs = kwargs
self.max_tokens = max_tokens
self.system_prompt = '[sys]You are a helpful assistant.[/sys]'
self.hint_prompt = '|<Hint>|'
self.mcq_prompt = '|<MCQ>|'
self.datasets_use_system = ['MMVet']
self.datasets_use_multichoice = [
'MathVista', 'MathVision']
openai_key = os.environ.get('OPENAI_API_KEY', None)
# Env vars are strings, so parse truthy / falsy values explicitly
use_openai = str(os.environ.get('USE_OPENAI_EVAL', True)).lower() not in ('0', 'false', 'no')
self.use_openai_evaluate = isinstance(openai_key, str) and openai_key.startswith('sk-') and use_openai
self.api_key = os.environ.get('TAICHU_API_KEY', key)
self.api_url = url
assert self.api_key is not None, 'Please set the API Key'
super().__init__(wait=wait, retry=retry, system_prompt=self.system_prompt, verbose=verbose, **kwargs)
def set_dump_image(self, dump_image_func):
self.dump_image_func = dump_image_func
def dump_image(self, line, dataset):
return self.dump_image_func(line)
def use_custom_prompt(self, dataset):
if listinstr(['MCQ', 'VQA'], DATASET_TYPE(dataset)):
return True
elif dataset is not None and listinstr(['HallusionBench'], dataset):
return True
return False
def clear_prompt(self, prompt):
prompt = re.sub(r"Hint:.*?Question:", "", prompt, flags=re.S).strip()
prompt = re.sub(r"\nChoices:\n.*", "", prompt, flags=re.S).strip()
return prompt
def encode_image(self, pil_image):
buffer = BytesIO()
pil_image.save(buffer, format='PNG')
base64_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
return base64_str
def build_prompt(self, line, dataset=None):
if isinstance(line, int):
line = self.data.iloc[line]
tgt_path = self.dump_image(line, dataset)
question = line['question']
hint = None
if listinstr(self.datasets_use_system, dataset):
system_prompt = self.system_prompt
else:
system_prompt = ''
mcq = False
if DATASET_TYPE(dataset) == 'MCQ' or listinstr(self.datasets_use_multichoice, dataset):
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
if listinstr(self.datasets_use_multichoice, dataset):
options = {}
if not pd.isna(line['choices']):
for i, c in enumerate(eval(line['choices'])):
options[string.ascii_uppercase[i]] = c
question = self.clear_prompt(question)
# Use a Chinese options header for Chinese dataset variants
if listinstr(['_CN', '_cn'], dataset):
options_prompt = '\n选项:\n'
else:
options_prompt = '\nOPTIONS:\n'
options_prompt += '\n'.join(f"{key}:{value}" for key, value in options.items())
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
mcq = True if len(options) else False
if len(options):
prompt = question + options_prompt
else:
prompt = question
else:
prompt = question
msgs = []
if system_prompt:
msgs.append(dict(type='text', value=system_prompt))
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs.append(dict(type='image', value=tgt_path))
if hint:
prompt = 'Hint: ' + hint + '\n' + prompt
msgs.append(dict(type='text', value=prompt))
if mcq:
msgs.append(dict(type='text', value=self.mcq_prompt))
return msgs
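# For reference, build_prompt returns interleaved VLMEvalKit messages such as the
# following sketch (paths and text are illustrative):
#   [dict(type='image', value='/path/to/img_0.jpg'),
#    dict(type='text', value='Hint: ...\nQuestion ...\nOPTIONS:\nA:...'),
#    dict(type='text', value='|<MCQ>|')]   # marker consumed by prompt_to_request_messages below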
def prompt_to_request_messages(self, inputs):
messages = [
{'role': 'user', 'content': []}
]
is_mcq = False
for x in inputs:
if x['type'] == 'text':
if x['value'] == self.system_prompt:
messages = [{'role': 'system', 'content': [{"type": "text", "text": x['value']}]}] + messages
elif self.mcq_prompt == x['value']:
is_mcq = True
else:
messages[-1]['content'].append(
{"type": "text", "text": x['value']},
)
if x['type'] == 'image':
_url = self.encode_image(Image.open(x['value']))
messages[-1]['content'].append(
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{_url}"}},
)
else:
continue
return messages, is_mcq
def generate_inner(self, inputs, **kwargs) -> str:
messages, is_mcq = self.prompt_to_request_messages(inputs)
data = {
"model": self.model,
"messages": messages,
"max_tokens": self.max_tokens,
"temperature": 0,
"top_p": 0.8,
"stream": False,
"extra_body": {
"repetition_penalty": 1
}
}
headers = {
'Authorization': self.api_key,
'Content-Type': 'application/json'
}
try:
chat_response = requests.post(self.api_url, json=data, headers=headers)
response = ChatResponse(json.loads(chat_response.content))
result = response.choices[0].message.content
# When the ChatGPT judge is unavailable for MCQ answer extraction, fall back to
# exact matching: take the first character of the response as the option letter.
if not self.use_openai_evaluate and is_mcq:
try:
result = result[0]
except Exception:
result = 'A'
return 0, result, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(f'{type(err)}: {err}')
self.logger.error(f'The input messages are {inputs}.')
return -1, '', ''
class TaichuVLAPI(TaichuVLWrapper):
def generate(self, message, dataset=None):
return super(TaichuVLAPI, self).generate(message, dataset=dataset)
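# A minimal end-to-end sketch (not part of the evaluation pipeline). It assumes
# TAICHU_API_KEY is exported and that 'demo.jpg' is a locally available image;
# the endpoint URL mirrors the one registered for 'Taichu-VL-2B' in the config below.
if __name__ == '__main__':
    api = TaichuVLAPI(
        model='Taichu-VL-2B',
        url='https://platform.wair.ac.cn/api/v1/infer/10381/v1/chat/completions',
    )
    demo_msgs = [
        dict(type='image', value='demo.jpg'),
        dict(type='text', value='Describe this image briefly.'),
    ]
    print(api.generate(demo_msgs))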
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from vlmeval.dataset import DATASET_TYPE, img_root_map
class TaiyiWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'taiyi',
retry: int = 5,
wait: int = 5,
key: str = None,
verbose: bool = False,
system_prompt: str = None,
temperature: float = 0,
timeout: int = 60,
url: str = "https://taiyi.megvii.com/v1/chat/completions",
max_tokens: int = 1024,
**kwargs):
self.model = model
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
if key is None:
key = os.environ.get('TAIYI_API_KEY', None)
assert key is not None, ('Please set the API Key ')
self.key = key
self.timeout = timeout
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
assert url is not None, ('Please set the url ')
self.url = url
self.logger.info(f'Using url: {self.url}; API Key: {self.key}')
def use_custom_prompt(self, dataset):
if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'VQA':
return True
return False
def prepare_inputs(self, inputs):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(role='system', content=self.system_prompt))
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text':
content_list.append(dict(type='text', text=msg['value']))
elif msg['type'] == 'image':
imgbytes = open(msg['value'],'rb').read()
b64 = base64.b64encode(imgbytes).decode('ascii')
img_struct = dict(url=f'data:image/jpeg;base64,{b64}')
content_list.append(dict(type='image_url', image_url=img_struct))
input_msgs.append(dict(role='user', content=content_list))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
input_msgs.append(dict(role='user', content=text))
return input_msgs
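# Shape reference (sketch): for a mixed input such as
#   [dict(type='text', value='What is shown?'), dict(type='image', value='demo.jpg')]
# prepare_inputs produces OpenAI-style chat messages:
#   [{'role': 'user', 'content': [
#       {'type': 'text', 'text': 'What is shown?'},
#       {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,...'}}]}]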
def set_dump_image(self, dump_image_func):
self.dump_image_func = dump_image_func
def dump_image(self, line, dataset):
return self.dump_image_func(line)
def image_first(self, msgs):
nr_img = 0
for s in msgs:
if s['type'] == 'image':
nr_img += 1
if nr_img == 1:
new_msgs = []
img_msg = None
for s in msgs:
if s['type'] == 'text':
new_msgs.append(s)
else:
img_msg = s
new_msgs.insert(0, img_msg)
else:
new_msgs = msgs
return new_msgs
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += '\n请直接回答选项字母。' if cn_string(
prompt) else "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
return prompt
def build_yorn_prompt(self, line, dataset=None):
if listinstr(['HallusionBench'], dataset):
pre_prompt = 'Read the following question carefully, think and solve it step by step.\n\n'
else:
pre_prompt = ''
prompt = pre_prompt + line['question'] + ' Please answer yes or no as the final answer.'
return prompt
def build_vqa_prompt(self, line, dataset=None):
if listinstr(['OCRBench'], dataset):
pre_prompt = 'Carefully identify the text in the image and answer the question.\n\n'
else:
pre_prompt = ''
if listinstr(['MMVet'], dataset):
post_prompt = '\nAnswer this question in detail.'
else:
post_prompt = ''
prompt = pre_prompt + line['question'] + post_prompt
return prompt
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
elif DATASET_TYPE(dataset) == 'Y/N':
prompt = self.build_yorn_prompt(line, dataset)
elif DATASET_TYPE(dataset) == 'VQA':
prompt = self.build_vqa_prompt(line, dataset)
else:
raise RuntimeError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')
message = []
message.extend([dict(type='image', value=s) for s in tgt_path])
message.extend([dict(type='text', value=prompt)])
# Interleaved datasets (MMMU): split the interleaved content, then move the image to the front
if dataset.startswith('MMMU_'):
from .. import MMMUDataset
message = MMMUDataset.split_MMMU(message)
message = self.image_first(message)
return message
def generate_inner(self, inputs, **kwargs) -> str:
input_msgs = self.prepare_inputs(inputs)
temperature = kwargs.pop('temperature', self.temperature)
headers = {'Authorization': f'Bearer {self.key}'}
payload = dict(
model=self.model,
messages=input_msgs,
n=1,
temperature=temperature,
**kwargs)
response = requests.post(self.url, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
try:
resp_struct = json.loads(response.text)
answer = resp_struct['choices'][0]['message']['content'].strip()
except:
pass
return ret_code, answer, response
class TaiyiAPI(TaiyiWrapper):
def generate(self, message, dataset=None):
return super(TaiyiAPI, self).generate(message)
from vlmeval.vlm import *
from vlmeval.api import *
from functools import partial
PandaGPT_ROOT = None
MiniGPT4_ROOT = None
TransCore_ROOT = None
Yi_ROOT = None
OmniLMM_ROOT = None
Mini_Gemini_ROOT = None
VXVERSE_ROOT = None
VideoChat2_ROOT = None
VideoChatGPT_ROOT = None
PLLaVA_ROOT = None
RBDash_ROOT = None
LLAVA_V1_7B_MODEL_PTH = 'Please set your local path to LLaVA-7B-v1.1 here; the model weight is obtained by merging the LLaVA delta weight (https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md) with vicuna-7b-v1.1.'
video_models = {
'Video-LLaVA-7B':partial(VideoLLaVA, model_path='LanguageBind/Video-LLaVA-7B'),
'Video-LLaVA-7B-HF':partial(VideoLLaVA_HF, model_path='LanguageBind/Video-LLaVA-7B-hf'),
'VideoChat2-HD':partial(VideoChat2_HD, model_path='OpenGVLab/VideoChat2_HD_stage4_Mistral_7B', root=VideoChat2_ROOT, config_file='./vlmeval/vlm/video_llm/configs/videochat2_hd.json'),
'Chat-UniVi-7B': partial(Chatunivi, model_path="Chat-UniVi/Chat-UniVi"),
'Chat-UniVi-7B-v1.5': partial(Chatunivi, model_path="Chat-UniVi/Chat-UniVi-7B-v1.5"),
'LLaMA-VID-7B': partial(LLaMAVID, model_path='YanweiLi/llama-vid-7b-full-224-video-fps-1'),
'Video-ChatGPT': partial(VideoChatGPT, model_path='MBZUAI/Video-ChatGPT-7B', dir_root=VideoChatGPT_ROOT),
'PLLaVA-7B': partial(PLLaVA, model_path='ermu2001/pllava-7b', dir_root=PLLaVA_ROOT),
'PLLaVA-13B': partial(PLLaVA, model_path='ermu2001/pllava-13b', dir_root=PLLaVA_ROOT),
'PLLaVA-34B': partial(PLLaVA, model_path='ermu2001/pllava-34b', dir_root=PLLaVA_ROOT),
}
ungrouped = {
'TransCore_M': partial(TransCoreM, root=TransCore_ROOT),
'PandaGPT_13B': partial(PandaGPT, name='PandaGPT_13B', root=PandaGPT_ROOT),
'flamingov2': partial(OpenFlamingo, name='v2', mpt_pth='anas-awadalla/mpt-7b', ckpt_pth='openflamingo/OpenFlamingo-9B-vitl-mpt7b'),
'VisualGLM_6b': partial(VisualGLM, model_path='THUDM/visualglm-6b'),
'mPLUG-Owl2': partial(mPLUG_Owl2, model_path='MAGAer13/mplug-owl2-llama2-7b'),
'mPLUG-Owl3': partial(mPLUG_Owl3, model_path='mPLUG/mPLUG-Owl3-7B-240728'),
'emu2_chat': partial(Emu, model_path='BAAI/Emu2-Chat'),
'OmniLMM_12B': partial(OmniLMM12B, model_path='openbmb/OmniLMM-12B', root=OmniLMM_ROOT),
'MGM_7B': partial(Mini_Gemini, model_path='YanweiLi/MGM-7B-HD', root=Mini_Gemini_ROOT),
'Bunny-llama3-8B': partial(BunnyLLama3, model_path='BAAI/Bunny-v1_1-Llama-3-8B-V'),
'VXVERSE': partial(VXVERSE, model_name='XVERSE-V-13B', root=VXVERSE_ROOT),
'paligemma-3b-mix-448': partial(PaliGemma, model_path='google/paligemma-3b-mix-448'),
'360VL-70B': partial(QH_360VL, model_path='qihoo360/360VL-70B'),
'Llama-3-MixSenseV1_1': partial(LLama3Mixsense, model_path='Zero-Vision/Llama-3-MixSenseV1_1'),
'Parrot': partial(Parrot, model_path='AIDC-AI/Parrot-7B'),
'OmChat': partial(OmChat, model_path='omlab/omchat-v2.0-13B-single-beta_hf'),
'RBDash_72b': partial(RBDash, model_path='RBDash-Team/RBDash-v1.5', root=RBDash_ROOT),
'Pixtral-12B': partial(Pixtral, model_path='mistralai/Pixtral-12B-2409'),
'Falcon2-VLM-11B': partial(Falcon2VLM, model_path='tiiuae/falcon-11B-vlm')
}
api_models = {
# GPT
'GPT4V': partial(GPT4V, model='gpt-4-1106-vision-preview', temperature=0, img_size=512, img_detail='low', retry=10, verbose=False),
'GPT4V_HIGH': partial(GPT4V, model='gpt-4-1106-vision-preview', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
'GPT4V_20240409': partial(GPT4V, model='gpt-4-turbo-2024-04-09', temperature=0, img_size=512, img_detail='low', retry=10, verbose=False),
'GPT4V_20240409_HIGH': partial(GPT4V, model='gpt-4-turbo-2024-04-09', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
'GPT4o': partial(GPT4V, model='gpt-4o-2024-05-13', temperature=0, img_size=512, img_detail='low', retry=10, verbose=False),
'GPT4o_HIGH': partial(GPT4V, model='gpt-4o-2024-05-13', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
'GPT4o_20240806': partial(GPT4V, model='gpt-4o-2024-08-06', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
'GPT4o_20241120': partial(GPT4V, model='gpt-4o-2024-11-20', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
'GPT4o_MINI': partial(GPT4V, model='gpt-4o-mini-2024-07-18', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
# Gemini
'GeminiPro1-0': partial(GeminiProVision, model='gemini-1.0-pro', temperature=0, retry=10), # now GeminiPro1-0 is only supported by vertex backend
'GeminiPro1-5': partial(GeminiProVision, model='gemini-1.5-pro', temperature=0, retry=10),
'GeminiFlash1-5': partial(GeminiProVision, model='gemini-1.5-flash', temperature=0, retry=10),
'GeminiFlash2-0': partial(GeminiProVision, model='gemini-2.0-flash-exp', temperature=0, retry=10),
'GeminiPro1-5-002': partial(GPT4V, model='gemini-1.5-pro-002', temperature=0, retry=10), # Internal Use Only
'GeminiFlash1-5-002': partial(GPT4V, model='gemini-1.5-flash-002', temperature=0, retry=10), # Internal Use Only
# Qwen-VL
'QwenVLPlus': partial(QwenVLAPI, model='qwen-vl-plus', temperature=0, retry=10),
'QwenVLMax': partial(QwenVLAPI, model='qwen-vl-max', temperature=0, retry=10),
# Reka
'RekaEdge': partial(Reka, model='reka-edge-20240208'),
'RekaFlash': partial(Reka, model='reka-flash-20240226'),
'RekaCore': partial(Reka, model='reka-core-20240415'),
# Step1V
'Step1V': partial(GPT4V, model='step-1v-32k', api_base="https://api.stepfun.com/v1/chat/completions", temperature=0, retry=10, img_size=-1, img_detail='high'),
'Step1.5V-mini': partial(GPT4V, model='step-1.5v-mini', api_base="https://api.stepfun.com/v1/chat/completions", temperature=0, retry=10, img_size=-1, img_detail='high'),
# Yi-Vision
'Yi-Vision': partial(GPT4V, model='yi-vision', api_base="https://api.lingyiwanwu.com/v1/chat/completions", temperature=0, retry=10),
# Claude
'Claude3V_Opus': partial(Claude3V, model='claude-3-opus-20240229', temperature=0, retry=10, verbose=False),
'Claude3V_Sonnet': partial(Claude3V, model='claude-3-sonnet-20240229', temperature=0, retry=10, verbose=False),
'Claude3V_Haiku': partial(Claude3V, model='claude-3-haiku-20240307', temperature=0, retry=10, verbose=False),
'Claude3-5V_Sonnet': partial(Claude3V, model='claude-3-5-sonnet-20240620', temperature=0, retry=10, verbose=False),
'Claude3-5V_Sonnet_20241022': partial(Claude3V, model='claude-3-5-sonnet-20241022', temperature=0, retry=10, verbose=False),
# GLM4V
'GLM4V': partial(GLMVisionAPI, model='glm4v-biz-eval', temperature=0, retry=10),
'GLM4V_PLUS': partial(GLMVisionAPI, model='cogvlm-evaluation-241203', temperature=0, retry=10),
# MiniMax abab
'abab6.5s': partial(GPT4V, model='abab6.5s-chat', api_base='https://api.minimax.chat/v1/chat/completions', temperature=0, retry=10),
'abab7-preview': partial(GPT4V, model='abab7-chat-preview', api_base='https://api.minimax.chat/v1/chat/completions', temperature=0, retry=10),
# CongRong
'CloudWalk': partial(CWWrapper, model='cw-congrong-v1.5', temperature=0, retry=10),
# SenseChat-V
'SenseChat-Vision': partial(SenseChatVisionAPI, model='SenseChat-Vision', temperature=0, retry=10),
'HunYuan-Vision': partial(HunyuanVision, model='hunyuan-vision', temperature=0, retry=10),
'bailingMM': partial(bailingMMAPI, model='bailingMM-mini', temperature=0, retry=10),
# BlueLM-V
"BlueLM_V": partial(BlueLM_V_API, model='BlueLM-VL-v3.0', temperature=0, retry=10),
# JiuTian-VL
"JTVL": partial(JTVLChatAPI, model='jt-vl-chat', temperature=0, retry=10),
"Taiyi": partial(TaiyiAPI, model='taiyi', temperature=0, retry=10),
# TeleMM
'TeleMM': partial(TeleMMAPI, model='TeleAI/TeleMM', temperature=0, retry=10),
# lmdeploy api
'lmdeploy': partial(LMDeployAPI, api_base='http://0.0.0.0:23333/v1/chat/completions', temperature=0, retry=10),
# Taichu-VL
'Taichu-VL-2B': partial(TaichuVLAPI, model='Taichu-VL-2B', url='https://platform.wair.ac.cn/api/v1/infer/10381/v1/chat/completions'),
}
mmalaya_series = {
'MMAlaya': partial(MMAlaya, model_path='DataCanvas/MMAlaya'),
'MMAlaya2': partial(MMAlaya2, model_path='DataCanvas/MMAlaya2'),
}
minicpm_series = {
'MiniCPM-V': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V'),
'MiniCPM-V-2': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V-2'),
'MiniCPM-Llama3-V-2_5': partial(MiniCPM_Llama3_V, model_path='openbmb/MiniCPM-Llama3-V-2_5'),
'MiniCPM-V-2_6': partial(MiniCPM_V_2_6, model_path='openbmb/MiniCPM-V-2_6'),
}
xtuner_series = {
'llava-internlm2-7b': partial(LLaVA_XTuner, llm_path='internlm/internlm2-chat-7b', llava_path='xtuner/llava-internlm2-7b', visual_select_layer=-2, prompt_template='internlm2_chat'),
'llava-internlm2-20b': partial(LLaVA_XTuner, llm_path='internlm/internlm2-chat-20b', llava_path='xtuner/llava-internlm2-20b', visual_select_layer=-2, prompt_template='internlm2_chat'),
'llava-internlm-7b': partial(LLaVA_XTuner, llm_path='internlm/internlm-chat-7b', llava_path='xtuner/llava-internlm-7b', visual_select_layer=-2, prompt_template='internlm_chat'),
'llava-v1.5-7b-xtuner': partial(LLaVA_XTuner, llm_path='lmsys/vicuna-7b-v1.5', llava_path='xtuner/llava-v1.5-7b-xtuner', visual_select_layer=-2, prompt_template='vicuna'),
'llava-v1.5-13b-xtuner': partial(LLaVA_XTuner, llm_path='lmsys/vicuna-13b-v1.5', llava_path='xtuner/llava-v1.5-13b-xtuner', visual_select_layer=-2, prompt_template='vicuna'),
'llava-llama-3-8b': partial(LLaVA_XTuner, llm_path='xtuner/llava-llama-3-8b-v1_1', llava_path='xtuner/llava-llama-3-8b-v1_1', visual_select_layer=-2, prompt_template='llama3_chat'),
}
qwen_series = {
'qwen_base': partial(QwenVL, model_path='Qwen/Qwen-VL'),
'qwen_chat': partial(QwenVLChat, model_path='Qwen/Qwen-VL-Chat'),
'monkey': partial(Monkey, model_path='echo840/Monkey'),
'monkey-chat': partial(MonkeyChat, model_path='echo840/Monkey-Chat'),
'minimonkey': partial(MiniMonkey, model_path='mx262/MiniMonkey')
}
llava_series = {
'llava_v1.5_7b': partial(LLaVA, model_path='liuhaotian/llava-v1.5-7b'),
'llava_v1.5_13b': partial(LLaVA, model_path='liuhaotian/llava-v1.5-13b'),
'llava_v1_7b': partial(LLaVA, model_path=LLAVA_V1_7B_MODEL_PTH),
'sharegpt4v_7b': partial(LLaVA, model_path='Lin-Chen/ShareGPT4V-7B'),
'sharegpt4v_13b': partial(LLaVA, model_path='Lin-Chen/ShareGPT4V-13B'),
'llava_next_vicuna_7b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-vicuna-7b-hf'),
'llava_next_vicuna_13b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-vicuna-13b-hf'),
'llava_next_mistral_7b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-mistral-7b-hf'),
'llava_next_yi_34b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-34b-hf'),
'llava_next_llama3': partial(LLaVA_Next, model_path='llava-hf/llama3-llava-next-8b-hf'),
'llava_next_72b': partial(LLaVA_Next, model_path='llava-hf/llava-next-72b-hf'),
'llava_next_110b': partial(LLaVA_Next, model_path='llava-hf/llava-next-110b-hf'),
'llava_next_qwen_32b': partial(LLaVA_Next2, model_path='lmms-lab/llava-next-qwen-32b'),
'llava_next_interleave_7b': partial(LLaVA_Next, model_path='llava-hf/llava-interleave-qwen-7b-hf'),
'llava_next_interleave_7b_dpo': partial(LLaVA_Next, model_path='llava-hf/llava-interleave-qwen-7b-dpo-hf'),
'llava-onevision-qwen2-0.5b-ov-hf': partial(LLaVA_OneVision_HF, model_path='llava-hf/llava-onevision-qwen2-0.5b-ov-hf'),
'llava-onevision-qwen2-0.5b-si-hf': partial(LLaVA_OneVision_HF, model_path='llava-hf/llava-onevision-qwen2-0.5b-si-hf'),
'llava-onevision-qwen2-7b-ov-hf': partial(LLaVA_OneVision_HF, model_path='llava-hf/llava-onevision-qwen2-7b-ov-hf'),
'llava-onevision-qwen2-7b-si-hf': partial(LLaVA_OneVision_HF, model_path='llava-hf/llava-onevision-qwen2-7b-si-hf'),
'llava_onevision_qwen2_0.5b_si': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-0.5b-si'),
'llava_onevision_qwen2_7b_si': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-7b-si'),
'llava_onevision_qwen2_72b_si': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-72b-si'),
'llava_onevision_qwen2_0.5b_ov': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-0.5b-ov'),
'llava_onevision_qwen2_7b_ov': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-7b-ov'),
'llava_onevision_qwen2_72b_ov': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-72b-ov-sft'),
'Aquila-VL-2B': partial(LLaVA_OneVision, model_path='BAAI/Aquila-VL-2B-llava-qwen'),
'llava_video_qwen2_7b':partial(LLaVA_OneVision, model_path='lmms-lab/LLaVA-Video-7B-Qwen2'),
'llava_video_qwen2_72b':partial(LLaVA_OneVision, model_path='lmms-lab/LLaVA-Video-72B-Qwen2'),
'varco-vision-hf':partial(LLaVA_OneVision_HF, model_path='NCSOFT/VARCO-VISION-14B-HF'),
}
internvl_series = {
'InternVL-Chat-V1-1': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-1', version='V1.1'),
'InternVL-Chat-V1-2': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-2', version='V1.2'),
'InternVL-Chat-V1-2-Plus': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-2-Plus', version='V1.2'),
# InternVL1.5 series
'InternVL-Chat-V1-5': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-5', version='V1.5'),
'Mini-InternVL-Chat-2B-V1-5': partial(InternVLChat, model_path='OpenGVLab/Mini-InternVL-Chat-2B-V1-5', version='V1.5'),
'Mini-InternVL-Chat-4B-V1-5': partial(InternVLChat, model_path='OpenGVLab/Mini-InternVL-Chat-4B-V1-5', version='V1.5'),
# InternVL2 series
'InternVL2-1B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-1B', version='V2.0'),
'InternVL2-2B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-2B', version='V2.0'),
'InternVL2-4B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-4B', version='V2.0'),
'InternVL2-8B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-8B', version='V2.0'),
'InternVL2-26B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-26B', version='V2.0'),
'InternVL2-40B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-40B', version='V2.0'),
'InternVL2-76B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-Llama3-76B', version='V2.0'),
# InternVL2 MPO series
'InternVL2-8B-MPO': partial(InternVLChat, model_path='OpenGVLab/InternVL2-8B-MPO', version='V2.0'),
'InternVL2-8B-MPO-CoT': partial(InternVLChat, model_path='OpenGVLab/InternVL2-8B-MPO', version='V2.0', use_mpo_prompt=True),
# InternVL2.5 series
'InternVL2_5-1B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-1B', version='V2.0'),
'InternVL2_5-2B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-2B', version='V2.0'),
'InternVL2_5-4B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-4B', version='V2.0'),
'InternVL2_5-8B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-8B', version='V2.0'),
'InternVL2_5-26B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-26B', version='V2.0'),
'InternVL2_5-38B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-38B', version='V2.0'),
'InternVL2_5-78B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-78B', version='V2.0'),
}
sail_series = {
'SAIL-VL-2B': partial(SailVL, model_path='BytedanceDouyinContent/SAIL-VL-2B')
}
yivl_series = {
'Yi_VL_6B': partial(Yi_VL, model_path='01-ai/Yi-VL-6B', root=Yi_ROOT),
'Yi_VL_34B': partial(Yi_VL, model_path='01-ai/Yi-VL-34B', root=Yi_ROOT),
}
xcomposer_series = {
'XComposer': partial(XComposer, model_path='internlm/internlm-xcomposer-vl-7b'),
'sharecaptioner': partial(ShareCaptioner, model_path='Lin-Chen/ShareCaptioner'),
'XComposer2': partial(XComposer2, model_path='internlm/internlm-xcomposer2-vl-7b'),
'XComposer2_1.8b': partial(XComposer2, model_path='internlm/internlm-xcomposer2-vl-1_8b'),
'XComposer2_4KHD': partial(XComposer2_4KHD, model_path='internlm/internlm-xcomposer2-4khd-7b'),
'XComposer2d5': partial(XComposer2d5, model_path='internlm/internlm-xcomposer2d5-7b'),
}
minigpt4_series = {
'MiniGPT-4-v2': partial(MiniGPT4, mode='v2', root=MiniGPT4_ROOT),
'MiniGPT-4-v1-7B': partial(MiniGPT4, mode='v1_7b', root=MiniGPT4_ROOT),
'MiniGPT-4-v1-13B': partial(MiniGPT4, mode='v1_13b', root=MiniGPT4_ROOT),
}
idefics_series = {
'idefics_9b_instruct': partial(IDEFICS, model_path='HuggingFaceM4/idefics-9b-instruct'),
'idefics_80b_instruct': partial(IDEFICS, model_path='HuggingFaceM4/idefics-80b-instruct'),
'idefics2_8b': partial(IDEFICS2, model_path='HuggingFaceM4/idefics2-8b'),
# Idefics3 follows Idefics2 Pattern
'Idefics3-8B-Llama3': partial(IDEFICS2, model_path='HuggingFaceM4/Idefics3-8B-Llama3'),
}
smolvlm_series = {
'SmolVLM': partial(SmolVLM, model_path='HuggingFaceTB/SmolVLM-Instruct'),
'SmolVLM-DPO': partial(SmolVLM, model_path='HuggingFaceTB/SmolVLM-Instruct-DPO'),
'SmolVLM-Synthetic': partial(SmolVLM, model_path='HuggingFaceTB/SmolVLM-Instruct'),
}
instructblip_series = {
'instructblip_7b': partial(InstructBLIP, name='instructblip_7b'),
'instructblip_13b': partial(InstructBLIP, name='instructblip_13b'),
}
deepseekvl_series = {
'deepseek_vl_7b': partial(DeepSeekVL, model_path='deepseek-ai/deepseek-vl-7b-chat'),
'deepseek_vl_1.3b': partial(DeepSeekVL, model_path='deepseek-ai/deepseek-vl-1.3b-chat'),
}
janus_series = {
'Janus-1.3B': partial(Janus, model_path='deepseek-ai/Janus-1.3B')
}
cogvlm_series = {
'cogvlm-grounding-generalist': partial(CogVlm, model_path='THUDM/cogvlm-grounding-generalist-hf', tokenizer_name='lmsys/vicuna-7b-v1.5'),
'cogvlm-chat': partial(CogVlm, model_path='THUDM/cogvlm-chat-hf', tokenizer_name='lmsys/vicuna-7b-v1.5'),
'cogvlm2-llama3-chat-19B': partial(CogVlm, model_path='THUDM/cogvlm2-llama3-chat-19B'),
'glm-4v-9b': partial(GLM4v, model_path='THUDM/glm-4v-9b')
}
wemm_series = {
'WeMM': partial(WeMM, model_path='feipengma/WeMM'),
}
cambrian_series = {
'cambrian_8b': partial(Cambrian, model_path='nyu-visionx/cambrian-8b'),
'cambrian_13b': partial(Cambrian, model_path='nyu-visionx/cambrian-13b'),
'cambrian_34b': partial(Cambrian, model_path='nyu-visionx/cambrian-34b'),
}
chameleon_series = {
'chameleon_7b': partial(Chameleon, model_path='facebook/chameleon-7b'),
'chameleon_30b': partial(Chameleon, model_path='facebook/chameleon-30b'),
}
vila_series = {
'VILA1.5-3b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-3b'),
'Llama-3-VILA1.5-8b': partial(VILA, model_path='Efficient-Large-Model/Llama-3-VILA1.5-8b'),
'VILA1.5-13b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-13b'),
'VILA1.5-40b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-40b'),
}
ovis_series = {
'Ovis1.5-Llama3-8B': partial(Ovis, model_path='AIDC-AI/Ovis1.5-Llama3-8B'),
'Ovis1.5-Gemma2-9B': partial(Ovis, model_path='AIDC-AI/Ovis1.5-Gemma2-9B'),
'Ovis1.6-Gemma2-9B': partial(Ovis1_6, model_path='AIDC-AI/Ovis1.6-Gemma2-9B'),
'Ovis1.6-Llama3.2-3B': partial(Ovis1_6, model_path='AIDC-AI/Ovis1.6-Llama3.2-3B'),
'Ovis1.6-Gemma2-27B': partial(Ovis1_6_Plus, model_path='AIDC-AI/Ovis1.6-Gemma2-27B')
}
mantis_series = {
'Mantis-8B-siglip-llama3': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-siglip-llama3'),
'Mantis-8B-clip-llama3': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-clip-llama3'),
'Mantis-8B-Idefics2': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-Idefics2'),
'Mantis-8B-Fuyu': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-Fuyu')
}
phi3_series = {
'Phi-3-Vision': partial(Phi3Vision, model_path='microsoft/Phi-3-vision-128k-instruct'),
'Phi-3.5-Vision': partial(Phi3_5Vision, model_path='microsoft/Phi-3.5-vision-instruct')
}
xgen_mm_series = {
'xgen-mm-phi3-interleave-r-v1.5': partial(XGenMM, model_path='Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5'),
'xgen-mm-phi3-dpo-r-v1.5': partial(XGenMM, model_path='Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5'),
}
qwen2vl_series = {
'Qwen-VL-Max-0809': partial(Qwen2VLAPI, model='qwen-vl-max-0809', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen-VL-Plus-0809': partial(Qwen2VLAPI, model='qwen-vl-plus-0809', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-72B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-72B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct-AWQ': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-AWQ', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct-GPTQ-Int4': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct-GPTQ-Int8': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-2B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-2B-Instruct-AWQ': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct-AWQ', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-2B-Instruct-GPTQ-Int4': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-2B-Instruct-GPTQ-Int8': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8', min_pixels=1280*28*28, max_pixels=16384*28*28),
'XinYuan-VL-2B-Instruct': partial(Qwen2VLChat, model_path='Cylingo/Xinyuan-VL-2B', min_pixels=1280*28*28, max_pixels=16384*28*28),
}
slime_series = {
'Slime-7B': partial(SliME, model_path='yifanzhang114/SliME-vicuna-7B'),
'Slime-8B': partial(SliME, model_path='yifanzhang114/SliME-Llama3-8B'),
'Slime-13B': partial(SliME, model_path='yifanzhang114/SliME-vicuna-13B'),
}
eagle_series={
'Eagle-X4-8B-Plus': partial(Eagle, model_path='NVEagle/Eagle-X4-8B-Plus'),
'Eagle-X4-13B-Plus': partial(Eagle, model_path='NVEagle/Eagle-X4-13B-Plus'),
'Eagle-X5-7B': partial(Eagle, model_path='NVEagle/Eagle-X5-7B'),
'Eagle-X5-13B': partial(Eagle, model_path='NVEagle/Eagle-X5-13B'),
'Eagle-X5-13B-Chat': partial(Eagle, model_path='NVEagle/Eagle-X5-13B-Chat'),
'Eagle-X5-34B-Chat': partial(Eagle, model_path='NVEagle/Eagle-X5-34B-Chat'),
'Eagle-X5-34B-Plus': partial(Eagle, model_path='NVEagle/Eagle-X5-34B-Plus'),
}
moondream_series={
'Moondream1': partial(Moondream1, model_path='vikhyatk/moondream1'),
'Moondream2': partial(Moondream2, model_path='vikhyatk/moondream2'),
}
llama_series={
'Llama-3.2-11B-Vision-Instruct': partial(llama_vision, model_path='meta-llama/Llama-3.2-11B-Vision-Instruct'),
'LLaVA-CoT': partial(llama_vision, model_path='Xkev/Llama-3.2V-11B-cot'),
'Llama-3.2-90B-Vision-Instruct': partial(llama_vision, model_path='meta-llama/Llama-3.2-90B-Vision-Instruct'),
}
molmo_series={
'molmoE-1B-0924': partial(molmo, model_path='allenai/MolmoE-1B-0924'),
'molmo-7B-D-0924': partial(molmo, model_path='allenai/Molmo-7B-D-0924'),
'molmo-7B-O-0924': partial(molmo, model_path='allenai/Molmo-7B-O-0924'),
'molmo-72B-0924': partial(molmo, model_path='allenai/Molmo-72B-0924'),
}
kosmos_series={
'Kosmos2': partial(Kosmos2, model_path='microsoft/kosmos-2-patch14-224')
}
points_series = {
'POINTS-Yi-1.5-9B-Chat': partial(POINTS, model_path='WePOINTS/POINTS-Yi-1-5-9B-Chat'),
'POINTS-Qwen-2.5-7B-Chat': partial(POINTS, model_path='WePOINTS/POINTS-Qwen-2-5-7B-Chat'),
'POINTSV15-Qwen-2.5-7B-Chat': partial(POINTSV15, model_path='WePOINTS/POINTS-1-5-Qwen-2-5-7B-Chat'),
}
nvlm_series = {
'NVLM': partial(NVLM, model_path='nvidia/NVLM-D-72B'),
}
vintern_series = {
'Vintern-3B-beta': partial(VinternChat, model_path='5CD-AI/Vintern-3B-beta'),
'Vintern-1B-v2': partial(VinternChat, model_path='5CD-AI/Vintern-1B-v2'),
}
aria_series = {
"Aria": partial(Aria, model_path='rhymes-ai/Aria')
}
h2ovl_series = {
'h2ovl-mississippi-2b': partial(H2OVLChat, model_path='h2oai/h2ovl-mississippi-2b'),
'h2ovl-mississippi-1b': partial(H2OVLChat, model_path='h2oai/h2ovl-mississippi-800m'),
}
valley_series = {
'valley_eagle': partial(ValleyEagleChat, model_path='bytedance-research/Valley-Eagle-7B'),
}
supported_VLM = {}
model_groups = [
ungrouped, api_models,
xtuner_series, qwen_series, llava_series, internvl_series, yivl_series,
xcomposer_series, minigpt4_series, idefics_series, instructblip_series,
deepseekvl_series, janus_series, minicpm_series, cogvlm_series, wemm_series,
cambrian_series, chameleon_series, video_models, ovis_series, vila_series,
mantis_series, mmalaya_series, phi3_series, xgen_mm_series, qwen2vl_series,
slime_series, eagle_series, moondream_series, llama_series, molmo_series,
kosmos_series, points_series, nvlm_series, vintern_series, h2ovl_series, aria_series,
smolvlm_series, sail_series, valley_series
]
for grp in model_groups:
supported_VLM.update(grp)
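# Lookup sketch: entries in supported_VLM are functools.partial constructors, so a
# model is instantiated by calling the stored partial (weights / API keys for the
# chosen entry are assumed to be available in the environment):
#   model = supported_VLM['Taichu-VL-2B']()
#   answer = model.generate([dict(type='text', value='Hello')])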
import warnings
from .image_base import img_root_map, ImageBaseDataset
from .image_caption import ImageCaptionDataset
from .image_yorn import ImageYORNDataset
from .image_mcq import (
ImageMCQDataset, MMMUDataset, CustomMCQDataset, MUIRDataset, GMAIMMBenchDataset, MMERealWorld, HRBenchDataset,
NaturalBenchDataset
)
from .image_mt import MMDUDataset
from .image_vqa import (
ImageVQADataset, MathVision, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
CustomVQADataset, CRPE, MathVerse, OlympiadBench, QSpatial, VizWiz, MMNIAH
)
from .text_mcq import CustomTextMCQDataset, TextMCQDataset
from .vcr import VCRDataset
from .mmlongbench import MMLongBench
from .dude import DUDE
from .slidevqa import SlideVQA
from .mmbench_video import MMBenchVideo
from .videomme import VideoMME
from .mvbench import MVBench, MVBench_MP4
from .mlvu import MLVU, MLVU_MCQ, MLVU_OpenEnded
from .tempcompass import TempCompass, TempCompass_Captioning, TempCompass_MCQ, TempCompass_YorN
from .longvideobench import LongVideoBench
from .video_concat_dataset import ConcatVideoDataset
from .mmgenbench import MMGenBench
from .miabench import MIABench
from .cmmmu import CMMMU
from .wildvision import WildVision
from .mmmath import MMMath
from .dynamath import Dynamath
from .utils import *
from .video_dataset_config import *
from ..smp import *
class ConcatDataset(ImageBaseDataset):
# This dataset takes multiple dataset names as input and aggregate them into a single dataset.
# Each single dataset should not have a field named `SUB_DATASET`
DATASET_SETS = {
'MMMB': ['MMMB_ar', 'MMMB_cn', 'MMMB_en', 'MMMB_pt', 'MMMB_ru', 'MMMB_tr'],
'MTL_MMBench_DEV': [
'MMBench_dev_ar', 'MMBench_dev_cn', 'MMBench_dev_en',
'MMBench_dev_pt', 'MMBench_dev_ru', 'MMBench_dev_tr'
]
}
def __init__(self, dataset):
datasets = self.DATASET_SETS[dataset]
self.dataset_map = {}
# The name of the compilation
self.dataset_name = dataset
self.datasets = datasets
for dname in datasets:
dataset = build_dataset(dname)
assert dataset is not None, dataset
self.dataset_map[dname] = dataset
TYPES = [x.TYPE for x in self.dataset_map.values()]
MODALITIES = [x.MODALITY for x in self.dataset_map.values()]
assert np.all([x == TYPES[0] for x in TYPES]), (datasets, TYPES)
assert np.all([x == MODALITIES[0] for x in MODALITIES]), (datasets, MODALITIES)
self.TYPE = TYPES[0]
self.MODALITY = MODALITIES[0]
data_all = []
for dname in datasets:
data = self.dataset_map[dname].data
data['SUB_DATASET'] = [dname] * len(data)
data_new = localize_df(data, dname, nproc=16)
data_all.append(data_new)
data = pd.concat(data_all)
data['original_index'] = data.pop('index')
data['index'] = np.arange(len(data))
self.data = data
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
idx = line['original_index']
dname = line['SUB_DATASET']
org_data = self.dataset_map[dname].data
org_line = cp.deepcopy(org_data[org_data['index'] == idx]).iloc[0]
return self.dataset_map[dname].build_prompt(org_line)
def dump_image(self, line):
# Assert all images are pre-dumped
assert 'image' not in line
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
@classmethod
def supported_datasets(cls):
return list(cls.DATASET_SETS)
def evaluate(self, eval_file, **judge_kwargs):
suffix = eval_file.split('.')[-1]
# First, split the eval_file by dataset
data_all = load(eval_file)
for dname in self.datasets:
tgt = eval_file.replace(self.dataset_name, dname)
data_sub = data_all[data_all['SUB_DATASET'] == dname]
data_sub.pop('index')
data_sub['index'] = data_sub.pop('original_index')
data_sub.pop('SUB_DATASET')
dump(data_sub, tgt)
# Then, evaluate each dataset separately
results_all = []
for dname in self.datasets:
tgt = eval_file.replace(self.dataset_name, dname)
res = self.dataset_map[dname].evaluate(tgt, **judge_kwargs)
assert isinstance(res, pd.DataFrame)
res['DATASET'] = [dname] * len(res)
results_all.append(res)
result = pd.concat(results_all)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(result, score_file)
return result
# Add new supported dataset class here
IMAGE_DATASET = [
ImageCaptionDataset, ImageYORNDataset, ImageMCQDataset, ImageVQADataset, MathVision,
MMMUDataset, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset,
GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, CRPE, MathVerse, NaturalBenchDataset,
MIABench, OlympiadBench, WildVision, MMMath, QSpatial, Dynamath, MMGenBench, VizWiz, MMNIAH,
CMMMU
]
VIDEO_DATASET = [
MMBenchVideo, VideoMME, MVBench, MVBench_MP4, LongVideoBench,
MLVU, MLVU_MCQ, MLVU_OpenEnded,
TempCompass, TempCompass_MCQ, TempCompass_Captioning, TempCompass_YorN
]
TEXT_DATASET = [
TextMCQDataset
]
CUSTOM_DATASET = [
CustomMCQDataset, CustomVQADataset, CustomTextMCQDataset
]
DATASET_COLLECTION = [ConcatDataset, ConcatVideoDataset]
DATASET_CLASSES = IMAGE_DATASET + VIDEO_DATASET + TEXT_DATASET + CUSTOM_DATASET + DATASET_COLLECTION
SUPPORTED_DATASETS = []
for DATASET_CLS in DATASET_CLASSES:
SUPPORTED_DATASETS.extend(DATASET_CLS.supported_datasets())
def DATASET_TYPE(dataset, *, default: str = 'MCQ') -> str:
for cls in DATASET_CLASSES:
if dataset in cls.supported_datasets():
if hasattr(cls, 'TYPE'):
return cls.TYPE
# Special-case ConcatDataset collections: derive the type from their member datasets
if dataset in ConcatDataset.DATASET_SETS:
dataset_list = ConcatDataset.DATASET_SETS[dataset]
TYPES = [DATASET_TYPE(dname) for dname in dataset_list]
assert np.all([x == TYPES[0] for x in TYPES]), (dataset_list, TYPES)
return TYPES[0]
if 'openended' in dataset.lower():
return 'VQA'
warnings.warn(f'Dataset {dataset} is a custom one and not annotated as `openended`, will treat as {default}. ')
return default
def DATASET_MODALITY(dataset, *, default: str = 'IMAGE') -> str:
if dataset is None:
warnings.warn(f'Dataset is not specified, will treat modality as {default}. ')
return default
for cls in DATASET_CLASSES:
if dataset in cls.supported_datasets():
if hasattr(cls, 'MODALITY'):
return cls.MODALITY
# Special-case ConcatDataset collections: derive the modality from their member datasets
if dataset in ConcatDataset.DATASET_SETS:
dataset_list = ConcatDataset.DATASET_SETS[dataset]
MODALITIES = [DATASET_MODALITY(dname) for dname in dataset_list]
assert np.all([x == MODALITIES[0] for x in MODALITIES]), (dataset_list, MODALITIES)
return MODALITIES[0]
if 'video' in dataset.lower():
return 'VIDEO'
elif 'image' in dataset.lower():
return 'IMAGE'
warnings.warn(f'Dataset {dataset} is a custom one, will treat modality as {default}. ')
return default
def build_dataset(dataset_name, **kwargs):
for cls in DATASET_CLASSES:
if dataset_name in supported_video_datasets:
return supported_video_datasets[dataset_name](**kwargs)
elif dataset_name in cls.supported_datasets():
return cls(dataset=dataset_name, **kwargs)
warnings.warn(f'Dataset {dataset_name} is not officially supported. ')
data_file = osp.join(LMUDataRoot(), f'{dataset_name}.tsv')
if not osp.exists(data_file):
warnings.warn(f'Data file {data_file} does not exist. Dataset building failed. ')
return None
data = load(data_file)
if 'question' not in [x.lower() for x in data.columns]:
warnings.warn(f'Data file {data_file} does not have a `question` column. Dataset building failed. ')
return None
if 'A' in data and 'B' in data:
if 'image' in data or 'image_path' in data:
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom MCQ dataset. ')
return CustomMCQDataset(dataset=dataset_name, **kwargs)
else:
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom Text MCQ dataset. ')
return CustomTextMCQDataset(dataset=dataset_name, **kwargs)
else:
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom VQA dataset. ')
return CustomVQADataset(dataset=dataset_name, **kwargs)
__all__ = [
'build_dataset', 'img_root_map', 'build_judge', 'extract_answer_from_item', 'prefetch_answer', 'DEBUG_MESSAGE'
] + [cls.__name__ for cls in DATASET_CLASSES]
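# Usage sketch (dataset TSVs are downloaded to LMUDataRoot() on first use;
# 'MMVet' is just an example name from the supported list):
#   dataset = build_dataset('MMVet')
#   print(DATASET_TYPE('MMVet'), DATASET_MODALITY('MMVet'))
#   msgs = dataset.build_prompt(0)   # messages for the first sample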
from .image_base import ImageBaseDataset
import random
from collections import Counter
import os
import re
import tempfile
from ..smp import *
def get_multi_choice_prediction(response, all_choices, index2ans):
for char in [',', '.', '!', '?', ';', ':', "'"]:
response = response.strip(char)
response = " " + response + " " # add space to avoid partial match
candidates = []
for choice in all_choices: # (A) (B) (C) (D)
# Add the choice to candidates each time it appears in the response
candidates.extend([choice for _ in range(response.count(f'({choice})'))])
if len(candidates) == 0:
for choice in all_choices: # A B C D
# Similarly, add the choice for each occurrence
candidates.extend([choice for _ in range(response.count(f'{choice}'))])
if len(candidates) == 0 and len(response.split()) >= 1:
for index, ans in index2ans.items():
# Add index for each occurrence of ans in response
candidates.extend([index for _ in range(response.count(ans))])
# If the above still yields no candidates, fall back to substring matching of the option contents
if len(candidates) == 0 and len(response.split()) >= 1:
for index, ans in index2ans.items():
if ans in response:
candidates.append(index)
# index_ans = False # it's content ans.
if len(candidates) == 0: # still not get answer, randomly choose one.
return random.choice(all_choices)
# return ''
else:
# Count the occurrence of each candidate
candidate_counts = Counter(candidates)
# Select the most frequent candidates
max_count = max(candidate_counts.values())
most_frequent_candidates = [c for c in all_choices if candidate_counts.get(c, 0) == max_count]
# Combine the most frequent candidates in ABCD order
return ''.join(most_frequent_candidates)
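# Quick example of the matching behaviour above:
#   get_multi_choice_prediction('正确答案是 (B)。', ['A', 'B', 'C', 'D'],
#                               {'A': '猫', 'B': '狗', 'C': '鸟', 'D': '鱼'})  # -> 'B'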
def extract_numbers(string):
# Pattern for numbers with comma thousands separators
pattern_commas = r'-?\d{1,3}(?:,\d{3})+'
# Pattern for scientific notation
pattern_scientific = r'-?\d+(?:\.\d+)?[eE][+-]?\d+'
# Pattern for plain numbers without comma separators
pattern_simple = r'-?(?:\d+\.\d+|\.\d+|\d+)(?![eE][+-]?\d+)(?!,\d)'
# Extract numbers with comma thousands separators
numbers_with_commas = re.findall(pattern_commas, string)
# Extract numbers in scientific notation
numbers_scientific = re.findall(pattern_scientific, string)
# Extract plain numbers without comma separators
numbers_simple = re.findall(pattern_simple, string)
# Combine all extracted numbers
all_numbers = numbers_with_commas + numbers_scientific + numbers_simple
return all_numbers
def check_is_number(string):
try:
float(string.replace(',', ''))
return True
except ValueError:
# check if there's comma inside
return False
def count_letters(string):
return sum(c.isalpha() and 'a' <= c <= 'z' or 'A' <= c <= 'Z' for c in string)
def normalize_str(string, answer):
# If the string parses as a number, normalize it to a float rounded to 2 decimals; otherwise keep it as a (length-checked) string.
if string is None:
return [string]
string = string.strip()
is_number = check_is_number(string)
if is_number:
string = string.replace(',', '')
string = float(string)
# leave 2 decimal
string = round(string, 2)
return [string]
else: # it's likely to be a string
if len(string) > len(answer) + 20 or count_letters(string) > count_letters(answer) + 2:
return []
return [string]
def get_fill_blank_prediction(response, answer):
"""get the prediction from the generated response,
return a list of predicted strings or numbers"""
def get_key_subresponses(response):
response = response.strip("。").strip()
sub_responses = re.split(r'。|\n', response)
indicators_of_keys = ['是', '为', '所以', '等于', '方案', '选择',
'正确答案', '因此', '最后', '答案', '结果']
key_responses = []
for index, resp in enumerate(sub_responses):
# if last one, accept it's an equation (the entire response can be just one sentence with equation)
if index == len(sub_responses) - 1:
indicators_of_keys.extend(['='])
shortest_key_response = None
# the shortest response that may contain the answer (tail part of the response)
for indicator in indicators_of_keys:
if indicator in resp:
if not shortest_key_response:
shortest_key_response = resp.split(indicator)[-1].strip()
else:
if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
shortest_key_response = resp.split(indicator)[-1].strip()
if shortest_key_response:
# and it's not trivial
if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]:
key_responses.append(shortest_key_response)
if len(key_responses) == 0:  # did not find any
return [response]
return key_responses
key_responses = get_key_subresponses(response)
pred_list = key_responses.copy() # keep the original string response
for resp in key_responses:
pred_list.extend(extract_numbers(resp))
tmp_pred_list = []
for i in range(len(pred_list)):
tmp_pred_list.extend(normalize_str(pred_list[i], answer))
pred_list = tmp_pred_list
# remove duplicates
pred_list = list(set(pred_list))
return pred_list
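# Example: the answer-bearing tail after the indicator '是' is extracted and
# normalized to a rounded float:
#   get_fill_blank_prediction('所以答案是 42。', '42')  # -> [42.0]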
def get_TF_prediction(response):
"""get the prediction from the generated response,
return a list of predicted strings or numbers"""
def get_key_subresponses(response):
response = response.strip("。").strip()
sub_responses = re.split(r'。|\n', response)
indicators_of_keys = ['是', '为', '所以', '判断',
'陈述', '说法', '表达', '答案', '结果']
key_responses = []
for index, resp in enumerate(sub_responses):
shortest_key_response = None
# the shortest response that may contain the answer (tail part of the response)
for indicator in indicators_of_keys:
if indicator in resp:
if not shortest_key_response:
shortest_key_response = resp.split(indicator)[-1].strip()
else:
if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
shortest_key_response = resp.split(indicator)[-1].strip()
if shortest_key_response:
# and it's not trivial
if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]:
key_responses.append(shortest_key_response)
if len(key_responses) == 0:  # did not find any
return [response]
return key_responses
key_responses = get_key_subresponses(response)
pred_list = key_responses.copy() # keep the original string response
# remove duplicates
pred_list = list(set(pred_list))
return pred_list
class CMMMU(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'CMMMU_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/CMMMU_VAL.tsv'
}
DATASET_MD5 = {
'CMMMU_VAL': 'b4727e2fce2415bf646379e60c11a726'
}
def dump_image(self, line):
os.makedirs(self.img_root, exist_ok=True)
tgt_path_z = []
if isinstance(line['image'], list):
for i in range(len(line['image'])):
tgt_path = osp.join(self.img_root, f"{line['index']}--{i + 1}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'][i], tgt_path)
tgt_path_z.append(tgt_path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path_z.append(tgt_path)
return tgt_path_z
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
suffix = eval_file.split('.')[-1]
result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
if not osp.exists(result_file):
data = load(eval_file)
assert 'answer' in data and 'prediction' in data
data['prediction'] = [str(x) for x in data['prediction']]
data['answer'] = [str(x) for x in data['answer']]
correct_count = 0
correct_category = {
'技术与工程': [0, 0],
'科学': [0, 0],
'健康与医学': [0, 0],
'商业': [0, 0],
'艺术与设计': [0, 0],
'人文社会科学': [0, 0],
}
for i in tqdm(data.iterrows()):
line = i[1]
correct_category[line['category']][0] += 1
# Options
if line['type'] == '选择':
index2ans = {
'A': line['option1'],
'B': line['option2'],
'C': line['option3'],
'D': line['option4']
}
fact_option = get_multi_choice_prediction(line['prediction'], ['A', 'B', 'C', 'D'], index2ans)
if fact_option == line['answer']:
correct_count += 1
correct_category[line['category']][1] += 1
# Binary
elif line['type'] == '判断':
positive_keywords = ['正确', '对', '准确', '肯定', '对的']
negative_keywords = ['不对', '错误', '不正确', '不准确', '不合适', '否定', '错的', '错']
ambiguous_keywords = ['对错', '是否正确', '否正确', '或者', '是否', '正确性', '对不']
def judge_similarity(pred_list, positive_keywords, negative_keywords):
positive_count = 0
negative_count = 0
for pred in pred_list:
if any(pos_word in pred for pos_word in positive_keywords):
positive_count += 1
elif any(neg_word in pred for neg_word in negative_keywords):
negative_count += 1
if positive_count > negative_count:
return "对"
elif negative_count > positive_count:
return "错"
else:
return random.choice(['对', '错'])
answer = get_TF_prediction(line['prediction'])
answer = [word for word in answer if not any(ambiguous in word for ambiguous in ambiguous_keywords)]
fact_answer = judge_similarity(answer, positive_keywords, negative_keywords)
if fact_answer == line['answer']:
correct_count += 1
correct_category[line['category']][1] += 1
# Fill the Blank
else:
norm_answers = normalize_str(line['answer'], line['answer'])
predicted_answer = get_fill_blank_prediction(line['prediction'], line['answer'])
for pred in predicted_answer:
# already normalized
if isinstance(pred, str): # if it's a string, then find if ans in the pred_i
for norm_ans in norm_answers:
# only see if the string answer in the string pred
# print(norm_ans, pred)
if isinstance(norm_ans, str) and norm_ans in pred:
correct_count += 1
correct_category[line['category']][1] += 1
else: # it's a number
if pred in norm_answers:
correct_count += 1
correct_category[line['category']][1] += 1
accuracyz = {}
accuracyz['总准确率'] = correct_count / len(data)
for i in correct_category.keys():
accuracyz[i] = correct_category[i][1] / correct_category[i][0]
accuracyz = d2df(accuracyz)
accuracyz = accuracyz.round(10)  # round() returns a new DataFrame rather than modifying in place
dump(accuracyz, result_file)
result = pd.read_csv(result_file)
return result
def build_prompt(self, line):
if line['type'] == '选择':
tgt_path = self.dump_image(line)
question = line['question']
options_prompt = 'Options:\n'
for i in [['A', '1'], ['B', '2'], ['C', '3'], ['D', '4']]:
options_prompt += i[0] + '. ' + line['option' + i[1]] + '\n'
prompt = (f'问题: {question}\n' + options_prompt
+ '请回答上述多项选择题,并选出正确选项。这些题目可能包括单选和多选题型。如果所提供的信息不足以确定一个明确的答案,那么请根据可用的数据和你的判断来选择最可能正确的选项。')
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
elif line['type'] == '判断':
msgs = super().build_prompt(line)
assert msgs[-1]['type'] == 'text'
msgs[-1]['value'] += '\n请回答上述判断题,并根据题目描述和所给的信息来判断问题中陈述的对错。如果信息不完整或不足以作出绝对判断,请运用你的逻辑推理和现有信息来做出最可能的判断。'
return msgs
else:
msgs = super().build_prompt(line)
assert msgs[-1]['type'] == 'text'
msgs[-1]['value'] += '\n请回答上述填空题,并根据题目的要求和所提供的信息来给出最恰当的答案。如果信息不足以确切回答,那么请依据现有的数据和你的推理能力来填写最合理的答案。'
return msgs
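# Offline scoring sketch (the .xlsx name below is hypothetical; it is the
# prediction file produced by a prior inference run):
#   CMMMU('CMMMU_VAL').evaluate('CMMMU_VAL_model.xlsx')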
import math
from typing import List
from .utils.judge_util import build_judge
from .image_base import ImageBaseDataset
from .mmlongbench import concat_images, MMLongBench_auxeval, anls_compute
from ..smp import *
FAIL_MSG = 'Failed to obtain answer via API.'
def DUDE_acc(result_file):
data = load(result_file)
overall_score = 0.0
score_list = list()
for i in range(len(data)):
item = data.iloc[i]
if isinstance(item['answer'], float) and math.isnan(item['answer']):
item['answer'] = 'Not answerable'
item['answer'] = item['answer'].lower()
item['pred'] = item['pred'].lower()
score = anls_compute(item['answer'], item['pred'])
score_list.append(score)
overall_score += score
data['score'] = score_list
dump(data, result_file)
res = dict()
res['category'], res['num'], res['avg_score'] = ['anls'], [len(data)], [overall_score / len(data)]
res = pd.DataFrame(res)
return res
class DUDE(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'DUDE': 'https://opencompass.openxlab.space/utils/VLMEval/DUDE.tsv',
'DUDE_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/DUDE_MINI.tsv',
}
DATASET_MD5 = {
'DUDE': '130d860d08206e1e407cd77150c10d88',
'DUDE_MINI': 'e0c0d998114f0cca7516d12039d2b538',
}
SUPPORTED_MODELS = {
'GPT4': (1, 1),
'GPT4V': (1, 1),
'GPT4V_HIGH': (1, 1),
'GPT4o': (1, 1),
'GPT4o_HIGH': (1, 1),
'GPT4o_MINI': (1, 1),
'XComposer2d5': (1, -1),
'XComposer2_4KHD': (1, -1),
'MiniCPM-Llama3-V-2_5': (1, 5),
'InternVL-Chat-V1-5': (5, 2),
}
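# Each entry maps a model name to (concat_num, column_num): roughly, how many page
# images may be merged into one composite and how many columns to lay them out in;
# column_num == -1 appears to merge all pages into a single image (see dump_image below).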
def __init__(self, dataset, **kwargs):
self.model_list = list(self.SUPPORTED_MODELS.keys())
model_name = kwargs['model']
if not listinstr(self.model_list, model_name):
raise AssertionError("{} doesn't support the evaluation on DUDE.".format(model_name))
super(DUDE, self).__init__(dataset)
self.is_api = True if listinstr(['GPT4'], model_name) else False
self.max_pages = 120
concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
self.concat_num = concat_num
self.column_num = column_num
def prepare_tsv(self, url, file_md5=None):
data_root = LMUDataRoot()
os.makedirs(data_root, exist_ok=True)
file_name = url.split('/')[-1]
data_path = osp.join(data_root, file_name)
if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
pass
else:
warnings.warn('The dataset tsv is not downloaded')
download_file(url, data_path)
return load(data_path)
def dump_image(self, origin_line):
os.makedirs(self.img_root, exist_ok=True)
try:
import fitz
except Exception as e:
logging.critical(f'{type(e)}: {e}')
logging.critical('Please use `pip install pymupdf` to parse PDF files.')
line = origin_line.copy()
if not isinstance(line['image_path'], List):
line['image_path'] = [line['image_path']]
line['image_path'] = line['image_path'][:self.max_pages]
skip_pdf_parse = True
for im_name in line['image_path']:
path = osp.join(self.img_root, im_name)
if not read_ok(path):
skip_pdf_parse = False
break
# Just for compatibility with the zipped loop below: zip(line['image'], line['image_path'])
if skip_pdf_parse:
line['image'] = line['image_path']
else:
pdf_data = base64.b64decode(line['image'])
pdf_file = io.BytesIO(pdf_data)
encoded_images = []
with fitz.open(stream=pdf_file, filetype='pdf') as doc:
doc = doc[:self.max_pages]
for page in doc:
image = page.get_pixmap(dpi=144)
image_file = io.BytesIO(image.tobytes(output='png'))
image = Image.open(image_file)
encoded_image = encode_image_to_base64(image)
encoded_images.append(encoded_image)
line['image'] = encoded_images
print('process {}'.format(line['doc_id']))
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(self.img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
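        # For non-API models, merge the per-page images via concat_images so the page count
        # stays within the model's input limits; the generated file names record concat_num,
        # and column_num == -1 yields a single '_concat_all.jpg'.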
if self.concat_num > 0 and not self.is_api:
concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)
old_tgt_path = tgt_path
assert isinstance(old_tgt_path, list)
if self.column_num != -1:
tgt_path = [
'_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
for i in range(len(concatenated_images))
]
else:
tgt_path = ['_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all.jpg']
for path, concatenated_image in zip(tgt_path, concatenated_images):
if not read_ok(path):
decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
num_images, image_size = len(old_tgt_path), concatenated_image.size
print('concat {} images to a new one with size {}. save at {}'.format(num_images, image_size, path))
return tgt_path
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
logger = get_logger('Evaluation')
model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
if osp.exists(storage):
logger.warning(f'GPT scoring file {storage} already exists, will reuse it in DUDE_eval. ')
else:
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = list()
for model, line in tqdm(tups):
res = MMLongBench_auxeval(model, line)
new_results.append(res)
log_map, res_map, pred_map = {}, {}, {}
all_inds = [line['index'] for line in lines]
for k, v in zip(all_inds, new_results):
log_map[k] = v['log']
res_map[k] = v['res']
pred_map[k] = v['pred']
data['res'] = [res_map[idx] for idx in data['index']]
data['log'] = [log_map[idx] for idx in data['index']]
data['pred'] = [pred_map[idx] for idx in data['index']]
dump(data, storage)
score = DUDE_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
dump(score, score_pth)
logger.info(f'DUDE successfully finished evaluating {eval_file}, results saved in {score_pth}')
logger.info('Score: ')
logger.info(score)
import re
import json
import sympy as sp
import numpy as np
import pandas as pd
from sympy import simplify, Eq, sympify, Pow, pi
from sympy.parsing.latex import parse_latex
import sys
import math
import os
import os.path as osp
import argparse
from .image_base import ImageBaseDataset
from .utils import build_judge
from ..utils import track_progress_rich
from ..smp import load, dump, d2df, toliststr
def preprocess(str1):
if 0 <= str1.find("{") < str1.rfind("}"):
str1 = str1[str1.find("{"): str1.rfind("}") + 1]
    # Convert escaped newlines first, then strip the remaining backslashes
    str2 = str1.replace("\\n", "\n")
    str2 = str2.replace("\\", "")
return str2
def transfer(str1):
if "\u03c0" in str1:
strs = str1.split('\u03c0')
str1 = strs[0]
return float(str1) * np.pi
else:
return float(str1)
def parse_answer(answer, answer_type="multiple choice"):
if answer_type == "float":
if answer.isdigit():
return True, float(answer)
else:
parts = answer.split(' ')
answer = parts[0]
try:
answer = transfer(answer)
return True, answer
except:
return False, None
elif answer_type == "multiple choice":
if len(answer) == 1:
return True, answer.upper()
else:
in_flag = [ch in answer.upper() for ch in 'ABCDE']
if sum(in_flag) == 1:
for ch in 'ABCDE':
if ch in answer.upper():
return True, ch
return False, None
else:
return True, answer
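# DynaMath predictions are expected to be JSON with 'solution' and 'short answer' keys.
# DynaMath_auxeval first tries to read the short answer directly from that JSON; if parsing
# fails, it asks the judge model to reformat the free-form prediction, then compares the
# result with the reference (exact letter for multiple choice, |diff| <= 0.001 for floats,
# case-insensitive substring match otherwise).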
def DynaMath_auxeval(model, line):
pred = line['prediction']
pred = preprocess(pred)
succeed, short_answer = None, None
try:
dj = json.loads(pred, strict=False)
short_answer = dj.get("short answer")
assert short_answer is not None
        succeed, short_answer = parse_answer(short_answer, answer_type=line['answer_type'])
assert succeed
except:
# Failed to parse the JSON, use an auxiliary LLM to get the short answer
if line['answer_type'] == 'multiple choice':
inst = "Output the corresponing choice option, such as 'A', 'B', 'C', 'D', in a single line."
elif line['answer_type'] == 'float':
inst = "Output a three-digit floating-point number in a single line."
else:
inst = (
"Output a short answer in a single line. Any float numbers in the answer "
"should be formatted as three-digit floating-point numbers."
)
prompt = f"Free-form answer: {pred}\nInstruction: {inst}"
response = pred
succeed, short_answer = parse_answer(response, line['answer_type'])
if not succeed:
response = model.generate(prompt)
succeed, short_answer = parse_answer(response, line['answer_type'])
if line['answer_type'] == 'float':
if succeed:
diff = float(short_answer) - float(line['answer'])
if abs(diff) <= 0.001:
return dict(parse=True, extracted=short_answer, correct=True)
else:
return dict(parse=True, extracted=short_answer, correct=False)
else:
return dict(parse=False, extracted=None, correct=False)
elif line['answer_type'] == 'multiple choice':
if succeed:
return dict(parse=True, extracted=short_answer, correct=(short_answer == line['answer']))
else:
if line['answer'] in pred[:3].upper():
return dict(parse=False, extracted=None, correct=True)
else:
return dict(parse=False, extracted=None, correct=False)
else:
if succeed:
return dict(parse=True, extracted=short_answer, correct=(short_answer.lower() in line['answer'].lower()))
else:
            # short_answer is None when parsing failed, so the sample cannot be judged correct
            return dict(parse=False, extracted=None, correct=False)
class Dynamath(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {'DynaMath': 'https://opencompass.openxlab.space/utils/VLMEval/DynaMath.tsv'}
DATASET_MD5 = {'DynaMath': 'b8425ad9a7114571fc9366e013699494'}
GUIDE = """
## Answer Instruction Please provide an answer to the question outlined above. Your response should adhere \
to the following JSON format, which includes two keys: 'solution' and 'short answer'. The 'solution' key can contain \
detailed steps needed to solve the question, and the 'short answer' key should provide a concise response. {INST}
Example of expected JSON response format:
"""
EXAMPLE = {
"solution": "[Detailed step-by-step explanation]",
"short answer": "[Concise Answer]"
}
TEXT_EXAMPLE = json.dumps(EXAMPLE, indent=4)
# Given one data record, return the built prompt (a multi-modal message), can override
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
prompt = f"## Question\n {line['question']}"
if line['answer_type'] == 'multiple choice':
inst = "Provide the corresponing choice option in the 'short answer' key, such as 'A', 'B', 'C', or 'D'."
elif line['answer_type'] == 'float':
inst = "Format the answer as a three-digit floating-point number and provide it in the 'short answer' key."
else:
inst = "Float numbers in the answer should be formatted as three-digit floating-point numbers."
prompt = prompt + self.GUIDE.format(INST=inst) + self.TEXT_EXAMPLE
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def evaluate(self, eval_file, **judge_kwargs):
judge_name = judge_kwargs.pop('model', 'gpt-4o-mini')
model = build_judge(model=judge_name, **judge_kwargs)
suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx')
        score_file = eval_file.replace(f'.{suffix}', f'_{judge_name}_score.csv')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl')
        nproc = judge_kwargs.pop('nproc', 6)
res = load(tmp_file) if os.path.exists(tmp_file) else {}
res = {k: v for k, v in res.items() if v is not None}
model.system_prompt = """\
You are a helpful assistant that helps me to format free-form answers into a short answer according to the instruction.
"""
if not osp.exists(storage):
data = load(eval_file)
lt = len(data)
payloads = [dict(model=model, line=data.iloc[i]) for i in range(lt) if data.iloc[i]['index'] not in res]
keys = [idx for idx in data['index'] if idx not in res]
if len(keys):
results = track_progress_rich(DynaMath_auxeval, payloads, nproc=nproc, save=tmp_file, keys=keys)
for k, r in zip(keys, results):
res[k] = r
data['parse'] = [res[idx]['parse'] for idx in data['index']]
data['extracted'] = [res[idx]['extracted'] for idx in data['index']]
data['correct'] = [res[idx]['correct'] for idx in data['index']]
dump(data, storage)
data = load(storage)
# Calculate Average Accuracy
score_avg = {}
score_avg['Overall'] = np.mean(data['correct'])
subs = set(data['subject'])
for sub in subs:
data_sub = data[data['subject'] == sub]
score_avg[f'Subject-{sub}'] = np.mean(data_sub['correct'])
lvls = set(data['knowledge_level'])
for lvl in lvls:
data_lvl = data[data['knowledge_level'] == lvl]
score_avg[f'Level-{lvl}'] = np.mean(data_lvl['correct'])
# Calculate the Worst Case Accuracy
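        # Every seed question (qid) comes with multiple generated variants; the worst-case
        # setting counts a qid as correct only if all of its variants were answered correctly.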
score_worst = {}
data_worst = data[data['varid'] == 1]
        qid2corr = {qid: True for qid in data_worst['qid']}
lt = len(data)
for i in range(lt):
item = data.iloc[i]
qid2corr[item['qid']] *= item['correct']
data_worst['correct'] = [qid2corr[idx] for idx in data_worst['qid']]
score_worst['Overall'] = np.mean(data_worst['correct'])
subs = set(data_worst['subject'])
for sub in subs:
data_sub = data_worst[data_worst['subject'] == sub]
score_worst[f'Subject-{sub}'] = np.mean(data_sub['correct'])
lvls = set(data_worst['knowledge_level'])
for lvl in lvls:
data_lvl = data_worst[data_worst['knowledge_level'] == lvl]
score_worst[f'Level-{lvl}'] = np.mean(data_lvl['correct'])
d1 = {'Setting': 'Average'}
d1.update(score_avg)
d2 = {'Setting': 'Worst Case'}
d2.update(score_worst)
score = pd.concat([d2df(d1), d2df(d2)], ignore_index=True)
dump(score, score_file)
return score
import pandas as pd
from abc import abstractmethod
from ..smp import *
def img_root_map(dataset):
if 'MM_NIAH' in dataset:
return 'MMNIAH'
if 'CRPE' in dataset:
return 'CRPE'
if 'OCRVQA' in dataset:
return 'OCRVQA'
if 'COCO_VAL' == dataset:
return 'COCO'
if 'MMMU' in dataset:
return 'MMMU'
if "QSpatial" in dataset:
return "QSpatial"
mmbench_root_map = {
'MMBench_DEV_EN': 'MMBench', 'MMBench_TEST_EN': 'MMBench',
'MMBench_DEV_CN': 'MMBench', 'MMBench_TEST_CN': 'MMBench',
'MMBench': 'MMBench', 'MMBench_CN': 'MMBench',
'MMBench_DEV_EN_V11': 'MMBench_V11', 'MMBench_TEST_EN_V11': 'MMBench_V11',
'MMBench_DEV_CN_V11': 'MMBench_V11', 'MMBench_TEST_CN_V11': 'MMBench_V11',
'MMBench_V11': 'MMBench', 'MMBench_CN_V11': 'MMBench',
}
if dataset in mmbench_root_map:
return mmbench_root_map[dataset]
return dataset
class ImageBaseDataset:
MODALITY = 'IMAGE'
DATASET_URL = {}
DATASET_MD5 = {}
def __init__(self, dataset='MMBench', skip_noimg=True):
ROOT = LMUDataRoot()
# You can override this variable to save image files to a different directory
self.dataset_name = dataset
self.img_root = osp.join(ROOT, 'images', img_root_map(dataset))
data = self.load_data(dataset)
self.skip_noimg = skip_noimg
if skip_noimg and 'image' in data:
data = data[~pd.isna(data['image'])]
data['index'] = [str(x) for x in data['index']]
self.meta_only = True
# The image field can store the base64 encoded image or another question index (for saving space)
if 'image' in data:
data['image'] = [str(x) for x in data['image']]
image_map = {x: y for x, y in zip(data['index'], data['image'])}
for k in image_map:
if len(image_map[k]) <= 64:
idx = image_map[k]
assert idx in image_map and len(image_map[idx]) > 64
image_map[k] = image_map[idx]
images = [toliststr(image_map[k]) for k in data['index']]
data['image'] = [x[0] if len(x) == 1 else x for x in images]
self.meta_only = False
if 'image_path' in data:
paths = [toliststr(x) for x in data['image_path']]
data['image_path'] = [x[0] if len(x) == 1 else x for x in paths]
if np.all([istype(x, int) for x in data['index']]):
data['index'] = [int(x) for x in data['index']]
self.data = data
self.post_build(dataset)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
return dict(self.data.iloc[idx])
def prepare_tsv(self, url, file_md5=None):
data_root = LMUDataRoot()
os.makedirs(data_root, exist_ok=True)
update_flag = False
file_name = url.split('/')[-1]
data_path = osp.join(data_root, file_name)
if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
pass
else:
warnings.warn('The dataset tsv is not downloaded')
download_file(url, data_path)
update_flag = True
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
def dump_image(self, line):
os.makedirs(self.img_root, exist_ok=True)
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(self.img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
def display(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
assert isinstance(line, pd.Series) or isinstance(line, dict)
mmqa_display(line)
# Return a list of dataset names that are supported by this class, can override
@classmethod
def supported_datasets(cls):
return list(cls.DATASET_URL)
# Given the dataset name, return the dataset as a pandas dataframe, can override
def load_data(self, dataset):
url = self.DATASET_URL[dataset]
file_md5 = self.DATASET_MD5[dataset] if dataset in self.DATASET_MD5 else None
return self.prepare_tsv(url, file_md5)
# Post built hook, will be called after the dataset is built, can override
def post_build(self, dataset):
pass
# Given one data record, return the built prompt (a multi-modal message), can override
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=question))
return msgs
# Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
@abstractmethod
def evaluate(self, eval_file, **judge_kwargs):
pass
from .image_base import ImageBaseDataset
from ..smp import *
class COCO_Caption_Scorer():
def __init__(self, ref, gt):
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
self.ref = ref
self.gt = gt
print('setting up scorers...')
self.scorers = [
(Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']),
(Rouge(), 'ROUGE_L'),
(Cider(), 'CIDEr'),
]
def compute_scores(self):
total_scores = {}
for scorer, method in self.scorers:
print('computing %s score...' % (scorer.method()))
score, scores = scorer.compute_score(self.gt, self.ref)
if isinstance(method, list):
for sc, scs, m in zip(score, scores, method):
print('%s: %0.3f' % (m, sc * 100))
total_scores['Bleu'] = [x * 100 for x in score]
else:
print('%s: %0.3f' % (method, score * 100))
total_scores[method] = score * 100
print('*****DONE*****')
for key, value in total_scores.items():
print('{}:{}'.format(key, value))
return total_scores
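# Both `ref` and `gt` map a sample id to a list of caption strings, e.g. (illustrative values)
#   ref = {'0': ['a dog runs on the beach']}
#   gt  = {'0': ['a dog running along the shore', 'a puppy on the sand']}
# ImageCaptionDataset.evaluate below builds them from the 'prediction' and 'answer' columns.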
class ImageCaptionDataset(ImageBaseDataset):
TYPE = 'Caption'
DATASET_URL = {
'COCO_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/COCO_VAL.tsv',
}
DATASET_MD5 = {
'COCO_VAL': '72a5079dead060269ac222c5aa5128af',
}
def load_data(self, dataset):
data = super().load_data(dataset)
if 'question' not in data:
data['question'] = [(
'Please describe this image in general. Directly provide the description, '
'do not include prefix like "This image depicts". '
)] * len(data)
return data
# It returns a dictionary of scores
@classmethod
def evaluate(self, eval_file, **kwargs):
data = load(eval_file)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
ref, gt = {}, {}
for i, line in enumerate(lines):
ref[str(i)] = [str(line['prediction'])]
gt[str(i)] = eval(line['answer'])
scorer = COCO_Caption_Scorer(ref, gt)
coco_caption_score_dict = scorer.compute_scores()
score_pth = eval_file.replace('.xlsx', '_score.json')
dump(coco_caption_score_dict, score_pth)
return coco_caption_score_dict
import warnings
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..smp import *
import pandas as pd
MMMB_URLS = {
'MMMB_ar': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_ar.tsv',
'MMMB_cn': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_cn.tsv',
'MMMB_en': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_en.tsv',
'MMMB_pt': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_pt.tsv',
'MMMB_ru': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_ru.tsv',
'MMMB_tr': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_tr.tsv',
}
MTL_MMBench_URLS = {
'MMBench_dev_ar': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_ar.tsv',
'MMBench_dev_cn': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_cn.tsv',
'MMBench_dev_en': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_en.tsv',
'MMBench_dev_pt': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_pt.tsv',
'MMBench_dev_tr': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_tr.tsv',
'MMBench_dev_ru': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_ru.tsv',
}
MMMB_MD5 = {
'MMMB_ar': 'f3a18b6385f1d9701840aa42de27aead', 'MMMB_cn': '13ed82fa89730037292fcaa27f08f430',
'MMMB_en': '1cd781a71ec5a2983c090b84105d6a01', 'MMMB_pt': '548ea2b3bb2da991790386f0015d30d1',
'MMMB_ru': 'ce1cc8a0533425ab0d86b326ebfc2984', 'MMMB_tr': '0733739d43090327975294292bc5cd67'
}
MTL_MMBench_MD5 = {
'MMBench_dev_ar': '4271b4a0d0200e1a86380a878e0d64a4', 'MMBench_dev_cn': '2ed5135326fed02c8e51ea50dda8222f',
'MMBench_dev_en': 'd9ab776fc018b3d45785e9a5c23431c2', 'MMBench_dev_pt': '4ddfbcd27ef12444b908c03831cd0295',
'MMBench_dev_tr': '4fab39d501389d3d6cc90264bb708f11', 'MMBench_dev_ru': '5ba1171ff2e68f80637bf78349e402a5'
}
class ImageMCQDataset(ImageBaseDataset):
TYPE = 'MCQ'
DATASET_URL = {
# MMBench v1.0
'MMBench_DEV_EN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_EN.tsv',
'MMBench_TEST_EN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_EN.tsv',
'MMBench_DEV_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_CN.tsv',
'MMBench_TEST_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_CN.tsv',
'MMBench': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench.tsv', # Internal
'MMBench_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_CN.tsv', # Internal
# MMBench v1.1
'MMBench_DEV_EN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_EN_V11.tsv',
'MMBench_TEST_EN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_EN_V11.tsv',
'MMBench_DEV_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_CN_V11.tsv',
'MMBench_TEST_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_CN_V11.tsv',
'MMBench_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_V11.tsv', # Internal
'MMBench_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_CN_V11.tsv', # Internal
# SEEDBench Series
'SEEDBench_IMG': 'https://opencompass.openxlab.space/utils/benchmarks/SEEDBench/SEEDBench_IMG.tsv',
'SEEDBench2': 'https://huggingface.co/datasets/VLMEval/SEEDBench2/resolve/main/SEEDBench2.tsv',
'SEEDBench2_Plus': 'https://opencompass.openxlab.space/utils/benchmarks/SEEDBench/SEEDBench2_Plus.tsv',
# ScienceQA Series
'ScienceQA_VAL': 'https://opencompass.openxlab.space/utils/benchmarks/ScienceQA/ScienceQA_VAL.tsv',
'ScienceQA_TEST': 'https://opencompass.openxlab.space/utils/benchmarks/ScienceQA/ScienceQA_TEST.tsv',
# MMT-Bench
'MMT-Bench_ALL_MI': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_ALL_MI.tsv',
'MMT-Bench_ALL': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_ALL.tsv',
'MMT-Bench_VAL_MI': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_VAL_MI.tsv',
'MMT-Bench_VAL': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_VAL.tsv',
# AesBench
'AesBench_VAL': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_VAL.tsv',
'AesBench_TEST': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_TEST.tsv',
# Q-Bench1
'Q-Bench1_VAL': 'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_VAL.tsv',
'Q-Bench1_TEST': 'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_TEST.tsv',
# A-Bench
'A-Bench_VAL': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_VAL.tsv',
'A-Bench_TEST': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_TEST.tsv',
# R-Bench
'R-Bench-Dis': 'https://huggingface.co/datasets/lcysyzxdxc/R-Bench/blob/main/R-bench-dis.tsv',
'R-Bench-Ref': 'https://huggingface.co/datasets/lcysyzxdxc/R-Bench/blob/main/R-bench-ref.tsv',
# Other Benchmarks
'CCBench': 'https://opencompass.openxlab.space/utils/VLMEval/CCBench.tsv',
'AI2D_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST.tsv',
'AI2D_TEST_NO_MASK': 'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST_NO_MASK.tsv',
'MMStar': 'https://opencompass.openxlab.space/utils/VLMEval/MMStar.tsv',
'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv',
'MLLMGuard_DS': 'https://opencompass.openxlab.space/utils/VLMEval/MLLMGuard_DS.tsv',
'BLINK': 'https://opencompass.openxlab.space/utils/VLMEval/BLINK.tsv',
'TaskMeAnything_v1_imageqa_random': (
'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random/'
'resolve/main/TaskMeAnything-v1-imageqa-random.tsv'
),
'A-OKVQA': 'https://huggingface.co/datasets/Allen8/A-OKVQA/resolve/main/a-okvqa.tsv',
'WorldMedQA-V': 'https://opencompass.openxlab.space/utils/VLMEval/WorldMedQA-V.tsv',
'VisOnlyQA-VLMEvalKit': (
'https://huggingface.co/datasets/ryokamoi/VisOnlyQA_Eval_Real/'
'resolve/main/visonlyqa_vlmevalkit.tsv'
),
}
DATASET_MD5 = {
# MMBench v1.0
'MMBench_DEV_EN': 'b6caf1133a01c6bb705cf753bb527ed8',
'MMBench_TEST_EN': '6939fadb0ce626fefc0bdc9c64efc528',
'MMBench_DEV_CN': '08b8fc3324a5ed74155350f57be69fbd',
'MMBench_TEST_CN': '7e1239baf0ee4c8b513e19705a0f317e',
'MMBench': '4115aea3383f3dd0083be6a633e0f820', # Internal Only
'MMBench_CN': '2e053ffc90ea598b1feae13c36dc13ee', # Internal Only
# MMBench v1.1
'MMBench_DEV_EN_V11': '30c05be8f2f347a50be25aa067248184',
'MMBench_TEST_EN_V11': '26f0f15381a21720255091d3e0316ce6',
'MMBench_DEV_CN_V11': '593f9b5f6bea453d870a798b34ae4f37',
'MMBench_TEST_CN_V11': '74bbe4556dac745613c7cbe5ad787050',
'MMBench_V11': 'b9276414f57af1308dcc4d0cd9b42e7c', # Internal Only
'MMBench_CN_V11': '95f6980dd1b4de38e3cbffe0305a3f25', # Internal Only
# SEEDBench
'SEEDBench_IMG': '68017231464752261a2526d6ca3a10c0',
'SEEDBench2': '4ec15cf864c4f16274112284f531813e',
'SEEDBench2_Plus': 'e32d3216dc4f452b0fe497a52015d1fd',
# ScienceQA
'ScienceQA_VAL': '96320d05e142e585e7204e72affd29f3',
'ScienceQA_TEST': 'e42e9e00f9c59a80d8a5db35bc32b71f',
# MMT-Bench
'MMT-Bench_ALL_MI': '5272157097e19cdd7cb41e412ab3b7c7',
'MMT-Bench_ALL': 'b273a2f4c596fe4f2605de0494cd632f',
'MMT-Bench_VAL_MI': 'c7d7b998eb5cd9aa36c7d4f721472462',
'MMT-Bench_VAL': '8dd4b730f53dbf9c3aed90ca31c928e0',
# AesBench
'AesBench_VAL': '3edb0c319e9187aa0b97fe7a11700a8c',
'AesBench_TEST': '58b1f7ba2cc32e1d68896d6ee716bbf8',
# Q-Bench1
'Q-Bench1_VAL': '837bdb6cd2da571713543462815187b7',
'Q-Bench1_TEST': '15e759bfd58c9d5f30b23a317d347153',
# A-Bench
'A-Bench_VAL': '218563ec50d34bb336c814143a5bb9c1',
'A-Bench_TEST': '567013fb033a20cf23f51d8e865bd16c',
# R-Bench
'R-Bench-Dis': 'd6e961dbfc43350688af2560226830b4',
'R-Bench-Ref': '270c1cb555acb523f3fdb178ed57021d',
# Other Benchmarks
'CCBench': 'f5dde47f24dc5a6fb6e595b409b466ac',
'AI2D_TEST': '0f593e0d1c7df9a3d69bf1f947e71975',
'AI2D_TEST_NO_MASK': 'fd8f463634d4fe9fbd23b876e8eea5be',
'MMStar': 'e1ecd2140806c1b1bbf54b43372efb9e',
'RealWorldQA': '92321028d2bc29040284b6674721e48f',
'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f',
'BLINK': '3b6649b6a662184ea046908e5506260e',
'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889',
'WorldMedQA-V': '441e63875e30c87f5750528b57b41285',
"VisOnlyQA-VLMEvalKit": 'cf460a31d2acb8d3a7cecd0e69298bfa',
}
DATASET_URL.update(MMMB_URLS)
DATASET_URL.update(MTL_MMBench_URLS)
DATASET_MD5.update(MMMB_MD5)
DATASET_MD5.update(MTL_MMBench_MD5)
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
prompt += 'Please select the correct answer from the options above. \n'
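        # A typical MCQ prompt built above looks like:
        #   Hint: <hint>              (only when a hint is present)
        #   Question: <question>
        #   Options:
        #   A. <option A>
        #   B. <option B>
        #   Please select the correct answer from the options above.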
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import report_acc, report_acc_MMT, mcq_circular_eval, mcq_vanilla_eval
# assert dataset is not None
dataset_map = {
'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11',
'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11'
}
dataset = self.dataset_name
if dataset in dataset_map:
dataset = dataset_map[dataset]
nproc = judge_kwargs.pop('nproc', 4)
circular = False
if listinstr(['mmbench', 'ccbench'], dataset.lower()):
data = load(eval_file)
data['index'] = [int(x) for x in data['index']]
dump(data, eval_file)
circular = True
suffix = eval_file.split('.')[-1]
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
# If not choice label, then use lower case
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
meta = self.data
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
)
if circular:
data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name)
else:
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
# load split
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
# May have different report acc functions for different datasets
if 'MMT' in dataset:
acc = report_acc_MMT(data)
else:
acc = report_acc(data)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(acc, score_file)
if dataset == 'AesBench_VAL':
warnings.warn('Note that AesBench VAL is just a toy version of AesBench TEST. For full results, \
please evaluate on AesBench TEST. The AesBench TEST dataset is more than 20 times \
larger than the VAL dataset and the leaderboard results are based on AesBench TEST.')
if dataset == 'VisOnlyQA-VLMEvalKit':
warnings.warn('Note that the results on VisOnlyQA-VLMEvalKit are different from the results on \
the original VisOnlyQA. VisOnlyQA-VLMEvalKit does not include the \
chemistry__shape_multi split and uses a different evaluation prompt. Please \
explicitly specify the version of the dataset when you report results.')
return acc
class MMMUDataset(ImageMCQDataset):
DATASET_URL = {
'MMMU_DEV_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv',
'MMMU_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_TEST.tsv',
}
DATASET_MD5 = {
'MMMU_DEV_VAL': '521afc0f3bf341e6654327792781644d',
'MMMU_TEST': 'c19875d11a2d348d07e5eb4bdf33166d',
}
@staticmethod
def split_MMMU(msgs):
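        # MMMU questions reference their images inline with tags such as '<image 1>'. The text
        # is split at those tags and the referenced images are interleaved, e.g.
        # 'Compare <image 1> with <image 2>.' -> [text, image_1, text, image_2, text].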
text, images = None, []
for s in msgs:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
assert text is None
text = s['value']
text_segs = text.split('<image ')
if len(text_segs) == 1:
return msgs
segs = [dict(type='text', value=text_segs[0])]
for i, seg in enumerate(text_segs):
if i == 0:
continue
assert istype(seg[0], int) and seg[1] == '>'
image_idx = int(seg[0]) - 1
segs.append(dict(type='image', value=images[image_idx]))
segs.append(dict(type='text', value=seg[2:]))
return segs
def build_prompt(self, line):
msgs = super().build_prompt(line)
msgs = self.split_MMMU(msgs)
return msgs
class MUIRDataset(ImageMCQDataset):
DATASET_URL = {
'MUIRBench': 'http://opencompass.openxxlab.com/utils/VLMEval/MUIRBench.tsv'
}
DATASET_MD5 = {
'MUIRBench': '2e5e6fd7699761b08a7cb3ab8c0c2ec8'
}
@staticmethod
def split_MUIR(msgs):
text, images = None, []
# Separate images and text from msgs
for s in msgs:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
assert text is None # Ensure only one text entry is expected
text = s['value']
# Split text by <image> tags
text_segs = text.split('<image>')
# Initialize the segments list
segs = []
# Iterate through the text segments and images
for i, seg in enumerate(text_segs):
# Append the image if this is not the first segment and there are still images left
if i > 0 and i - 1 < len(images):
segs.append(dict(type='image', value=images[i - 1]))
# Append the text segment (if it's non-empty)
if len(seg) > 0:
segs.append(dict(type='text', value=seg))
return segs
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
        options_prompt = '\n'.join([f'{key}. {item}' for key, item in options.items()])
prompt = ''
prompt += f'{question}\n'
if len(options):
prompt += options_prompt
prompt += "\nAnswer with the option's letter from the given choices directly."
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
msgs = self.split_MUIR(msgs)
return msgs
class GMAIMMBenchDataset(ImageMCQDataset):
DATASET_URL = {
'GMAI-MMBench_VAL': 'https://huggingface.co/datasets/VLMEval/GMAI-MMBench/resolve/main/GMAI-MMBench_VAL.tsv',
'GMAI_mm_bench_TEST_part_1': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_1.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_2': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_2.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_3': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_3.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_4': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_4.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_5': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_5.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_6': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_6.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_7': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_7.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_8': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_8.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_9': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_9.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_10': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_10.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_11': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_11.tsv', # noqa: E501
}
DATASET_MD5 = {
'GMAI-MMBench_VAL': '254bd581627866f1c499d3d6b4422324',
'GMAI_mm_bench_TEST_part_1': '900d735231230a63f4ed45665c078ef4',
'GMAI_mm_bench_TEST_part_2': '1b27ab621386945d7e4a765ad2d22b0e',
'GMAI_mm_bench_TEST_part_3': '44bdc2b6267dd505d529b8cad06f0fb2',
'GMAI_mm_bench_TEST_part_4': '5a04a04fcac9f1466709f242fdb80acb',
'GMAI_mm_bench_TEST_part_5': 'c70baf8909eda9af0ddeab275c721336',
'GMAI_mm_bench_TEST_part_6': '825abc39596b644dead9350d0cfa3b96',
'GMAI_mm_bench_TEST_part_7': 'defb8aed2fb77365a76b6b9abd6a2701',
'GMAI_mm_bench_TEST_part_8': 'ff490d60b85f2bb0abb67a435b298c65',
'GMAI_mm_bench_TEST_part_9': 'ff67c86f40da93b09139ac1d1ba5dc6b',
'GMAI_mm_bench_TEST_part_10': '3dae94627b9ac0fe00180d4780fbf6dc',
'GMAI_mm_bench_TEST_part_11': 'd08dc813f0eb6bbab63cae2a9d113c4b',
}
@classmethod
def supported_datasets(cls):
return ['GMAI-MMBench_VAL', 'GMAI-MMBench_TEST']
def load_data(self, dataset):
if dataset == 'GMAI-MMBench_VAL':
data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL'):
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
elif dataset == 'GMAI-MMBench_TEST':
dfs = []
for part_num in range(1, 12):
part_name = f'GMAI_mm_bench_TEST_part_{part_num}'
url = self.DATASET_URL[part_name]
file_md5 = self.DATASET_MD5.get(part_name)
tsv_path = osp.join(LMUDataRoot(), f'{part_name}.tsv')
if not osp.exists(tsv_path) or (file_md5 and md5(tsv_path) != file_md5):
download_file(url, filename=tsv_path)
local_path = tsv_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL'):
from ..tools import LOCALIZE
LOCALIZE(tsv_path, local_path)
tsv_path = local_path
                # Load this part of the data
df = load(tsv_path)
dfs.append(df)
            # Merge all parts
data = pd.concat(dfs, ignore_index=True)
return data
else:
raise ValueError(f"未知的数据集:{dataset}")
def report_acc_by_groups(self, df, group_column):
res = defaultdict(list)
# Check for the 'split' column
if 'split' in df:
splits = list(set(df['split']))
res['split'] = splits
else:
df['split'] = ['none'] * len(df)
res['split'] = ['none']
res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']]
if group_column not in df:
raise ValueError(f"Column '{group_column}' not found in dataframe.") # noqa: E713
abilities = list(set(df[group_column]))
abilities = ['None' if isinstance(ab, float) and pd.isna(ab) else ab for ab in abilities]
abilities.sort()
for ab in abilities:
ab_name = ab
sub_df = df[df[group_column] == ab]
res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]
return pd.DataFrame(res)
def evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import report_acc, mcq_vanilla_eval
nproc = judge_kwargs.pop('nproc', 4)
suffix = eval_file.split('.')[-1]
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
# If not choice label, then use lower case
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
meta = self.data
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
)
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
# load split
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
acc = report_acc(data)
for group_col in ['clinical vqa task', 'department', 'perceptual granularity']:
acc_grouped = self.report_acc_by_groups(data, group_col)
score_file_grouped = eval_file.replace(f'.{suffix}', f'_{group_col}_acc.csv')
dump(acc_grouped, score_file_grouped)
return acc
class MMERealWorld(ImageMCQDataset):
TYPE = 'MMERealWorld'
DATASET_MD5 = {
'MME-RealWorld': '271c33ec814c39533c467ec6fb8a6f36',
'MME-RealWorld-Lite': '4c17057d7d3b6c4a0d4397c3dae0881c',
'MME-RealWorld-CN': 'daaa763d52a760a38606d5dedb3fe444',
}
SYS = {
'MME-RealWorld': (
'Select the best answer to the above multiple-choice question based on the image. '
'Respond with only the letter (A, B, C, D, or E) of the correct option. \n'
'The best answer is:'
),
'MME-RealWorld-Lite': (
'Select the best answer to the above multiple-choice question based on the image. '
'Respond with only the letter (A, B, C, D, or E) of the correct option. \n'
'The best answer is:'
),
'MME-RealWorld-CN': (
'根据图像选择上述多项选择题的最佳答案。只需回答正确选项的字母(A, B, C, D 或 E)。\n'
'最佳答案为:'
),
}
@classmethod
def supported_datasets(cls):
return ['MME-RealWorld', 'MME-RealWorld-CN', 'MME-RealWorld-Lite',]
def load_data(
self, dataset="MME-RealWorld", repo_id="yifanzhang114/MME-RealWorld-Base64"
):
def check_integrity(pth):
data_file = osp.join(pth, f"{dataset}.tsv")
if not os.path.exists(data_file):
return False
if md5(data_file) != self.DATASET_MD5[dataset]:
return False
return True
def generate_tsv(pth):
tsv_file = os.path.join(pth, f"{dataset}.tsv")
if os.path.exists(tsv_file):
print(f"{tsv_file} already exists.")
return
json_dir = os.path.join(pth, dataset)
json_files = [f for f in os.listdir(json_dir) if f.endswith(".json")]
data_list = []
for json_file in json_files:
with open(os.path.join(json_dir, json_file), "r") as f:
data = json.load(f)
for item in tqdm(data):
choice_prompt = (
"The choices are listed below:\n"
if dataset in ["MME-RealWorld", "MME-RealWorld-Lite"]
else "选项如下所示:\n"
)
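                    # Each raw option string is assumed to carry a 4-character label prefix
                    # (e.g. '(A) '), which is stripped via the [4:] slices below.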
data_list.append(
{
"index": item["index"],
"image": item["image"],
"question": item["question"],
"multi-choice options": choice_prompt
+ "\n".join(item["multi-choice options"]),
"A": item["multi-choice options"][0][4:],
"B": item["multi-choice options"][1][4:],
"C": item["multi-choice options"][2][4:],
"D": item["multi-choice options"][3][4:],
"E": item["multi-choice options"][4][4:],
"answer": item["answer"],
"category": item["category"],
"l2-category": item["l2-category"],
}
)
df = pd.DataFrame(data_list)
df.to_csv(tsv_file, sep="\t", index=False)
print(f"TSV file saved to {tsv_file}")
# Check if dataset is cached and has integrity
if dataset == "MME-RealWorld-Lite":
url = 'https://huggingface.co/datasets/yifanzhang114/MME-RealWorld-Base64/resolve/main/mme_realworld_lite.tsv' # noqa: E501
file_md5 = (
self.DATASET_MD5[dataset] if dataset in self.DATASET_MD5 else None
)
datas = self.prepare_tsv(url, file_md5)
choice_prompt = "The choices are listed below:\n"
for index, item in datas.iterrows():
options = eval(item["multi-choice options"])
datas.loc[index, "multi-choice options"] = choice_prompt + "\n".join(
options
)
datas.loc[index, "A"] = options[0][4:]
datas.loc[index, "B"] = options[1][4:]
datas.loc[index, "C"] = options[2][4:]
datas.loc[index, "D"] = options[3][4:]
datas.loc[index, "E"] = options[4][4:]
return datas
update_flag = False
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
print(f"Using cached dataset from {cache_path}")
else:
from huggingface_hub import snapshot_download
# Download or find the dataset path
dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
generate_tsv(dataset_path)
update_flag = True
data_path = os.path.join(dataset_path, f"{dataset}.tsv")
if file_size(data_path, "GB") > 1:
local_path = data_path.replace(".tsv", "_local.tsv")
if (
not osp.exists(local_path)
or os.environ.get("FORCE_LOCAL", None)
or update_flag
):
from vlmeval.tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
def post_build(self, dataset):
self.TYPE = 'MMERealWorld'
# Given one data record, return the built prompt (a multi-modal message), can override
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
choice_prompt = line['multi-choice options'] + '\n'
question += ' ' + choice_prompt + self.SYS[self.dataset_name]
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=question))
return msgs
# It returns a dictionary
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import extract_characters_regex, get_dimension_rating
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
FAIL_MSG = 'Failed to obtain answer via API.'
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
tgt_file = eval_file.replace('.xlsx', '_rating.json')
score_file = eval_file.replace('.xlsx', '_score.xlsx')
if not osp.exists(score_file):
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if FAIL_MSG not in v}
data = load(eval_file)
cnt_rejected = 0
data_un = data[~pd.isna(data['prediction'])]
for idx in data['index']:
ans = data.loc[data['index'] == idx, 'answer'].values[0]
pred = data.loc[data['index'] == idx, 'prediction'].values[0]
extract_pred = extract_characters_regex(pred)
if extract_pred == '':
cnt_rejected += 1
data.loc[data['index'] == idx, 'score'] = 0
else:
data.loc[data['index'] == idx, 'score'] = int(extract_pred == ans)
print(
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
f'failed to obtain the score for another {cnt_rejected} questions. '
f'Those questions will be counted as 0 score in ALL rating.'
)
dump(data, score_file)
rating = get_dimension_rating(score_file)
dump(rating, tgt_file)
return rating
class HRBenchDataset(ImageMCQDataset):
DATASET_URL = {
'HRBench4K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_4k.tsv',
'HRBench8K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_8k.tsv',
}
DATASET_MD5 = {
'HRBench4K': 'f6b041b03d49543494b8a56d2e35be65',
'HRBench8K': '274c9c7f89329b804a4723178a00219c',
}
def evaluate(self, eval_file, **judge_kwargs):
assert os.path.exists(eval_file), '{} does not exist!'.format(eval_file)
from .utils.multiple_choice import mcq_vanilla_eval
from .utils.hrbench import report_acc_hrbench
nproc = judge_kwargs.pop('nproc', 4)
suffix = eval_file.split('.')[-1]
        model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
# If not choice label, then use lower case
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
meta = self.data
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
if osp.exists(score_file):
acc = load(score_file)
return acc
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
acc = report_acc_hrbench(data)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(acc, score_file)
return acc
class CustomMCQDataset(ImageMCQDataset):
def load_data(self, dataset):
data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
class NaturalBenchDataset(ImageMCQDataset):
DATASET_URL = {
'NaturalBenchDataset': (
'https://huggingface.co/datasets/BaiqiL/'
'NaturalBench/resolve/main/NaturalBenchDataset.tsv'
),
}
DATASET_MD5 = {
'NaturalBenchDataset':'dbe25b044bc35696426381e9ba4fe930',
}
def build_prompt(self, line):
SUFFIX_FOR_VQA = {
"yes_no": "Please answer Yes or No.",
"multiple_choice": "Please output the letter corresponding to the correct option."
}
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
prompt = f'{question} {SUFFIX_FOR_VQA[line["type"]]}'
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def evaluate(self, eval_file, **judge_kwargs):
from .utils.naturalbench import extract_answer, get_scores
data = load(eval_file)
data = data.sort_values(by='index')
predictions = [str(x) for x in data['prediction']]
answers = [str(x) for x in data['answer']]
indexs = [str(x) for x in data['index']]
meta = self.data
types = [str(x) for x in meta['type']]
results = {}
assert len(predictions) == len(answers) == len(indexs) == len(types) == (1900 * 4)
number_answered_samples = len(predictions) // 4
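        # NaturalBench groups every four consecutive rows into one sample: two questions
        # (q0, q1), each paired with two images (i0, i1), so predictions are consumed in
        # blocks of four below.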
for i in range(number_answered_samples):
results[i] = {
"q0_i0": extract_answer(predictions[i * 4], types[i * 4]),
"q0_i1": extract_answer(predictions[i * 4 + 1], types[i * 4 + 1]),
"q1_i0": extract_answer(predictions[i * 4 + 2], types[i * 4 + 2]),
"q1_i1": extract_answer(predictions[i * 4 + 3], types[i * 4 + 3])
}
scores = get_scores(results)
print(scores)
score_file = 'NaturalBench_acc.csv'
df = pd.DataFrame(list(scores.items()), columns=['Metric', 'Score'])
dump(df, score_file)
return scores
from .image_base import ImageBaseDataset
from .utils.judge_util import build_judge
from ..smp import *
from ..utils import track_progress_rich
class ImageMTDataset(ImageBaseDataset):
TYPE = 'MT'
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
questions = toliststr(line['question'])
if 'answer' in line:
answers = toliststr(line['answer'])
else:
answers = [''] * len(questions)
assert len(questions) == len(answers)
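        # Each question may embed '<ImageHere>' placeholders; images from tgt_path are consumed
        # in order to fill them, and every (question, answer) pair becomes a user / assistant
        # turn in the returned multi-turn dialogue.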
dlgs, pics_number = [], 0
for i in range(len(questions)):
q, a = questions[i], answers[i]
if '<ImageHere>' in q:
content = []
tag_number = q.count('<ImageHere>')
images = tgt_path[pics_number: pics_number + tag_number]
pics_number += tag_number
q_split = q.split('<ImageHere>')
                for j in range(tag_number):
                    qsp, im = q_split[j], images[j]
if qsp != '':
content.append(dict(type='text', value=qsp))
content.append(dict(type='image', value=im))
if q_split[-1] != '':
content.append(dict(type='text', value=q_split[-1]))
else:
content = [dict(type='text', value=q)]
dlgs.append(dict(role='user', content=content))
assert '<ImageHere>' not in a, 'We currently do not support images in the answer. '
content = [dict(type='text', value=a)]
dlgs.append(dict(role='assistant', content=content))
return dlgs
class MMDUDataset(ImageMTDataset):
DATASET_URL = {'MMDU': 'https://opencompass.openxlab.space/utils/VLMEval/MMDU.tsv'}
DATASET_MD5 = {'MMDU': '848b635a88a078f49aebcc6e39792061'}
DIMS = [
'Creativity', 'Richness', 'Visual Perception', 'Logical Coherence',
'Answer Accuracy', 'Image Relationship Understanding', 'Overall Score'
]
    def calculate_metric(self, ans):
all = defaultdict(lambda: 0)
tot = defaultdict(lambda: 0)
valid = defaultdict(lambda: 0)
for k in ans:
res = ans[k]['res']
assert isinstance(res, pd.DataFrame)
lt = len(res)
for i in range(lt):
line = res.iloc[i]
for k in self.DIMS:
tot[k] += 1
if k in line and line[k] is not None:
try:
score = int(line[k])
score = np.clip(score, 0, 10)
all[k] += score
valid[k] += 1
except Exception as e:
print(f'Failed to parse the score: {str(e)}')
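        # Two normalizations are reported: 'all' divides by every judged sample (unparsable
        # scores count as 0), 'valid' divides only by samples whose score could be parsed;
        # both are scaled from the judge's 0-10 range to 0-100.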
sp1 = {'set': 'all'}
sp1.update({k: all[k] / tot[k] * 10 for k in self.DIMS})
sp2 = {'set': 'valid'}
sp2.update({k: all[k] / valid[k] * 10 for k in self.DIMS})
return pd.DataFrame([sp1, sp2])
def evaluate(self, eval_file, **judge_kwargs):
suffix = eval_file.split('.')[-1]
model = judge_kwargs['model']
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
nproc = judge_kwargs.pop('nproc', 4)
data = load(eval_file)
model = judge_kwargs.pop('model', 'gpt-4o')
judge_model = build_judge(model=model, **judge_kwargs)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(judge_model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
from .utils.mmdu import mmdu_score
if len(indices):
new_results = track_progress_rich(
mmdu_score,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,)
ans = load(tmp_file)
for k, v in zip(indices, new_results):
assert k in ans
        metric = self.calculate_metric(ans)
dump(metric, score_file)
return metric
import os
import re
import tempfile
from functools import partial
import pandas as pd
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..smp import *
from ..utils import track_progress_rich
class ImageVQADataset(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'OCRVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TEST.tsv',
'OCRVQA_TESTCORE': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TESTCORE.tsv',
'TextVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/TextVQA_VAL.tsv',
'DocVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv',
'DocVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_TEST.tsv',
'InfoVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_VAL.tsv',
'InfoVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_TEST.tsv',
'ChartQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/ChartQA_TEST.tsv',
'GQA_TestDev_Balanced': 'https://opencompass.openxlab.space/utils/VLMEval/GQA_TestDev_Balanced.tsv',
}
DATASET_MD5 = {
'OCRVQA_TEST': 'ca46a6d74b403e9d6c0b670f6fc00db9',
'OCRVQA_TESTCORE': 'c5239fe77db8bdc1f2ad8e55e0d1fe97',
'TextVQA_VAL': 'b233b31f551bbf4056f2f955da3a92cd',
'DocVQA_VAL': 'd5ee77e1926ff10690d469c56b73eabf',
'DocVQA_TEST': '6a2f28cac26ef2d3447374e8c6f6c8e9',
'InfoVQA_VAL': '2342e9c225222f0ef4dec545ebb126fe',
'InfoVQA_TEST': 'df535bf51b88dc9718252c34131a6227',
'ChartQA_TEST': 'c902e0aa9be5582a7aad6dcf52734b42',
'GQA_TestDev_Balanced': 'fead7df22befc1ed3ca2b62ea26fa17b',
}
def build_prompt(self, line):
msgs = super().build_prompt(line)
assert msgs[-1]['type'] == 'text'
msgs[-1]['value'] += '\nAnswer the question using a single word or phrase.'
return msgs
# It returns a DataFrame
def evaluate(self, eval_file, **judge_kwargs):
from .utils.vqa_eval import hit_calculate, process_line
data = load(eval_file)
dataset = self.dataset_name
assert 'answer' in data and 'prediction' in data
data['prediction'] = [str(x) for x in data['prediction']]
data['answer'] = [str(x) for x in data['answer']]
lt = len(data)
pool = mp.Pool(16)
lines = [data.iloc[i] for i in range(lt)]
if listinstr(['TextVQA'], dataset):
res = pool.map(partial(process_line, method='vqa_score'), lines)
elif listinstr(['ChartQA'], dataset):
res = pool.map(partial(process_line, method='relaxed_accuracy'), lines)
elif listinstr(['OCRVQA', 'GQA'], dataset):
res = pool.map(partial(process_line, method='accuracy'), lines)
elif listinstr(['DocVQA', 'InfoVQA'], dataset):
res = pool.map(partial(process_line, method='anls'), lines)
else: # default using vqa_score to calculate score
res = pool.map(process_line, lines)
hit = hit_calculate(res, dataset)
ret = dict()
if 'split' in data:
splits = set(data['split'])
for sp in splits:
sub = [r for l, r in zip(lines, res) if l['split'] == sp]
# [np.mean(x['match']) >= full_score_weight for x in sub]
hit = hit_calculate(sub, dataset)
ret[sp] = np.mean(hit) * 100
sub = [r for l, r in zip(lines, res)]
hit = hit_calculate(sub, dataset)
ret['Overall'] = np.mean(hit) * 100
else:
ret['Overall'] = np.mean(hit) * 100
if 'category' in data:
cates = list(set(data['category']))
cates.sort()
for c in cates:
sub = [r for l, r in zip(lines, res) if l['category'] == c]
# [np.mean(x['match']) >= full_score_weight for x in sub]
hit = hit_calculate(sub, dataset)
ret[c] = np.mean(hit) * 100
ret = d2df(ret)
ret = ret.round(2)  # round() returns a new DataFrame, so keep the result
suffix = eval_file.split('.')[-1]
result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(ret, result_file)
return ret
class VizWiz(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'VizWiz': 'https://opencompass.openxlab.space/utils/VLMEval/VizWiz.tsv'
}
DATASET_MD5 = {
'VizWiz': 'fa4ac4164467563ed2fac6eac6631bd0'
}
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.vqa_eval import hit_calculate, process_line
suffix = eval_file.split('.')[-1]
result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
if not osp.exists(result_file):
data = load(eval_file)
assert 'answers' in data and 'prediction' in data
data['prediction'] = [str(x) for x in data['prediction']]
data['answer'] = [str(x) for x in data['answers']]
lt = len(data)
pool = mp.Pool(16)
lines = [data.iloc[i] for i in range(lt)]
res = pool.map(process_line, lines)
hit = hit_calculate(res, 'VizWiz')
ret = dict()
ret['Overall'] = np.mean(hit) * 100
ret = d2df(ret)
ret = ret.round(2)  # round() returns a new DataFrame, so keep the result
dump(ret, result_file)
retz = pd.read_csv(result_file)
return retz
class OCRBench(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'OCRBench': 'https://opencompass.openxlab.space/utils/VLMEval/OCRBench.tsv'
}
DATASET_MD5 = {'OCRBench': 'e953d98a987cc6e26ef717b61260b778'}
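# Scoring is substring matching: a sample counts if any ground-truth answer appears in the
# prediction (lower-cased for most categories; for handwritten math expressions whitespace is
# stripped instead). Category scores are grouped below, and 'Final Score Norm' is the total / 10.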
# It returns a dictionary
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
OCRBench_score = {
'Regular Text Recognition': 0,
'Irregular Text Recognition': 0,
'Artistic Text Recognition': 0,
'Handwriting Recognition': 0,
'Digit String Recognition': 0,
'Non-Semantic Text Recognition': 0,
'Scene Text-centric VQA': 0,
'Doc-oriented VQA': 0,
'Key Information Extraction': 0,
'Handwritten Mathematical Expression Recognition': 0,
}
data = load(eval_file)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
for i in tqdm(range(len(lines))):
line = lines[i]
predict = str(line['prediction'])
answers = eval(line['answer'])
category = line['category']
if category == 'Handwritten Mathematical Expression Recognition':
for j in range(len(answers)):
answer = answers[j].strip().replace('\n', ' ').replace(' ', '')
predict = predict.strip().replace('\n', ' ').replace(' ', '')
if answer in predict:
OCRBench_score[category] += 1
break
else:
for j in range(len(answers)):
answer = answers[j].lower().strip().replace('\n', ' ')
predict = predict.lower().strip().replace('\n', ' ')
if answer in predict:
OCRBench_score[category] += 1
break
final_score_dict = {}
final_score_dict['Text Recognition'] = \
(OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition']
+ OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition']
+ OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'])
final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA']
final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA']
final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction']
final_score_dict['Handwritten Mathematical Expression Recognition'] = \
(OCRBench_score['Handwritten Mathematical Expression Recognition'])
final_score_dict['Final Score'] = \
(final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA']
+ final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction']
+ final_score_dict['Handwritten Mathematical Expression Recognition'])
final_score_dict['Final Score Norm'] = (float(final_score_dict['Final Score']) / 10)
score_pth = eval_file.replace('.xlsx', '_score.json')
dump(final_score_dict, score_pth)
return final_score_dict
class MathVista(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MathVista_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVista_MINI.tsv'
}
DATASET_MD5 = {'MathVista_MINI': 'f199b98e178e5a2a20e7048f5dcb0464'}
# It returns a DataFrame
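# Evaluation relies on an LLM judge (build_judge) to extract each prediction's final answer.
# Judge outputs are cached per-index in a .pkl file so interrupted runs can resume, and the
# merged 'res'/'log' columns are stored in a per-model .xlsx before MathVista_acc computes accuracy.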
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mathvista import MathVista_auxeval, MathVista_acc
model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
assert model.working(), ('MathVista evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = track_progress_rich(
MathVista_auxeval,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,
)
ans = load(tmp_file)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']
data['res'] = [ans[idx]['res'] for idx in data['index']]
data['log'] = [ans[idx]['log'] for idx in data['index']]
dump(data, storage)
score = MathVista_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
dump(score, score_pth)
return score
class MathVerse(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MathVerse_MINI': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIV.tsv', # noqa
'MathVerse_MINI_Vision_Only': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVOnly.tsv', # noqa
'MathVerse_MINI_Vision_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVDom.tsv', # noqa
'MathVerse_MINI_Vision_Intensive': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVInt.tsv', # noqa
'MathVerse_MINI_Text_Lite': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITLite.tsv', # noqa
'MathVerse_MINI_Text_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITDom.tsv', # noqa
}
DATASET_MD5 = {
'MathVerse_MINI': '5017caca32b7fa110c350a1bea861b65',
'MathVerse_MINI_Vision_Only': '68a11d4680014ac881fa37adeadea3a4',
'MathVerse_MINI_Vision_Dominant': 'b8fb63852d261ab2aaefba29cc2414d3',
'MathVerse_MINI_Vision_Intensive': '01cbd35be202bb0c4873a4186a63bc19',
'MathVerse_MINI_Text_Lite': '19e4b13bdd30b89a03b2e358bcfefa04',
'MathVerse_MINI_Text_Dominant': '4f5cd2fa6630ea00bb11d6fde1f6fe6a',
}
# It returns a DataFrame
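# Two-stage judge pipeline: stage 1 extracts the candidate answer from each prediction, stage 2
# scores the extraction against the ground truth. Each stage caches its judge outputs in its own
# .pkl file so interrupted runs can resume.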
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mathverse import MathVerse_auxeval_extract, MathVerse_auxeval_score, MathVerse_acc
model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx')
tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl')
storage_score = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
tmp_file_score = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl')
nproc = judge_kwargs.pop('nproc', 4)
# stage1: extract the answer
if not osp.exists(storage_extract):
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file_extract):
ans = load(tmp_file_extract)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = track_progress_rich(
MathVerse_auxeval_extract,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file_extract,
)
ans = load(tmp_file_extract)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['log_extract'] == v['log_extract'] and ans[k]['extract'] == v['extract']
data['extract'] = [ans[idx]['extract'] for idx in data['index']]
data['log_extract'] = [ans[idx]['log_extract'] for idx in data['index']]
dump(data, storage_extract)
# stage2: score the answer
if not osp.exists(storage_score):
data = load(storage_extract)
model = build_judge(max_tokens=128, **judge_kwargs)
assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file_score):
ans = load(tmp_file_score)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = track_progress_rich(
MathVerse_auxeval_score,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file_score,
)
ans = load(tmp_file_score)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['log_score'] == v['log_score'] and ans[k]['score'] == v['score']
data['score'] = [ans[idx]['score'] for idx in data['index']]
data['log_score'] = [ans[idx]['log_score'] for idx in data['index']]
dump(data, storage_score)
score = MathVerse_acc(storage_score)
score_pth = storage_score.replace('.xlsx', '.csv')
dump(score, score_pth)
return score
class MathVision(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MathVision': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision.tsv',
'MathVision_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision_MINI.tsv'
}
DATASET_MD5 = {
'MathVision': '93f6de14f7916e598aa1b7165589831e',
'MathVision_MINI': '060fe4fa5d868987ce179307bd5f8a33'
}
# It returns a DataFrame
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mathv import MATH_V_auxeval, MATH_V_acc
if 'model' in judge_kwargs:
model = judge_kwargs['model']
else:
model = os.path.basename(os.environ.get('LOCAL_LLM'))
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
assert model.working(), ('MATH-Vision evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = track_progress_rich(
MATH_V_auxeval,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,
)
ans = load(tmp_file)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']
data['res'] = [ans[idx]['res'] for idx in data['index']]
data['log'] = [ans[idx]['log'] for idx in data['index']]
dump(data, storage)
score = MATH_V_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
dump(score, score_pth)
return score
class OlympiadBench(ImageBaseDataset):
TYPE = 'VQA_ex_prompt'
DATASET_URL = {
'OlympiadBench': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench.tsv',
'OlympiadBench_EN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_EN.tsv',
'OlympiadBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_CN.tsv'
}
DATASET_MD5 = {
'OlympiadBench': '9735ae0f0299eae1e7d07f5a7feab914',
'OlympiadBench_EN': '5c68e100d394351fc7049f29d4d4efed',
'OlympiadBench_CN': 'ea01b16788955702c79650c701e5b623'
}
def dump_image(self, line):
os.makedirs(self.img_root, exist_ok=True)
tgt_path_z = []
if isinstance(line['image'], list):
for i in range(len(line['image'])):
tgt_path = osp.join(self.img_root, f"{line['index']}--{i + 1}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'][i], tgt_path)
tgt_path_z.append(tgt_path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path_z.append(tgt_path)
return tgt_path_z
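# build_prompt assembles a Chinese or English competition-style instruction based on
# line['source'] (zh/en, maths/physics, theorem proving vs. open-ended) and asks the model to put
# its final answer in \boxed{}, which is what extract_answer / MathJudger parse in evaluate().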
def build_prompt(self, line):
from .utils.olympiadbench import get_answer_type_text, make_input
self.is_chinese = 'zh' in line['source']
self.is_math = 'maths' in line['source']
self.is_theorem_proving = 'TP' in line['source']
if self.is_chinese:
subject_content = '数学' if self.is_math else '物理'
if self.is_theorem_proving:
prompt = (
f"以下是中国{subject_content}竞赛中的证明题。请根据题目的要求,运用逻辑推理及常用定理证明题目中的命题。"
"证明过程中使用的变量和公式请使用LaTeX格式表示。"
)
else:
answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=True,
multiple_answer=line['is_multiple_answer'])
if line['is_multiple_answer']:
multiple_answer_text = '\\boxed{用英文逗号连接的多个答案}'
else:
multiple_answer_text = '\\boxed{答案}'
unit_text = ''
if line['unit']:
multiple_answer_text += '(单位)'
unit_text = ',注意答案的单位不要放在\\boxed{}中'
prompt = (
f'以下是中国{subject_content}竞赛中的解答题{answer_type_text}。请根据题目的要求和所提供的信息计算得出答案。'
f'解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以“所以最终答案是{multiple_answer_text}。”'
f'显式给出结果{unit_text}。'
)
else:
subject_content = 'Math' if self.is_math else 'Physics'
if self.is_theorem_proving:
prompt = (
f'The following is a theorem proving problem from an International {subject_content} competition. '
'Please use logical reasoning and common theorems to prove the proposition in the problem '
'according to the given requirements. '
'Please use LaTeX format to represent the variables and formulas used in the proof.'
)
else:
if line['is_multiple_answer']:
multiple_answer_text = '\\boxed{multiple answers connected with commas}'
else:
multiple_answer_text = '\\boxed{answer}'
unit_text = ''
if line['unit']:
multiple_answer_text += '(unit)'
unit_text = ', note that the unit of the answer should not be included in \\boxed{}'
answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=False,
multiple_answer=line['is_multiple_answer'])
prompt = (
f'The following is an open-ended problem from an International {subject_content} competition. '
f'{answer_type_text}Please calculate the answer according to the given requirements and '
'the information provided. Please use LaTeX format to represent the variables and formulas '
'used in the solution process and results. Please end your solution with "So the final answer '
f'is {multiple_answer_text}." and give the result explicitly{unit_text}.'
)
if self.is_math:
input = make_input(prompt, line['question'])
else:
if 'context' in line.keys() and str(line['context']) != 'nan': # cannot be null
input = make_input(prompt, line['context'] + '\n' + line['question'])
else:
input = make_input(prompt, line['question'])
ret = [dict(type='text', value=input)]
tgt_path = self.dump_image(line)
ret.extend([dict(type='image', value=s) for s in tgt_path])
return ret
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.olympiadbench import MathJudger, extract_answer
judger = MathJudger()
suffix = eval_file.split('.')[-1]
name_str1 = 'judge'
name_str2 = 'score'
result_file = eval_file.replace(f'.{suffix}', f'_{name_str1}_result.xlsx')
score_file = eval_file.replace(f'.{suffix}', f'_{name_str2}_result.csv')
if not osp.exists(result_file):
data = load(eval_file)
scorez = []
for i in tqdm(data.iterrows()):
line = i[1]
model_answer = line['prediction']
is_chinese = 'zh' in line['source']
model_answer = extract_answer(is_chinese, model_answer, is_deepseek=False)
answer_type = line['answer_type']
final_answer = line['final_answer'][2:-2]
if str(answer_type) != 'nan' and 'Tuple' in answer_type:
judge_result = judger.judge(model_answer, final_answer)
else:
if str(line['error']) != 'nan':
if ',' in line['error']:
precisions = line['error'].split(',')
precisions = [float(p) if p else 1e-8 for p in precisions]
judge_result = judger.judge(model_answer, final_answer, precisions)
else:
precision = float(line['error'])
judge_result = judger.judge(model_answer, final_answer, precision)
else:
judge_result = judger.judge(model_answer, final_answer)
scorez.append(judge_result)
data['score'] = scorez
dump(data, result_file)
judge_file = load(result_file)
if not osp.exists(score_file):
name_list = ['OE_MM_maths_en_COMP', 'OE_MM_maths_zh_CEE', 'OE_MM_maths_zh_COMP', 'OE_MM_physics_en_COMP',
'OE_MM_physics_zh_CEE','OE_TO_maths_en_COMP', 'OE_TO_maths_zh_CEE', 'OE_TO_maths_zh_COMP',
'OE_TO_physics_en_COMP', 'OE_TO_physics_zh_CEE']
sample_list = [[] for _ in range(len(name_list))]
for i in judge_file.iterrows():
line = i[1]
for j in range(len(name_list)):
if line['source'] == name_list[j]:
sample_list[j].append(line['score'])
acc_dict = {}
correct_list = []
# fine-grained
for i in range(len(name_list)):
correct_num = 0
for j in sample_list[i]:
if j:
correct_num += 1
correct_list.append(correct_num)
acc = 100 * correct_num / len(sample_list[i])
acc_dict[name_list[i]] = [acc]
# 4 grained
labela = ['zh', 'en']
labelb = ['maths', 'physics']
grain_list = [[x,y] for x in labela for y in labelb]
for j in grain_list:
dict_name = j[0] + "_" + j[1]
correct_num = 0
full_num = 0
for i in range(len(name_list)):
if all(k in name_list[i] for k in j):
correct_num += correct_list[i]
full_num += len(sample_list[i])
acc = 100 * correct_num / full_num
acc_dict[dict_name] = [acc]
# 2 grained
grain_list = ['maths', 'physics']
for j in grain_list:
dict_name = j
correct_num = 0
full_num = 0
for i in range(len(name_list)):
if j in name_list[i]:
correct_num += correct_list[i]
full_num += len(sample_list[i])
acc = 100 * correct_num / full_num
acc_dict[dict_name] = [acc]
# AVG
correct_num = sum(correct_list)
acc = 100 * correct_num / len(judge_file)
acc_dict['AVG'] = [acc]
acc_pd = pd.DataFrame(acc_dict)
acc_pd.to_csv(score_file, index=False, encoding='gbk')
accdz = pd.read_csv(score_file)
return accdz
class LLaVABench(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {'LLaVABench': 'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv'}
DATASET_MD5 = {'LLaVABench': 'd382a093f749a697820d3dadd61c8428'}
# It returns a DataFrame
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.llavabench import (
build_prompt,
LLaVABench_atomeval,
LLaVABench_score,
)
suffix = '.' + eval_file.split('.')[-1]
record_file = eval_file.replace(suffix, '_openai_result' + suffix)
score_file = eval_file.replace(suffix, '_score.csv')
nproc = judge_kwargs.pop('nproc', 4)
system_prompt = 'You are a helpful and precise assistant for checking the quality of the answer.'
if not osp.exists(record_file):
data = load(eval_file)
lines = [data.iloc[i] for i in range(len(data))]
model = build_judge(temperature=0.2, system_prompt=system_prompt, **judge_kwargs)
assert model.working(), ('LLaVABench evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
prompts = [build_prompt(line) for line in lines]
tups = [(model, prompt) for prompt in prompts]
scores = track_progress_rich(LLaVABench_atomeval, tups, nproc=nproc, chunksize=nproc)
data['gpt4_score'] = [x[0] for x in scores]
data['score'] = [x[1] for x in scores]
dump(data, record_file)
data = load(record_file)
ret = LLaVABench_score(data).round(1)
dump(ret, score_file)
return ret
class MMVet(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MMVet': 'https://opencompass.openxlab.space/utils/VLMEval/MMVet.tsv'
}
DATASET_MD5 = {'MMVet': '748aa6d4aa9d4de798306a63718455e3'}
# It returns a DataFrame
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mmvet import MMVet_auxeval, MMVet_acc
suffix = eval_file.split('.')[-1]
model = judge_kwargs['model']
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
data = load(eval_file)
model = build_judge(max_tokens=3, **judge_kwargs)
assert model.working(), ('MMVet evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = load(tmp_file) if osp.exists(tmp_file) else {}
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = track_progress_rich(
MMVet_auxeval,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,
)
ans = load(tmp_file)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score']
data['score'] = [ans[idx]['score'] for idx in data['index']]
data['log'] = [ans[idx]['log'] for idx in data['index']]
dump(data, storage)
score, score_fine = MMVet_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
score_fine_pth = storage.replace('.xlsx', '_score_fine.csv')
dump(score, score_pth)
dump(score_fine, score_fine_pth)
return score
class MTVQADataset(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {'MTVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MTVQA_TEST.tsv'}
DATASET_MD5 = {'MTVQA_TEST': 'd87c17dbab934b7cd89c0a3c1c5657f4'}
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
data = load(eval_file)
assert 'answer' in data and 'prediction' in data and 'category' in data
data['prediction'] = [str(x) for x in data['prediction']]
data['answer'] = [str(x) for x in data['answer']]
if 'split' in data:
assert np.all([x.lower() == 'test' for x in data['split']]), 'We only support MTVQA_TEST for now. '
lt = len(data)
category_scores = defaultdict(list)
for i in range(lt):
line = data.iloc[i]
ans = line['answer'].strip().lower().replace('.', '')
pred = line['prediction'].strip().lower().replace('.', '')
cate = line['category']
score = 1.0 if ans in pred else 0.0
category_scores[cate].append(score)
category_scores['Average'].append(score)
# Calculate the average score for each category, the score is normalized to [0, 100]
category_averages = {category: np.mean(scores) * 100 for category, scores in category_scores.items()}
suffix = eval_file.split('.')[-1]
result_file = eval_file.replace(f'.{suffix}', '_acc.json')
dump(category_averages, result_file)
return category_averages
# MT-VQA adopts a custom prompt
def build_prompt(self, line):
msgs = super().build_prompt(line)
assert sum([x['type'] == 'text' for x in msgs]) == 1
for item in msgs:
if item['type'] == 'text':
item['value'] += '\nAnswer the question using a word or phrase in the language of the question.'
return msgs
class TableVQABench(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'TableVQABench': 'https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/mentor-vil/datasets/tablevqa-bench.tsv'
}
DATASET_MD5 = {'TableVQABench': '2550adc61bdc82d8e62f3b003de7c62d'}
from .utils.tablevqabench import FINTABNETQA_PROMPT, VTABFACT_PROMPT, VWTQ_PROMPT
# It returns a DataFrame
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
import pandas as pd
from .utils.tablevqabench import evaluate_fintabnet, evaluate_tabfact, evaluate_wtq
data = load(eval_file)
assert 'answer' in data and 'prediction' in data
data['prediction'] = data['prediction'].str.replace('^Answer: ', '', regex=True)
data_group = dict(tuple(data.groupby('split')))
eval_result = {'split': [], 'average_scores': []}
for split in ['fintabnetqa', 'vtabfact', 'vwtq', 'vwtq_syn']:
data_split = data_group[split].to_dict(orient='records')
if split == 'fintabnetqa':
split_eval_meta = evaluate_fintabnet(data_split, ['accuracy'])
elif split == 'vtabfact':
split_eval_meta = evaluate_tabfact(data_split, ['accuracy'])
elif split == 'vwtq' or split == 'vwtq_syn':
split_eval_meta = evaluate_wtq(data_split, ['accuracy'])
eval_result['split'].append(split)
eval_result['average_scores'].append(split_eval_meta['average_scores'])
suffix = eval_file.split('.')[-1]
result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
eval_result = pd.DataFrame(eval_result)
dump(eval_result, result_file)
return eval_result
# TableVQABench adopts a custom prompt
def build_prompt(self, line):
msgs = super().build_prompt(line)
assert sum([x['type'] == 'text' for x in msgs]) == 1
for item in msgs:
if item['type'] == 'text':
if line['split'] == 'fintabnetqa':
item['value'] = self.FINTABNETQA_PROMPT.format_map({'question': item['value']})
elif line['split'] == 'vtabfact':
item['value'] = self.VTABFACT_PROMPT.format_map({'question': item['value']})
elif line['split'] == 'vwtq_syn' or line['split'] == 'vwtq':
item['value'] = self.VWTQ_PROMPT.format_map({'question': item['value']})
return msgs
class CustomVQADataset(ImageBaseDataset):
TYPE = 'VQA'
def load_data(self, dataset):
data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
def evaluate(self, eval_file, **judge_kwargs):
raise NotImplementedError
class CRPE(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'CRPE_EXIST': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_EXIST.tsv',
'CRPE_RELATION': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_RELATION.tsv'
}
DATASET_MD5 = {
'CRPE_EXIST': '315584e23ac1ff7f8719ed3b7ad90f08',
'CRPE_RELATION': 'bad7094cde0b572288f4b119c2d0c656'}
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.crpe import is_correct
# find-image, count-text, find-text,
# infer-choose, count-image, visual-reasoning
score = {
'exist': 0,
'subject': 0,
'predicate': 0,
'object': 0,
'total': 0,
}
num = {
'exist': 0,
'subject': 0,
'predicate': 0,
'object': 0,
'total': 0,
}
final_score_dict = {
'exist': 0,
'subject': 0,
'predicate': 0,
'object': 0,
'total': 0,
}
data = load(eval_file)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
for i in tqdm(range(len(lines))):
line = lines[i]
predict = str(line['prediction'])
answers = str(line['answer'])
# print("predict =", predict)
# print("answers =", answers)
category = line['category']
if is_correct(answers, predict):
score[category] += 1
score['total'] += 1
num[category] += 1
num['total'] += 1
for category in ['exist', 'subject', 'predicate', 'object', 'total']:
if num[category] != 0:
final_score_dict[category] = score[category] / num[category]
else:
final_score_dict[category] = None
score_pth = eval_file.replace('.xlsx', '_score.json')
dump(final_score_dict, score_pth)
return final_score_dict
def build_prompt(self, line):
ROOT = LMUDataRoot()
msgs = super().build_prompt(line)
for msg in msgs:
if msg['type'] == 'image':
msg['value'] = osp.join(osp.join(ROOT, 'images', self.dataset_name), msg['value'])
return msgs
class QSpatial(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'QSpatial_plus': '',
'QSpatial_scannet': ''
}
# NOTE: To evaluate Q-Spatial-ScanNet, you need to get the permission from ScanNet website
# Once you get the permission, you can use the helper code here to download and extract necessary images:
# https://github.com/andrewliao11/Q-Spatial-Bench-code?tab=readme-ov-file#for-qspatial_scannet
qspatial_root = "TO_BE_REPLACED_WITH_THE_PATH_TO_QSPATIAL_DATASET"
url = "https://raw.githubusercontent.com/andrewliao11/Q-Spatial-Bench-code/refs/heads/main/prompt_templates/"
def post_build(self, dataset):
# Download the prompt templates from github
links = [
self.url + "system_prompt.txt",
self.url + "spatial_prompt_single.txt",
self.url + "spatial_prompt_steps.txt",
self.url + "standard_prompt.txt",
self.url + "zero_shot_prompt.txt"
]
with tempfile.TemporaryDirectory() as temp_dir:
for link in links:
tgt_path = os.path.join(temp_dir, link.split("/")[-1])
os.system(f"wget {link} -O {tgt_path}")
self.system_prompt = open(os.path.join(temp_dir, "system_prompt.txt")).read()
self._prompt_templates = dict(
spatial_prompt_single=open(os.path.join(temp_dir, "spatial_prompt_single.txt")).read(),
spatial_prompt_steps=open(os.path.join(temp_dir, "spatial_prompt_steps.txt")).read(),
standard_prompt=open(os.path.join(temp_dir, "standard_prompt.txt")).read(),
zero_shot_prompt=open(os.path.join(temp_dir, "zero_shot_prompt.txt")).read(),
)
# Given one data record, return the built prompt (a multi-modal message), can override
def build_prompt(self, line):
from jinja2.sandbox import SandboxedEnvironment
text_prompt_template = self._prompt_templates["spatial_prompt_single"]
env = SandboxedEnvironment()
text_prompt = env.from_string(text_prompt_template).render(question=line["question"])
tgt_path = self.dump_image(line)
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=f"{self.system_prompt}\n{text_prompt}"))
return msgs
# Given the dataset name, return the dataset as a pandas dataframe, can override
def load_data(self, dataset):
import io
import pandas as pd
from datasets import load_dataset
hf_dataset = load_dataset("andrewliao11/Q-Spatial-Bench", split=dataset)
df = hf_dataset.to_pandas()
df.reset_index(drop=True, inplace=True)
df['index'] = df.index
df['answer'] = list(zip(df['answer_value'], df['answer_unit']))
df = df[['index'] + [col for col in df.columns if col != 'index']]
if dataset == "QSpatial_scannet":
df = df.drop(columns=["image"])
df["image"] = [Image.open(os.path.join(self.qspatial_root, image_path)) for image_path in df["image_path"]]
else:
df["image"] = [Image.open(io.BytesIO(image_dict["bytes"])) for image_dict in df["image"]]
df["image"] = [encode_image_to_base64(image) for image in df["image"]]
return df
@classmethod
def get_multiplier(self, unit):
unit = unit.lower()
if unit in ["meters", "meter", "m", "metre", "metres"]:
multiplier = 100
elif unit in ["centimeters", "centimeter", "cm"]:
multiplier = 1
elif unit in ["feet", "foot", "ft"]:
multiplier = 30.48
elif unit in ["inch", "inches", "in"]:
multiplier = 2.54
elif unit in ["mm"]:
multiplier = 0.1
else:
print(f"Unknown unit: {unit}")
multiplier = 0.
return multiplier
@classmethod
def parse_string(self, input_str):
# Regular expression to match the pattern (number or range, text)
match = re.match(r'\(([\d.-]+), (.+)\)', input_str)
if match:
number_part = match.group(1)
text = match.group(2)
if '-' in number_part:
start, end = map(float, number_part.split('-'))
number = (start + end) / 2
else:
number = float(number_part)
return number * self.get_multiplier(text)
else:
print(f"Unable to parse the input string {input_str}")
return 0
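# Example (following the rules above): parse_string("(1.5-2.5, meters)") averages the range to
# 2.0 and converts meters to centimeters, returning 200.0.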
@classmethod
def parse_prediction(self, vlm_response):
# Value
pattern = r'scalar{([^}]*)}'
str_inside_scalar_boxes = re.findall(pattern, vlm_response)[-1]
scalar_list = re.findall(r'\d+\.?\d*', str_inside_scalar_boxes)
parsed_scalar = np.array(scalar_list).astype(float).mean()
# Unit
pattern = r'distance_unit{([^}]*)}'
str_inside_unit_boxes = re.findall(pattern, vlm_response)
parsed_unit = str_inside_unit_boxes[-1]
pred_value_in_cms = parsed_scalar * self.get_multiplier(parsed_unit)
return pred_value_in_cms
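# Illustrative only (the exact response format comes from the downloaded prompt templates): a
# response containing "scalar{3 to 4}" and "distance_unit{feet}" parses to mean(3, 4) * 30.48 = 106.68 cm.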
# It returns a dictionary
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
data = load(eval_file)
if "model" in judge_kwargs:
from .utils.qspatial import QSpatial_auxeval
# extract using model
model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
model = build_judge(max_tokens=128, **judge_kwargs)
assert model.working(), ('Evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = track_progress_rich(
QSpatial_auxeval,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,
)
ans = load(tmp_file)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']
data['res'] = [ans[idx]['res'] for idx in data['index']]
data['log'] = [ans[idx]['log'] for idx in data['index']]
dump(data, storage)
data = load(storage)
pred_value_in_cms = []
for res in data["res"]:
try:
pred_value_in_cms.append(self.parse_string(res))
except ValueError:
pred_value_in_cms.append(0.)
pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8
else:
# regex parsing
pred_value_in_cms = []
n_errors_in_parsing = 0
for pred in data["prediction"]:
try:
parsed_value = self.parse_prediction(pred)
except IndexError:
n_errors_in_parsing += 1
parsed_value = 1e-8
pred_value_in_cms.append(parsed_value)
print(f"Encounter {n_errors_in_parsing} errors in parsing")
pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8
# Ground truth
ground_truth_value_in_cms = []
for answer in data["answer"]:
value, unit = eval(answer)
ground_truth_value_in_cms.append(value * self.get_multiplier(unit))
ground_truth_value_in_cms = np.array(ground_truth_value_in_cms) + 1e-8
# Calculate the score
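# A prediction counts as correct at threshold t when max(pred/gt, gt/pred) < t; delta_2 and
# delta_1_point_5 below correspond to t = 2 and t = 1.5.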
pred_gt = pred_value_in_cms / ground_truth_value_in_cms
gt_pred = ground_truth_value_in_cms / pred_value_in_cms
delta_2 = np.stack([pred_gt, gt_pred]).max(0) < 2.
delta_1_point_5 = np.stack([pred_gt, gt_pred]).max(0) < 1.5
data["eval_score_delta_2"] = delta_2
data["eval_score_delta_1_point_5"] = delta_1_point_5
final_score_dict = {
"delta_2": delta_2.mean(),
"delta_1_point_5": delta_1_point_5.mean()
}
for question_type in set(data["question_type"]):
filtered_data = data[data["question_type"] == question_type]
delta_2_per_question_type = filtered_data["eval_score_delta_2"].mean()
delta_1_point_5_per_question_type = filtered_data["eval_score_delta_1_point_5"].mean()
final_score_dict.update({f"{question_type}_delta_2": delta_2_per_question_type})
final_score_dict.update({f"{question_type}_delta_1_point_5": delta_1_point_5_per_question_type})
score_pth = eval_file.replace('.xlsx', '_score.json')
dump(final_score_dict, score_pth)
return final_score_dict
class MMNIAH(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MM_NIAH_VAL':
'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/MM_NIAH_VAL.tsv',
'MM_NIAH_TEST':
['https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-aa',
'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ab',
'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ac',
'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ad',
'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ae']}
DATASET_MD5 = {'MM_NIAH_VAL': '27e5a8c3cef7746cb38f89cd86c474c5',
'MM_NIAH_TEST': 'f490eb2a43096307465fe9e7ef13497c'}
def prepare_tsv(self, url, file_md5=None):
import os
data_root = LMUDataRoot()
os.makedirs(data_root, exist_ok=True)
update_flag = False
file_name = 'MM_NIAH_VAL.tsv' if 'MM_NIAH_VAL' in url else 'MM_NIAH_TEST.tsv'
data_path = osp.join(data_root, file_name)
if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
pass
elif file_name == 'MM_NIAH_TEST.tsv':
warnings.warn('The dataset tsv is not downloaded')
for i in range(len(url)):
if osp.exists(osp.join(data_root, 'part-a' + chr(ord('a') + i))):
print('part-a' + chr(ord('a') + i) + ' already exists')
continue
download_file(url[i], data_path)
file_prefix = 'part-'
output_file = data_path
split_files = sorted([f for f in os.listdir(data_root) if f.startswith(file_prefix)])
with open(output_file, 'wb') as outfile:
# Read each split part in order and append its bytes to the output file
for filename in split_files:
with open(osp.join(data_root, filename), 'rb') as infile:
outfile.write(infile.read())
update_flag = True
else:
warnings.warn('The dataset tsv is not downloaded')
download_file(url, data_path)
update_flag = True
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mmniah import is_correct
# find-image, count-text, find-text,
# infer-choose, count-image, visual-reasoning
MMNIAH_score = {
'count-text': 0,
'find-image': 0,
'find-text': 0,
'infer-choose': 0,
'count-image': 0,
'visual-reasoning': 0,
'total': 0,
}
MMNIAH_num = {
'count-text': 0,
'find-image': 0,
'find-text': 0,
'infer-choose': 0,
'count-image': 0,
'visual-reasoning': 0,
'total': 0,
}
final_score_dict = {
'count-text': 0,
'find-image': 0,
'find-text': 0,
'infer-choose': 0,
'count-image': 0,
'visual-reasoning': 0,
'total': 0,
}
data = load(eval_file)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
for i in tqdm(range(len(lines))):
line = lines[i]
predict = line['prediction']
answers = line['answer']
category = line['category']
if category in ['visual-reasoning', 'find-image']:
answers = int(answers)
if is_correct(answers, predict):
MMNIAH_score[category] += 1
MMNIAH_score['total'] += 1
MMNIAH_num[category] += 1
MMNIAH_num['total'] += 1
for category in ['find-image', 'count-text', 'find-text',
'infer-choose', 'count-image', 'visual-reasoning', 'total']:
if MMNIAH_num[category] != 0:
final_score_dict[category] = MMNIAH_score[category] / MMNIAH_num[category]
else:
final_score_dict[category] = None
score_pth = eval_file.replace('.xlsx', '_score.json')
dump(final_score_dict, score_pth)
return final_score_dict
def build_prompt(self, line):
msgs = super().build_prompt(line)
if isinstance(line, int):
line = self.data.iloc[line]
totalchoice = line['multi-choice options']
totalchoice = eval(totalchoice)
# find-image, count-text, find-text,
# infer-choose, count-image, visual-reasoning
context = msgs[-1]['value']
context = eval(context)
question = context[0] + '\n' + context[1]
# tgt_path is the list of all image paths
tgt_path = []
for i in range(len(msgs) - 1):
tgt_path.append(msgs[i]['value'])
choices = totalchoice[0]
choices_image = totalchoice[1]
if choices:
for c_idx, c in enumerate(choices):
question = f"{question}\n{chr(c_idx + ord('A'))}. {c}"
question += "\nAnswer with the option's letter from the given choices directly."
elif choices_image:
for c_idx in range(len(choices_image)):
question = f"{question}\n{chr(c_idx + ord('A'))}. <image>"
question += "\nAnswer with the option's letter from the given choices directly."
else:
question += '\nAnswer the question using a single word or phrase.'
question = '<start>' + question + '<end>'
question = question.split('<image>')
if choices_image:
for i in range(len(question) - 5):
question[i] = question[i] + '\n<image>'
for i in range(len(question) - 5, len(question) - 1):
question[i] = question[i] + '<image>'
else:
for i in range(len(question) - 1):
question[i] = question[i] + '\n<image>'
assert len(tgt_path) + 1 == len(question)
context = []
for i in range(len(tgt_path)):
context.append(question[i])
context.append(tgt_path[i])
context.append(question[-1])
context[0] = context[0][7:]
context[-1] = context[-1][:-5]
msgs = []
for i in range(len(context)):
if i % 2 == 0:
msgs.append(dict(type='text', value=context[i]))
else:
ROOT = LMUDataRoot()
msgs.append(dict(type='image', value=osp.join(osp.join(ROOT, 'images', self.dataset_name), context[i])))
msgs = [m for m in msgs if m['value'] != '']  # filter once instead of removing while iterating
return msgs
from ..smp import *
from ..utils import *
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
class ImageYORNDataset(ImageBaseDataset):
TYPE = 'Y/N'
DATASET_URL = {
'MME': 'https://opencompass.openxlab.space/utils/VLMEval/MME.tsv',
'HallusionBench': 'https://opencompass.openxlab.space/utils/VLMEval/HallusionBench.tsv',
'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv',
'AMBER': 'https://huggingface.co/datasets/yifanzhang114/AMBER_base64/resolve/main/AMBER.tsv',
}
DATASET_MD5 = {
'MME': 'b36b43c3f09801f5d368627fb92187c3',
'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c',
'POPE': 'c12f5acb142f2ef1f85a26ba2fbe41d5',
'AMBER': '970d94c0410916166e0a76ba75da7934',
}
# It returns a dataframe
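# Flow: rule-based YOrN_Extraction first; samples still 'Unknown' are optionally re-extracted by
# an LLM judge (when configured and working), then scored by exact match and aggregated with a
# dataset-specific rating function (MME / Hallusion / POPE / AMBER / default).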
def evaluate(self, eval_file, **judge_kwargs):
from .utils.yorn import YOrN_Extraction, YOrN_auxeval
from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating, AMBER_rating
dataset = self.dataset_name
data = load(eval_file)
data['prediction'] = [str(x) for x in data['prediction']]
storage = eval_file.replace('.xlsx', '_auxmatch.xlsx')
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])}
if osp.exists(tmp_file):
tmp = load(tmp_file)
for k in tmp:
if ans_map[k] == 'Unknown' and tmp[k] != 'Unknown':
ans_map[k] = tmp[k]
data['extracted'] = [ans_map[x] for x in data['index']]
unknown = data[data['extracted'] == 'Unknown']
model = judge_kwargs.get('model', 'exact_matching')
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
model = None
warnings.warn('OPENAI_API_KEY is not working properly, will use exact matching for evaluation')
if model is not None:
lt = len(unknown)
lines = [unknown.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = list(unknown['index'])
if len(tups):
res = track_progress_rich(
YOrN_auxeval, tups, nproc=nproc, chunksize=nproc, keys=indices, save=tmp_file)
for k, v in zip(indices, res):
ans_map[k] = v
data['extracted'] = [ans_map[x] for x in data['index']]
dump(data, storage)
data = load(storage)
if listinstr(['AMBER'], dataset):
data['score'] = (data['answer'].str.lower() == data['extracted'].str.lower())
else:
data['score'] = (data['answer'] == data['extracted'])
dump(data, storage)
if dataset is not None and listinstr(['MME'], dataset):
score = MME_rating(storage)
elif dataset is not None and listinstr(['Hallusion'], dataset):
score = Hallusion_rating(storage)
elif dataset is not None and listinstr(['POPE'], dataset):
score = POPE_rating(storage)
elif dataset is not None and listinstr(['AMBER'], dataset):
score = AMBER_rating(storage)
else:
score = default_rating(storage)
score_tgt = eval_file.replace('.xlsx', '_score.csv')
dump(score, score_tgt)
return score
from huggingface_hub import snapshot_download
from ..smp import *
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from glob import glob
FAIL_MSG = 'Failed to obtain answer via API.'
def timestamp_to_seconds(timestamp):
# Split the timestamp into hours, minutes, and seconds
h, m, s = timestamp.split(":")
# Convert hours and minutes to seconds and add the (possibly fractional) seconds part
total_seconds = int(h) * 3600 + int(m) * 60 + float(s)
return total_seconds
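# e.g. timestamp_to_seconds("01:02:03.5") == 3723.5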
def uniformly_subsample(lst, K):
n = len(lst)
if K >= n:
return lst
step = n / K
return [lst[int(i * step)] for i in range(K)]
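# e.g. uniformly_subsample(list(range(10)), 4) == [0, 2, 5, 7] (evenly spaced, keeps the first element)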
def insert_subtitles_into_frames(
frames,
frame_timestamps,
subtitles,
starting_timestamp_for_subtitles,
duration,
):
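# Interleave frames with subtitle text in temporal order: frames whose timestamp is at or before a
# subtitle's midpoint are emitted first, the subtitle line is kept only if at least one frame falls
# inside its (at least 1s wide) time window, and any remaining frames are appended at the end.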
interleaved_list = []
cur_i = 0
for subtitle in subtitles:
if "timestamp" in subtitle:
start, end = subtitle["timestamp"]
if not isinstance(end, float):
end = duration
start -= starting_timestamp_for_subtitles
end -= starting_timestamp_for_subtitles
subtitle_timestamp = (start + end) / 2
subtitle_text = subtitle["text"]
else:
start, end = subtitle["start"], subtitle["end"]
start = timestamp_to_seconds(start)
end = timestamp_to_seconds(end)
start -= starting_timestamp_for_subtitles
end -= starting_timestamp_for_subtitles
subtitle_timestamp = (start + end) / 2
subtitle_text = subtitle["line"]
for i, (frame, frame_timestamp) in enumerate(
zip(frames[cur_i:], frame_timestamps[cur_i:])
):
if frame_timestamp <= subtitle_timestamp:
# print("frame:", frame_timestamp)
interleaved_list.append({"type": "image", "value": frame})
cur_i += 1
else:
break
if end - start < 1:
end = subtitle_timestamp + 0.5
start = subtitle_timestamp - 0.5
covering_frames = False
for frame, frame_timestamp in zip(frames, frame_timestamps):
if frame_timestamp < end and frame_timestamp > start:
covering_frames = True
break
if covering_frames:
interleaved_list.append({"type": "text", "value": subtitle_text + "\n"})
else:
pass
for i, (frame, frame_timestamp) in enumerate(
zip(frames[cur_i:], frame_timestamps[cur_i:])
):
interleaved_list.append({"type": "image", "value": frame})
return interleaved_list
class LongVideoBench(VideoBaseDataset):
MD5 = '82905eae3a5ae7383c5a8ee9655e1ab9'
SYS = ''
TYPE = 'Video-MCQ'
def __init__(self, dataset='LongVideoBench', use_subtitle=False, nframe=0, fps=-1):
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
self.use_subtitle = use_subtitle
self.dataset_name = dataset
@classmethod
def supported_datasets(cls):
return ['LongVideoBench']
def prepare_dataset(self, dataset_name='LongVideoBench', repo_id='longvideobench/LongVideoBench'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if not osp.exists(data_file):
return False
if md5(data_file) != self.MD5:
print("md5 mismatch", md5(data_file), self.MD5)
return False
data = load(data_file)
for video_pth in data['video_path']:
if not osp.exists(osp.join(pth, video_pth)):
print(video_pth, "is not found")
return False
return True
if modelscope_flag_set():
repo_id = "AI-ModelScope/LongVideoBench"
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def generate_tsv(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if osp.exists(data_file) and md5(data_file) == self.MD5:
return
data_file = pd.read_json(osp.join(pth, 'lvb_val.json'))
data_file = data_file.assign(index=range(len(data_file)))
data_file['video'] = data_file['video_id']
data_file['video_path'] = data_file['video_path'].apply(lambda x: f'./videos/{x}')
data_file.to_csv(osp.join(pth, f'{dataset_name}.tsv'), sep='\t', index=False)
if modelscope_flag_set():
from modelscope import dataset_snapshot_download
dataset_snapshot_download(dataset_id=repo_id)
else:
snapshot_download(repo_id=repo_id, repo_type='dataset')
print("All videos are downloaded for LongVideoBench")
if not glob(osp.join(cache_path, "videos")):
tar_files = glob(osp.join(cache_path, "**/*.tar*"), recursive=True)
def untar_video_data(tar_file, cache_dir):
import tarfile
with tarfile.open(tar_file, "r") as tar_ref:
tar_ref.extractall(cache_dir)
print(f"Extracted all files from {tar_file} to {cache_dir}")
def concat_tar_parts(tar_parts, output_tar):
with open(output_tar, "wb") as out_tar:
from tqdm import tqdm
for part in tqdm(sorted(tar_parts)):
with open(part, "rb") as part_file:
out_tar.write(part_file.read())
print(f"Concatenated parts {tar_parts} into {output_tar}")
tar_parts_dict = {}
# Group tar parts together
for tar_file in tar_files:
base_name = tar_file.split(".tar")[0]
if base_name not in tar_parts_dict:
tar_parts_dict[base_name] = []
tar_parts_dict[base_name].append(tar_file)
# Concatenate and untar split parts
for base_name, parts in tar_parts_dict.items():
print(f"Extracting following tar files: {parts}")
output_tar = base_name + ".tar"
if not osp.exists(output_tar):
print('Start concatenating tar files')
concat_tar_parts(parts, output_tar)
print('Finish concatenating tar files')
if not osp.exists(osp.join(cache_path, osp.basename(base_name))):
untar_video_data(output_tar, cache_path)
print('All videos are extracted for LongVideoBench')
dataset_path = cache_path
generate_tsv(dataset_path)
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
return dict(data_file=data_file, root=dataset_path)
def save_video_frames(self, video_path, video_llm=False):
vid_path = osp.join(self.data_root, video_path)
vid = decord.VideoReader(vid_path)
video_info = {
'fps': vid.get_avg_fps(),
'n_frames': len(vid),
}
if self.nframe > 0 and self.fps < 0:
step_size = len(vid) / (self.nframe + 1)
indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
frame_paths = self.frame_paths(video_path[:-4])
elif self.fps > 0:
# not constrained by num_frames, get frames by fps
total_duration = video_info['n_frames'] / video_info['fps']
required_frames = int(total_duration * self.fps)
step_size = video_info['fps'] / self.fps
indices = [int(i * step_size) for i in range(required_frames)]
frame_paths = self.frame_paths_fps(video_path[:-4], len(indices))
flag = np.all([osp.exists(p) for p in frame_paths])
if not flag:
images = [vid[i].asnumpy() for i in indices]
images = [Image.fromarray(arr) for arr in images]
for im, pth in zip(images, frame_paths):
if not osp.exists(pth) and not video_llm:
im.save(pth)
return frame_paths, indices, video_info
# def save_video_into_images(self, line, num_frames=8):
# frame_paths, indices, video_info = self.save_video_frames(line['video_path'], num_frames)
# return frame_paths
def build_prompt(self, line, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
frames, indices, video_info = self.save_video_frames(line['video_path'], video_llm)
fps = video_info["fps"]
message = [dict(type='text', value=self.SYS)]
if video_llm:
message.append(dict(type='video', value=osp.join(self.data_root, line['video_path'])))
else:
if self.use_subtitle:
with open(osp.join(self.data_root, "subtitles", line["subtitle_path"])) as f:
subtitles = json.load(f)
frame_message = insert_subtitles_into_frames(
frames,
[ind_ / fps for ind_ in indices],
subtitles,
line["starting_timestamp_for_subtitles"],
line["duration"]
)
message += frame_message
else:
for im in frames:
message.append(dict(type='image', value=im))
line['question'] += '\n' + '\n'.join(
["{}. {}".format(chr(ord("A") + i), cand) for i, cand in enumerate(eval(line['candidates']))]
)
prompt = line["question"] + "\nAnswer with the option's letter from the given choices directly."
message.append(dict(type='text', value=prompt))
return message
# It returns a dictionary
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.longvideobench import get_dimension_rating, extract_characters_regex, extract_option
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
tgt_file = eval_file.replace('.xlsx', '_rating.json')
score_file = eval_file.replace('.xlsx', '_score.xlsx')
if not osp.exists(score_file):
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if FAIL_MSG not in v}
data = load(eval_file)
data_un = data[~pd.isna(data['prediction'])]
for idx in data['index']:
ans = data.loc[data['index'] == idx, 'correct_choice'].values[0]
ans = chr(ord("A") + ans)
pred = str(data.loc[data['index'] == idx, 'prediction'].values[0])
if extract_characters_regex(pred) == '':
extract_pred = extract_option(
model,
data.loc[data['index'] == idx].to_dict(orient='records')[0],
'LongVideoBench'
)
data.loc[idx, 'score'] = int(extract_pred == ans)
else:
data.loc[idx, 'score'] = int(extract_characters_regex(pred) == ans)
rejected = [x for x in data['score'] if x == -1]
print(
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
f'failed to obtain the score for another {len(rejected)} questions. '
f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
)
dump(data, score_file)
rating = get_dimension_rating(score_file)
dump(rating, tgt_file)
return rating
import json
import os
import pandas as pd
from .image_base import ImageBaseDataset
from ..smp import *
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
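# Build the grading prompt sent to the judge model for one MIA-Bench sample: it enumerates the
# weighted instruction components and asks for per-component scores plus a total out of 10 in the
# fixed "score of component 1: x/2, ..., total score: z/10" format that process_rawscore parses.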
def generate_prompt(d):
question = d['question']
weights = eval(d['component_weight'])
components = eval(d['components'])
num_of_component = int(d['num_of_component'])
response = d['prediction']
if num_of_component == 1:
components = f"The first component is: '{components[0]}'. "
score = f"The first component is worth: {weights[0]} scores. "
elif num_of_component == 2:
components = f"The first component is: '{components[0]}', and the second component is '{components[1]}'. "
score = f"The first and second component is each worth {weights[0]} and {weights[1]} scores. "
elif num_of_component == 3:
components = (
f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
f"and the third component is '{components[2]}'. "
)
score = (
"The first, second, and third component is each worth "
f"{weights[0]}, {weights[1]}, and {weights[2]} scores."
)
elif num_of_component == 4:
components = (
f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
f"and the third component is '{components[2]}', and the fourth component is '{components[3]}'. "
)
score = (
"The first, second, third, and fourth component is each worth "
f"{weights[0]}, {weights[1]}, {weights[2]}, and {weights[3]} scores."
)
elif num_of_component == 5:
components = (
f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
f"and the third component is '{components[2]}', and the fourth component is '{components[3]}', "
f"and the fifth component is '{components[4]}'. "
)
score = (
"The first, second, third, fourth, and fifth component is each worth "
f"{weights[0]}, {weights[1]}, {weights[2]}, {weights[3]}, and {weights[4]} scores."
)
return (
"Here is an instruction for a multimodal LLM: '"
f"{question}"
"'. You need to grade if the response from the model follows each component of the instruction. "
f"{components}"
"The response is: '"
f"{response}"
"'. You need to score the response and be strict. The total score ranges from 0 to 10, "
"depending on if the response follows the instruction. "
f"{score}"
"List scores of each component, and the total score in one sentence in this format: "
"score of component 1: x/2, score of component 2: y/8, total score: z/10. Then explain your reasons."
)
def process_rawscore(component_type, raw_score):
first_sentence = raw_score.split('.')[0].split(',')
score_dict = {}
for i in range(len(first_sentence) - 1):
score_ = first_sentence[i].split(':')[1][1:].split('/')
score = int(score_[0]) / int(score_[1])
score_dict[component_type[i]] = score
total_score_ = first_sentence[i + 1].split(':')[1][1:].split('/')
total_score = int(total_score_[0]) / int(total_score_[1])
score_dict['total_score'] = total_score
return score_dict
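# Example (hypothetical component names): with component_type == ['comp A', 'comp B'] and
# raw_score == 'score of component 1: 2/2, score of component 2: 6/8, total score: 8/10. ...',
# this returns {'comp A': 1.0, 'comp B': 0.75, 'total_score': 0.8}.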
def get_score_dict(data, score_raw):
cat_score_dict = {}
for i in range(len(data)):
try:
cmp = data['component_type'][i][2:-2]
cmp_list = cmp.split('\', \'')
score_dict = process_rawscore(cmp_list, score_raw[i])
for key, val in score_dict.items():
if key not in cat_score_dict.keys():
cat_score_dict[key] = [val]
else:
cat_score_dict[key].append(val)
except Exception:  # skip samples whose raw score string cannot be parsed
pass
cat_score_dict_average = {}
for key, val in cat_score_dict.items():
cat_score_dict_average[key] = sum(val) / len(val)
return cat_score_dict_average
class MIABench(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MIA-Bench': 'https://opencompass.openxlab.space/utils/VLMEval/Mia-Bench.tsv',
}
DATASET_MD5 = {
'MIA-Bench': '0b9de595f4dd40af18a69b94d89aba82',
}
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
judge_name = judge_kwargs.pop('model', 'gpt-4o')
model = build_judge(model=judge_name, **judge_kwargs)
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
data = load(eval_file)
num_samples = len(data)
lines = [data.loc[i] for i in range(num_samples)]
prompts = [generate_prompt(line) for line in lines]
org_data = MIABench('MIA-Bench').data
img_map = {x: y for x, y in zip(org_data['index'], org_data['image'])}
image_b64 = [img_map[idx] for idx in data['index']]
indices = list(data['index'])
mm_messages = [
dict(message=[
dict(type='text', value=prompt),
dict(type='image', value=f'data:image/jpeg;base64,{b64}')
])
for prompt, b64 in zip(prompts, image_b64)
]
res = {}
if osp.exists(tmp_file):
res = load(tmp_file)
jobs = {k: v for k, v in zip(indices, mm_messages) if k not in res}
job_keys = list(jobs.keys())
job_vals = [jobs[k] for k in job_keys]
resps = track_progress_rich(
model.generate,
job_vals,
nproc=nproc,
chunksize=nproc,
keys=job_keys,
save=tmp_file,
)
for k, resp in zip(job_keys, resps):
res[k] = resp
data['score_raw'] = [res[idx] for idx in indices]
dump(data, storage)
goresult = load(storage)
results = get_score_dict(goresult, goresult['score_raw'])
result_pth = storage.replace('.xlsx', '_score.csv')
results_pd = pd.DataFrame.from_dict(list(results.items()))
dump(results_pd, result_pth)
return results
import huggingface_hub
from huggingface_hub import snapshot_download
from ..smp import *
from .video_concat_dataset import ConcatVideoDataset
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
import torchvision.transforms as T
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from decord import VideoReader, cpu
import pandas as pd
import imageio
import cv2
import zipfile
import os
import glob
from .utils.mlvu import *
FAIL_MSG = 'Failed to obtain answer via API.'
class MLVU(ConcatVideoDataset):
def __init__(self, dataset='MLVU', nframe=0, fps=-1):
self.DATASET_SETS[dataset] = ['MLVU_MCQ', 'MLVU_OpenEnded']
self.type_data_dict = {
'M-Avg':['plotQA', 'needle', 'ego', 'count', 'anomaly_reco', 'topic_reasoning'],
'G-Avg':['sub_scene', 'summary']
}
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
@classmethod
def supported_datasets(cls):
return ['MLVU']
def evaluate(self, eval_file, **judge_kwargs):
result = super().evaluate(eval_file=eval_file, **judge_kwargs)
suffix = eval_file.split('.')[-1]
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
for key in self.type_data_dict:
result.loc[key] = 0.0
for name, item in result.iterrows():
if name in self.type_data_dict[key]:
result.loc[key, 'success'] += item['success']
result.loc[key, 'overall'] += item['overall']
if key == 'G-Avg':
result.loc[key, 'acc'] = round(
result.loc[key, 'success'] / result.loc[key, 'overall'], 2
)
else:
result.loc[key, 'acc'] = round(
result.loc[key, 'success'] / result.loc[key, 'overall'] * 100, 1
)
result = result.reset_index().rename(columns={'index': 'task'})
dump(result, score_file)
return result
class MLVU_MCQ(VideoBaseDataset):
MD5 = 'bb5c37e7cf8d43fc9a25c23d2b4633f5'
BASE_SYS = 'Carefully watch this video and pay attention to every detail. '
SYS = BASE_SYS + 'Based on your observations, select the best option that accurately addresses the question.'
TYPE = 'Video-MCQ'
def __init__(self, dataset='MLVU_MCQ', nframe=0, fps=-1):
self.type_data_list = {
'plotQA': ('1_plotQA.json', './MLVU/video/1_plotQA', 'MCQ'),
'needle': ('2_needle.json', './MLVU/video/2_needle', 'MCQ'),
'ego': ('3_ego.json', './MLVU/video/3_ego', 'MCQ'),
'count': ('4_count.json', './MLVU/video/4_count', 'MCQ'),
'order': ('5_order.json', './MLVU/video/5_order', 'MCQ'),
'anomaly_reco': ('6_anomaly_reco.json', './MLVU/video/6_anomaly_reco', 'MCQ'),
'topic_reasoning': ('7_topic_reasoning.json', './MLVU/video/7_topic_reasoning', 'MCQ'),
}
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
@classmethod
def supported_datasets(cls):
return ['MLVU_MCQ']
def prepare_dataset(self, dataset_name='MLVU_MCQ', repo_id='MLVU/MVLU'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if not os.path.exists(data_file):
return False
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for idx, item in data.iterrows():
if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
return False
return True
if modelscope_flag_set():
repo_id = "AI-ModelScope/MLVU"
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def generate_tsv(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if os.path.exists(data_file) and md5(data_file) == self.MD5:
return
json_data_dir = os.path.join(dataset_path, 'MLVU', 'json')
self.data_list = []
for k, v in self.type_data_list.items():
with open(os.path.join(json_data_dir, v[0]), 'r') as f:
json_data = json.load(f)
for data in json_data:
self.data_list.append({
'task_type': k,
'prefix': v[1],
'duration': data['duration'],
'video': data['video'],
'question': data['question'],
'answer': data['answer'],
'candidates': data['candidates'],
})
data_df = pd.DataFrame(self.data_list)
data_df = data_df.assign(index=range(len(data_df)))
data_df.to_csv(data_file, sep='\t', index=False)
if modelscope_flag_set():
from modelscope import dataset_snapshot_download
dataset_path = dataset_snapshot_download(dataset_id=repo_id)
else:
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
huggingface_hub.login(hf_token)
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
generate_tsv(dataset_path)
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
return dict(root=dataset_path, data_file=data_file)
def qa_template(self, data):
question = f"Question: {data['question']}\n"
question += 'Options:\n'
answer = data['answer']
answer_idx = -1
for idx, c in enumerate(eval(data['candidates'])):
question += f"({chr(ord('A') + idx)}) {c}\n"
if c == answer:
answer_idx = idx
question = question.rstrip()
answer = f"({chr(ord('A') + answer_idx)}) {answer}"
return question, answer
def save_video_frames(self, line):
suffix = line['video'].split('.')[-1]
video = line['video'].replace(f'.{suffix}','')
vid_path = osp.join(self.data_root, line['prefix'], line['video'])
vid = decord.VideoReader(vid_path)
video_info = {
'fps': vid.get_avg_fps(),
'n_frames': len(vid),
}
if self.nframe > 0 and self.fps < 0:
step_size = len(vid) / (self.nframe + 1)
indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
frame_paths = self.frame_paths(video)
elif self.fps > 0:
# not constrained by num_frames, get frames by fps
total_duration = video_info['n_frames'] / video_info['fps']
required_frames = int(total_duration * self.fps)
step_size = video_info['fps'] / self.fps
indices = [int(i * step_size) for i in range(required_frames)]
frame_paths = self.frame_paths_fps(video, len(indices))
flag = np.all([osp.exists(p) for p in frame_paths])
if not flag:
images = [vid[i].asnumpy() for i in indices]
images = [Image.fromarray(arr) for arr in images]
for im, pth in zip(images, frame_paths):
if not osp.exists(pth):
im.save(pth)
return frame_paths
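    # Frames are sampled either uniformly (nframe > 0 and fps < 0: nframe frames at
    # evenly spaced positions, excluding the endpoints) or at a fixed rate (fps > 0);
    # exactly one of the two modes is expected to be active when this method is called.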
def save_video_into_images(self, line):
frame_paths = self.save_video_frames(line)
return frame_paths
def build_prompt(self, line, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
question, answer = self.qa_template(line)
message = [dict(type='text', value=self.SYS, role='system')]
message.append(dict(type='text', value=question))
video_path = os.path.join(self.data_root, line['prefix'], line['video'])
if video_llm:
message.append(dict(type='video', value=video_path))
else:
img_frame_paths = self.save_video_into_images(line)
for im in img_frame_paths:
message.append(dict(type='image', value=im))
message.append(dict(type='text', value='\nOnly give the best option.'))
return message
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
score_file = eval_file.replace('.xlsx', '_score.xlsx')
if not osp.exists(score_file):
model = judge_kwargs.setdefault('model', 'chatgpt-0125')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if FAIL_MSG not in v}
data = load(eval_file)
data_un = data[~pd.isna(data['prediction'])]
for idx in data['index']:
ans = data.loc[data['index'] == idx, 'answer'].values[0]
pred = data.loc[data['index'] == idx, 'prediction'].values[0]
options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
answer_idx = -1
for id, c in enumerate(options):
if c == ans:
answer_idx = id
ans = f"({chr(ord('A') + answer_idx)}) {ans}"
input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
for id, option_content in enumerate(eval(input_item['candidates'])):
input_item[chr(ord('A') + id)] = option_content
if option_content == input_item['answer']:
input_item['answer'] = chr(ord('A') + id)
if FAIL_MSG in pred:
data.loc[idx, 'score'] = -1
else:
data.loc[idx, 'score'] = int(check_ans_with_model(
pred, ans, model,
input_item,
'MLVU_MCQ'
))
rejected = [x for x in data['score'] if x == -1]
print(
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
f'failed to obtain the score for another {len(rejected)} questions. '
f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
)
dump(data, score_file)
rating = get_dimension_rating(score_file)
return rating
class MLVU_OpenEnded(VideoBaseDataset):
MD5 = 'cee573a3627c6ac434ded704c60511ba'
BASE_SYS = 'Carefully watch this video and pay attention to every detail. '
SYS = BASE_SYS + 'Based on your observations, answer the given questions.'
TYPE = 'Video-VQA'
def __init__(self, dataset='MLVU_OpenEnded', nframe=0, fps=-1):
self.type_data_list = {
'sub_scene': ('8_sub_scene.json', './MLVU/video/8_sub_scene', 'VQA'),
'summary': ('9_summary.json', './MLVU/video/9_summary', 'VQA')
}
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
@classmethod
def supported_datasets(cls):
return ['MLVU_OpenEnded']
def prepare_dataset(self, dataset_name='MLVU_OpenEnded', repo_id='MLVU/MVLU'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if not os.path.exists(data_file):
return False
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for idx, item in data.iterrows():
if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
return False
return True
if modelscope_flag_set():
repo_id = "AI-ModelScope/MLVU"
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def generate_tsv(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if os.path.exists(data_file) and md5(data_file) == self.MD5:
return
json_data_dir = os.path.join(dataset_path, 'MLVU', 'json')
self.data_list = []
for k, v in self.type_data_list.items():
with open(os.path.join(json_data_dir, v[0]), 'r') as f:
json_data = json.load(f)
for data in json_data:
self.data_list.append({
'task_type': k,
'prefix': v[1],
'duration': data['duration'],
'video': data['video'],
'question': data['question'],
'answer': data['answer'],
'scoring_points': data['scoring_points'] if 'scoring_points' in data else ''
})
data_df = pd.DataFrame(self.data_list)
data_df = data_df.assign(index=range(len(data_df)))
data_df.to_csv(data_file, sep='\t', index=False)
if modelscope_flag_set():
from modelscope import dataset_snapshot_download
dataset_path = dataset_snapshot_download(dataset_id=repo_id)
else:
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
huggingface_hub.login(hf_token)
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
generate_tsv(dataset_path)
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
return dict(root=dataset_path, data_file=data_file)
def qa_template(self, data):
question = f"{data['question']}"
answer = data['answer']
return question, answer
def save_video_frames(self, line):
suffix = line['video'].split('.')[-1]
video = line['video'].replace(f'.{suffix}','')
vid_path = osp.join(self.data_root, line['prefix'], line['video'])
vid = decord.VideoReader(vid_path)
video_info = {
'fps': vid.get_avg_fps(),
'n_frames': len(vid),
}
if self.nframe > 0 and self.fps < 0:
step_size = len(vid) / (self.nframe + 1)
indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
frame_paths = self.frame_paths(video)
elif self.fps > 0:
# not constrained by num_frames, get frames by fps
total_duration = video_info['n_frames'] / video_info['fps']
required_frames = int(total_duration * self.fps)
step_size = video_info['fps'] / self.fps
indices = [int(i * step_size) for i in range(required_frames)]
frame_paths = self.frame_paths_fps(video, len(indices))
flag = np.all([osp.exists(p) for p in frame_paths])
if not flag:
images = [vid[i].asnumpy() for i in indices]
images = [Image.fromarray(arr) for arr in images]
for im, pth in zip(images, frame_paths):
if not osp.exists(pth):
im.save(pth)
return frame_paths
def save_video_into_images(self, line):
frame_paths = self.save_video_frames(line)
return frame_paths
def build_prompt(self, line, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
question, answer = self.qa_template(line)
message = [dict(type='text', value=self.SYS, role='system')]
message.append(dict(type='text', value=question))
video_path = os.path.join(self.data_root, line['prefix'], line['video'])
if video_llm:
message.append(dict(type='video', value=video_path))
else:
img_frame_paths = self.save_video_into_images(line)
for im in img_frame_paths:
message.append(dict(type='image', value=im))
return message
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
        model = judge_kwargs.setdefault('model', 'gpt-4-0125')
        if model != 'gpt-4-0125':
            print('MLVU OpenEnded evaluation defaults to gpt-4-0125, so the judge model is switched to gpt-4-0125.')
            judge_kwargs['model'] = 'gpt-4-0125'
suffix = eval_file.split('.')[-1]
score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(score_file):
data = load(eval_file)
model_dict = {
'sub_scene': build_judge(system_prompt=system_prompt_sub_scene, **judge_kwargs),
'summary': build_judge(system_prompt=system_prompt_summary, **judge_kwargs)
}
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model_dict[line['task_type']], line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
_ = track_progress_rich(
MLVU_OpenEnded_generate,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,
)
ans = load(tmp_file)
data = MLVU_OpenEnded_extract(ans, data)
dump(data, score_file)
rating = get_dimension_rating(score_file)
return rating
from huggingface_hub import snapshot_download
from ..smp import *
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
FAIL_MSG = 'Failed to obtain answer via API.'
def unwrap_hf_pkl(pth, suffix='.mp4'):
base_dir = os.path.join(pth, 'video_pkl/')
target_dir = os.path.join(pth, 'video/')
pickle_files = [os.path.join(base_dir, file) for file in os.listdir(base_dir)]
pickle_files.sort()
if not os.path.exists(target_dir):
os.makedirs(target_dir, exist_ok=True)
for pickle_file in pickle_files:
with open(pickle_file, 'rb') as file:
video_data = pickle.load(file)
# For each video file in the pickle file, write its contents to a new mp4 file
for video_name, video_content in video_data.items():
output_path = os.path.join(target_dir, f'{video_name}{suffix}')
with open(output_path, 'wb') as output_file:
output_file.write(video_content)
print('The video file has been restored and stored from the pickle file.')
else:
print('The video file already exists.')
class MMBenchVideo(VideoBaseDataset):
MD5 = '98f7df3eb1007fc375ea6fe88a98e2ff'
SYS = 'You are an AI assistant responsible for answering questions about videos.'
FRAMES_TMPL_PACK = """
You will be provided with {} separate frames uniformly sampled from a video, \
the frames are provided in chronological order of the video.
Please analyze these images and provide the answer / answers to the \
following question / questions about the video content.
If multiple questions are provided (with indices I1, I2, I3, ...), \
you should organize your answers in the following json format:
{{
'I1': 'Answer to Question I1',
'I2': 'Answer to Question I2',
...
}}
Otherwise, please directly reply with your response to the only question.
Even if the information in these separate frames is not enough to give an answer,
PLEASE GIVE A RESPONSE TO EACH OF THE QUESTIONS IN THE FORMAT DESCRIBED ABOVE.
"""
FRAMES_TMPL_NOPACK = """
You will be provided with {} separate frames uniformly sampled from a video, \
the frames are provided in chronological order of the video.
Please analyze these images and provide the answer to the question about the video content.
Please directly reply with your response to the only question.
"""
TYPE = 'Video-VQA'
def __init__(self, dataset='MMBench-Video', pack=False, nframe=0, fps=-1):
super().__init__(dataset=dataset, pack=pack, nframe=nframe, fps=fps)
@classmethod
def supported_datasets(cls):
return ['MMBench-Video']
def prepare_dataset(self, dataset_name='MMBench-Video', repo_id='opencompass/MMBench-Video'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for video_pth in data['video_path']:
if not osp.exists(osp.join(pth, video_pth)):
return False
return True
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
if modelscope_flag_set():
from modelscope import dataset_snapshot_download
dataset_path = dataset_snapshot_download(dataset_id=repo_id)
else:
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
unwrap_hf_pkl(dataset_path)
self.video_path = osp.join(dataset_path, 'video/')
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
return dict(data_file=data_file, root=osp.join(dataset_path, 'video'))
def build_prompt_pack(self, line):
if isinstance(line, int):
assert line < len(self)
video = self.videos[line]
elif isinstance(line, pd.Series):
video = line['video']
elif isinstance(line, str):
video = line
frames = self.save_video_frames(video)
sub = self.data[self.data['video'] == video]
sys_prompt = self.SYS + self.FRAMES_TMPL_PACK.format(len(frames))
message = [dict(type='text', value=sys_prompt)]
for im in frames:
message.append(dict(type='image', value=im))
nq = len(sub)
prompt = 'Questions: \n{}\nAnswers: \n'
qs = {int(sub.iloc[i]['index']): sub.iloc[i]['question'] for i in range(nq)}
prompt = prompt.format(json.dumps(qs))
message.append(dict(type='text', value=prompt))
return message
def build_prompt_nopack(self, line, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
if video_llm:
question = line['question']
prefix, video_idx_path = os.path.split(line['video_path'])
message = [dict(type='text', value=question)]
message.append(dict(type='video', value=os.path.join(self.video_path, video_idx_path)))
return message
else:
frames = self.save_video_frames(line['video'])
sys_prompt = self.FRAMES_TMPL_NOPACK.format(len(frames))
message = [dict(type='text', value=sys_prompt)]
for im in frames:
message.append(dict(type='image', value=im))
prompt = 'Question: {}\nAnswer: '.format(line['question'])
message.append(dict(type='text', value=prompt))
return message
def build_prompt(self, line, video_llm):
if self.pack and not video_llm:
return self.build_prompt_pack(line)
else:
return self.build_prompt_nopack(line, video_llm)
@staticmethod
def remove_side_quote(s, syms=[',', '"', "'"]):
if np.all([x in syms for x in s]):
return ''
while s[0] in syms:
s = s[1:]
while s[-1] in syms:
s = s[:-1]
return s
@staticmethod
def robust_json_load(s):
try:
jsons = list(extract_json_objects(s))
assert len(jsons) == 1
return jsons[0]
except:
if '{' in s and s.find('{') == s.rfind('{'):
sub_str = s[s.find('{') + 1:].strip()
lines = sub_str.split('\n')
res = {}
for l in lines:
l = l.strip()
if ': ' in l:
key = l.split(': ')[0].strip()
val = l.split(': ')[1].strip()
key = MMBenchVideo.remove_side_quote(key)
val = MMBenchVideo.remove_side_quote(val)
if len(key) and len(val):
res[key] = val
return res
return None
def load_pack_answers(self, data_raw):
vstats = defaultdict(lambda: 0)
data = defaultdict(lambda: {})
for k in data_raw:
ans = data_raw[k].strip()
if FAIL_MSG in ans:
vstats['GEN_FAIL'] += 1
continue
res = self.robust_json_load(ans)
if res is not None:
data[k] = res
vstats['PARSE_OK'] += 1
else:
vstats['PARSE_FAIL'] += 1
# return data
meta = cp.deepcopy(self.data)
lt = len(meta)
prediction = []
for i in range(lt):
line = meta.iloc[i]
vid = line['video']
idx = str(line['index'])
prediction.append(data[vid][idx] if idx in data[vid] else None)
meta['prediction'] = prediction
vstats['VALIDQ'] = len([x for x in prediction if x is not None])
vstats['INVALIDQ'] = len([x for x in prediction if x is None])
return meta, vstats
# It returns a dictionary
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mmbench_video import get_dimension_rating, system_prompt, build_prompt
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
judge = judge_kwargs['model']
nproc = judge_kwargs.pop('nproc', 4)
tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')
model = build_judge(system_prompt=system_prompt, **judge_kwargs)
assert model.working(), 'MMBench-Video evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE
if not osp.exists(score_file):
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if model.fail_msg not in v}
data = load(eval_file)
data_un = data[~data['index'].isin(res)]
data_un = data_un[~pd.isna(data_un['prediction'])]
lt = len(data_un)
prompts = [build_prompt(data_un.iloc[i]) for i in range(lt)]
indices = [data_un.iloc[i]['index'] for i in range(lt)]
if len(prompts):
_ = track_progress_rich(
model.generate,
prompts,
keys=indices,
save=tmp_file,
nproc=nproc,
chunksize=nproc
)
score_map = load(tmp_file)
data['score'] = [score_map[idx] if idx in score_map else -1 for idx in data['index']]
rejected = [x for x in score_map.values() if FAIL_MSG in x]
data['score'] = [int(x) if istype(x, int) else -1 for x in data['score']]
print(
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(score_map)} questions, '
f'failed to obtain the score for another {len(rejected)} questions. '
f'Those questions will be counted as 0 score in ALL rating, and will not be counted in VALID rating.'
)
dump(data, score_file)
rating = get_dimension_rating(score_file)
dump(rating, tgt_file)
return rating
import warnings
import pandas as pd
from abc import abstractmethod
from ..smp import *
from .image_base import ImageBaseDataset
class MMGenBench(ImageBaseDataset):
prompt_list = [
"""
# Role
You are an expert in the field of image understanding, focusing on the \
understanding of images and generating the image caption-prompt.
# Definition Explanation
image caption-prompt: Refers to the caption or description of an image, \
used to provide to a Text-to-Image model to generate a new image.
Text-to-Image model: Can generate a new image based on the provided image \
caption-prompt, such as stable diffusion 3, flux, and other image generation models.
# Task Description
Generate an image caption-prompt based on the input image.
# Key Points and Requirements
1. Accurately understand the input image and precisely generate an image caption-prompt.
2. The generated image caption-prompt, when provided to the Text-to-Image model, requires the \
Text-to-Image model to generate a new image that is as consistent as possible with the input image.
3. The generated image caption-prompt must conform to the preferences of the Text-to-Image model.
4. The generated image caption-prompt should describe the input image in as much \
detail as possible, and it should be between 20 to 60 words.
# Output Format
A string, that is the image caption-prompt. No extra output needed.
"""
]
TYPE = 'GenerateImgPrompt'
DATASET_URL = {
'MMGenBench-Test': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Test.tsv',
'MMGenBench-Domain': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Domain.tsv',
}
PROMPT_MAP = {
'MMGenBench-Test': prompt_list[0],
'MMGenBench-Domain': prompt_list[0],
}
DATASET_MD5 = {
'MMGenBench-Test': "94f8dac6bbf7c20be403f99adeaa73da",
'MMGenBench-Domain': "5c10daf6e2c5f08bdfb0701aa6db86bb",
}
def __init__(self, dataset='MMGenBench', **kwargs):
super().__init__(dataset, **kwargs)
warnings.warn('This dataset is for inference only and does not support direct output of evaluation results.\n')
warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n')
def load_data(self, dataset):
data = super().load_data(dataset)
if 'question' not in data:
data['question'] = [(
self.PROMPT_MAP[dataset]
)] * len(data)
return data
# Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
@abstractmethod
def evaluate(self, eval_file, **judge_kwargs):
warnings.warn('This evaluation method is not supported.\n')
warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n')
return None
import re
import math
from urllib.request import urlopen
from PIL import Image, ImageDraw, ImageFont
import torchvision.transforms as transforms
from vlmeval.dataset.utils import build_judge, levenshtein_distance
from vlmeval.smp import *
from .image_base import ImageBaseDataset
FAIL_MSG = 'Failed to obtain answer via API.'
def get_gpt4_ICE():
example_1 = """
---
Question: List the primary questions asked about the services in this report.
Analysis: The primary questions asked about the services in the report for The Limes Residential Home are:\n\n
1. Is the service safe?\n
2. Is the service effective?\n
3. Is the service caring?\n
4. Is the service responsive?\n
5. Is the service well-led?
Extracted answer: [
        'Is the service safe?',
        'Is the service effective?',
        'Is the service caring?',
'Is the service responsive?',
'Is the service well-led?'
]
Answer format: List\n
"""
example_2 = """
---
Question: How many regulations of the HSCA 2008 are breached in all according to this report?
Analysis: According to the report, the provider breached 10 Health and Social Care Act 2008 (Regulated Activities)
Regulations in total. Here are the specifics:\n\n1. Regulation 13: Safeguarding service users from abuse and
improper treatment\n2. Regulation 12: Safe care and treatment\n3. Regulation 18: Staffing\n4. Regulation 11:
Need for consent\n5. Regulation 10: Dignity and respect\n6. Regulation 9: Person-centred care\n7. Regulation 17:
Good governance\n8. Regulation 18 (CQC Registration Regulations 2009): Notification of other incidents\n9.
Regulation 18: Failure to maintain an accurate and up-to-date care plan\n10. Regulation 11: Failure to implement
the Mental Capacity Act 2005 code of practice effectively\n\nThese breaches involve issues concerning staffing,
safeguarding, medicines management, dignity and respect, consent, care planning, governance, and failure to
notify the CQC of incidents.
Extracted answer: 10
Answer format: Integer\n
"""
example_3 = """
---
Question: According to the survey that is the percentage of Chinese who are paying more or
about the same attention to politics after Trump's election?
Analysis: The survey provided does not specify the percentage of Chinese individuals specifically who are paying
more or about the same attention to politics after Trump's election. The report focuses primarily on American
demographics and does not include specific details about the Chinese population in relation to this question. If
you need information about a different demographic or a summary of the findings from the American demographic,
I can certainly help with that!
Extracted answer: Not answerable
Answer format: String\n
"""
example_4 = """
---
Question: How many quotations from male respondent over 50 years old are included in this report?
Analysis: The image you've provided appears to be a screenshot of a document with multiple charts. However, the
text is too small and blurry to read accurately. If you can provide a clearer image or more context, I might be
able to help you with your question.
Extracted answer: Fail to answer
Answer format: String\n
"""
return [example_1, example_2, example_3, example_4]
def build_mmlongbench_gpt4_prompt(line):
task_description = """
Given the question and analysis, you are tasked to extract answers with required formats from the free-form analysis.
- Your extracted answers should be one of the following formats: (1) Integer, (2) Float, (3) String and (4) List.
If you find the analysis the question can not be answered from the given documents, type "Not answerable".
Exception: If the analysis only tells you that it can not read/understand the images or documents,
type "Fail to answer".
- Please make your response as concise as possible. Also note that your response should be formatted as below:
```
Extracted answer: [answer]
Answer format: [answer format]
```
Please read the following example, then extract the answer from the model response
and type it at the end of the prompt.\n
"""
question = line['question']
prediction = str(line['prediction'])
prompt = task_description
examples = get_gpt4_ICE()
for example in examples:
prompt += example
prompt += '---\nQuestion:' + question + '\n'
prompt += 'Analysis: ' + prediction
return prompt
def anls_compute(groundtruth, prediction, threshold=0.5):
dist = levenshtein_distance(groundtruth, prediction)
length = max(len(groundtruth.upper()), len(prediction.upper()))
value = 0.0 if length == 0 else float(dist) / float(length)
anls = 1.0 - value
if anls <= threshold:
anls = 0.0
return anls
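# Illustrative sketch (assumes the imported levenshtein_distance is the plain edit
# distance): ANLS is 1 - dist / max(len(gt), len(pred)), and any value at or below
# the threshold (default 0.5) is clamped to 0.0.
def _example_anls():
    # A one-character slip keeps most of the credit; unrelated strings are clamped to 0.0.
    return anls_compute('2019', '2018'), anls_compute('table', 'chart')
    # -> (0.75, 0.0)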
def is_float_equal(reference, prediction, include_percentage: bool = False, is_close: bool = False) -> bool:
def get_precision(gt_ans: float) -> int:
precision = 3
if '.' in str(gt_ans):
precision = len(str(gt_ans).split('.')[-1])
return precision
reference = float(str(reference).strip().rstrip('%').strip())
try:
prediction = float(str(prediction).strip().rstrip('%').strip())
except:
return False
if include_percentage:
gt_result = [reference / 100, reference, reference * 100]
else:
gt_result = [reference]
for item in gt_result:
try:
if is_close:
if math.isclose(item, prediction, rel_tol=0.01):
return True
precision = max(min(get_precision(prediction), get_precision(item)), 2)
if round(prediction, precision) == round(item, precision):
return True
except Exception:
continue
return False
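# Illustrative sketch (hypothetical values): with include_percentage=True the reference
# also matches its /100 and *100 renderings, and is_close=True tolerates ~1% relative error.
def _example_is_float_equal():
    return (
        is_float_equal('12.5%', '0.125', include_percentage=True, is_close=True),  # True: 12.5 / 100 == 0.125
        is_float_equal('3.14', '2.71', include_percentage=True, is_close=True),    # False: outside tolerance
    )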
def get_clean_string(s):
    s = str(s).lower().strip()
    # strip unit suffixes before comparison
    if s.endswith('mile'):
        s = s.rstrip('mile').strip()
    if s.endswith('miles'):
        s = s.rstrip('miles').strip()
    if s.endswith('million'):
        s = s.rstrip('million').strip()
# remove parenthesis
s = re.sub(r'\s*\([^)]*\)', '', s).strip()
# remove quotes
s = re.sub(r"^['\"]|['\"]$", '', s).strip()
s = s.strip().lstrip('$').strip()
s = s.strip().rstrip('%').strip()
return s
def is_exact_match(s):
flag = False
# Website
if 'https://' in s:
flag = True
# code file
if s.endswith('.py') or s.endswith('ipynb'):
flag = True
if s.startswith('page'):
flag = True
# telephone number
if re.fullmatch(r'\b\d+(-\d+|\s\d+)?\b', s):
flag = True
# time
if 'a.m.' in s or 'p.m.' in s:
flag = True
# YYYY-MM-DD
if re.fullmatch(r'\b\d{4}[-\s]\d{2}[-\s]\d{2}\b', s):
flag = True
# YYYY-MM
if re.fullmatch(r'\b\d{4}[-\s]\d{2}\b', s):
flag = True
# Email address
if re.fullmatch(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', s):
flag = True
return flag
def isfloat(num):
try:
float(num)
return True
except ValueError:
return False
def get_font():
try:
truetype_url = "http://opencompass.openxlab.space/utils/Fonts/SimHei.ttf"
ff = urlopen(truetype_url)
font = ImageFont.truetype(ff, size=40)
except Exception as e:
logging.warning(f'{type(e)}: {e}')
logging.warning("Fail to download the font. Use the default one.")
font = ImageFont.load_default(size=40)
return font
def frame2img(img_path_list, font, save_path=None, idx_start=0):
imgs = [Image.open(img_path) for img_path in img_path_list]
new_imgs = []
for img in imgs:
w, h = img.size
scale = w / h
if w > h:
new_w = 560 * 2
new_h = int(560 * 2 / scale)
else:
new_w = int(560 * 2 * scale)
new_h = 560 * 2
img = transforms.functional.resize(img, [new_h, new_w],)
new_imgs.append(img)
imgs = new_imgs
new_w = 0
new_h = 0
pad = 40
if w > h:
for im in imgs:
w, h = im.size
new_w = max(new_w, w)
new_h += h + 10 + pad
new_img = Image.new("RGB", (new_w, new_h), "white")
draw = ImageDraw.Draw(new_img)
curr_h = 0
for idx, im in enumerate(imgs):
w, h = im.size
new_img.paste(im, (0, pad + curr_h))
draw.text((0, curr_h), f"<IMAGE {idx+idx_start}>", font=font, fill="black")
if idx + 1 < len(imgs):
draw.line([(0, pad + curr_h + h + 5), (new_w, pad + curr_h + h + 5)], fill='black', width=2)
curr_h += h + 10 + pad
else:
for im in imgs:
w, h = im.size
new_w += w + 10
new_h = max(new_h, h)
new_h += pad
new_img = Image.new('RGB', (new_w, new_h), 'white')
draw = ImageDraw.Draw(new_img)
curr_w = 0
for idx, im in enumerate(imgs):
w, h = im.size
new_img.paste(im, (curr_w, pad))
draw.text((curr_w, 0), f"<IMAGE {idx+idx_start}>", font=font, fill='black')
if idx + 1 < len(imgs):
draw.line([(curr_w + w + 5, 0), (curr_w + w + 5, new_h)], fill='black', width=2)
curr_w += w + 10
if save_path is not None:
new_img.save(save_path)
return new_img
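# concat_images (below) splits a long list of page images into batches and merges each
# batch into one composite: with column_num == -1 the batch is rendered by frame2img with
# "<IMAGE i>" labels (at most 20 pages per composite), otherwise pages are tiled into a
# grid with column_num columns.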
def concat_images(image_list, max_concat=1, column_num=1):
concatenated_images = []
if column_num == -1:
MAX_COLUMN_NUM = 20
max_concat = 1
while len(image_list) / max_concat > MAX_COLUMN_NUM:
max_concat += 1
interval = max(math.ceil(len(image_list) / max_concat), 1)
for i in range(0, len(image_list), interval):
batch_images = image_list[i:i + interval]
concatenated_image = frame2img(batch_images, font=get_font(), idx_start=i)
concatenated_images.append(concatenated_image)
else:
interval = max(math.ceil(len(image_list) / max_concat), 1)
for i in range(0, len(image_list), interval):
batch_images = [Image.open(filename) for filename in image_list[i:i + interval]]
if column_num == 1:
total_height = batch_images[0].height * len(batch_images)
else:
total_height = batch_images[0].height * ((len(batch_images) - 1) // column_num + 1)
concatenated_image = Image.new('RGB', (batch_images[0].width * column_num, total_height), 'white')
x_offset, y_offset = 0, 0
for count, image in enumerate(batch_images):
concatenated_image.paste(image, (x_offset, y_offset))
x_offset += image.width
if (count + 1) % column_num == 0:
y_offset += image.height
x_offset = 0
concatenated_images.append(concatenated_image)
return concatenated_images
def eval_score(gt, pred, answer_type):
if answer_type == 'Int':
try:
gt, pred = int(gt), int(float(pred))
except:
pred = ''
score = (gt == pred)
elif answer_type == 'Float':
try:
gt = float(get_clean_string(str(gt)))
pred = float(get_clean_string(str(pred)))
except:
pred = ''
score = is_float_equal(gt, pred, include_percentage=True, is_close=True)
elif answer_type == 'Str':
gt = get_clean_string(gt)
pred = get_clean_string(pred)
if is_exact_match(gt):
score = (gt == pred)
else:
score = anls_compute(gt, pred)
else:
if isinstance(gt, str) and gt.startswith('['):
gt = eval(gt)
if not isinstance(gt, list):
gt = [gt]
if isinstance(pred, str) and pred.startswith('['):
pred = eval(pred)
if not isinstance(pred, list):
pred = [pred]
print(len(gt), len(pred))
if len(gt) != len(pred):
score = 0.0
else:
gt = sorted([get_clean_string(a) for a in gt])
pred = sorted([get_clean_string(a) for a in pred])
print(gt, pred)
if isfloat(gt[0]) or is_exact_match(gt[0]):
score = ('-'.join(gt) == '-'.join(pred))
else:
score = min([anls_compute(gt_v, pred_v) for gt_v, pred_v in zip(gt, pred)])
return float(score)
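# Illustrative sketch (hypothetical values): eval_score dispatches on the answer format
# recorded in the dataset ('Int', 'Float', 'Str', or a list-valued answer).
def _example_eval_score():
    return (
        eval_score('10', '10 regulations', 'Int'),      # 0.0: the prediction is not a clean integer
        eval_score('12.5%', '0.125', 'Float'),          # 1.0: matched via the percentage-aware float check
        eval_score("['Paris', 'Lyon']", "['lyon', 'paris']", 'List'),  # 1.0: order-insensitive match
    )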
def MMLongBench_auxeval(model, line):
prompt = build_mmlongbench_gpt4_prompt(line)
log = ''
retry = 5
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
if FAIL_MSG in res:
log += f'Try {i}: output is {prediction}, failed to parse.\n'
else:
log += 'Succeed'
try:
pred = res.split('Answer format:')[0].split('Extracted answer:')[1].strip()
except:
pred = ''
return dict(log=log, res=res, pred=pred)
log += 'All 5 retries failed.\n'
return dict(log=log, res='', pred='')
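# get_f1 treats "Not answerable" as the negative class: recall averages the scores over
# questions whose ground-truth answer is answerable, precision averages the scores over
# questions the model claimed are answerable, and the result is their harmonic mean.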
def get_f1(data):
gt_pos_data = data[data.apply(lambda k: k['answer'] != 'Not answerable', axis=1)]
pred_pos_data = data[data.apply(lambda k: k['pred'] != 'Not answerable', axis=1)]
recall = sum(gt_pos_data['score'].tolist()) / len(gt_pos_data)
precision = sum(pred_pos_data['score'].tolist()) / len(pred_pos_data)
return 2 * recall * precision / (recall + precision)
def MMLongBench_acc(result_file):
data = load(result_file)
overall_score = 0.0
score_list = list()
for i in range(len(data)):
item = data.iloc[i]
try:
score = eval_score(item['answer'], item['pred'], item['answer_format'])
except:
score = 0.0
score_list.append(score)
overall_score += score
data['score'] = score_list
dump(data, result_file)
data_chart = data[data.apply(lambda k: 'Chart' in eval(k['evidence_sources']), axis=1)]
data_table = data[data.apply(lambda k: 'Table' in eval(k['evidence_sources']), axis=1)]
data_image = data[data.apply(lambda k: 'Figure' in eval(k['evidence_sources']), axis=1)]
data_text = data[data.apply(lambda k: 'Pure-text (Plain-text)' in eval(k['evidence_sources']), axis=1)]
data_layout = data[data.apply(lambda k: 'Generalized-text (Layout)' in eval(k['evidence_sources']), axis=1)]
data_single = data[data.apply(lambda k: len(eval(k['evidence_pages'])) == 1, axis=1)]
data_multi = data[data.apply(lambda k: len(eval(k['evidence_pages'])) > 1, axis=1)]
data_unans = data[data.apply(lambda k: len(eval(k['evidence_pages'])) == 0, axis=1)]
res = dict()
res['category'] = [
'overall_f1', 'overall_acc', 'text', 'layout', 'table', 'chart',
'image', 'single-page', 'multi-page', 'unanswerable'
]
res['num'] = [
len(data), len(data), len(data_text), len(data_layout), len(data_table),
len(data_chart), len(data_image), len(data_single), len(data_multi), len(data_unans)
]
res['avg_score'] = [
get_f1(data),
overall_score / len(data),
sum(data_text['score'].tolist()) / len(data_text) if len(data_text) > 0 else 0.0,
sum(data_layout['score'].tolist()) / len(data_layout) if len(data_layout) > 0 else 0.0,
sum(data_table['score'].tolist()) / len(data_table) if len(data_table) > 0 else 0.0,
sum(data_chart['score'].tolist()) / len(data_chart) if len(data_chart) > 0 else 0.0,
sum(data_image['score'].tolist()) / len(data_image) if len(data_image) > 0 else 0.0,
sum(data_single['score'].tolist()) / len(data_single) if len(data_single) > 0 else 0.0,
sum(data_multi['score'].tolist()) / len(data_multi) if len(data_multi) > 0 else 0.0,
sum(data_unans['score'].tolist()) / len(data_unans) if len(data_unans) > 0 else 0.0,
]
res = pd.DataFrame(res)
return res
class MMLongBench(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MMLongBench_DOC': 'https://opencompass.openxlab.space/utils/VLMEval/MMLongBench_DOC.tsv',
}
DATASET_MD5 = {
'MMLongBench_DOC': '9b393e1f4c52718380d50586197eac9b',
}
SUPPORTED_MODELS = {
'GPT4': (1, 1),
'GPT4V': (1, 1),
'GPT4V_HIGH': (1, 1),
'GPT4o': (1, 1),
'GPT4o_HIGH': (1, 1),
'GPT4o_MINI': (1, 1),
'MiniCPM-Llama3-V-2_5': (1, 5),
'InternVL-Chat-V1-5': (5, 2),
'XComposer2_4KHD': (1, 5),
'XComposer2d5': (1, -1),
}
def __init__(self, dataset, **kwargs):
self.model_list = list(self.SUPPORTED_MODELS.keys())
model_name = kwargs['model']
if not listinstr(self.model_list, model_name):
raise AssertionError("{} doesn't support the evaluation on MMLongBench_DOC.".format(model_name))
super(MMLongBench, self).__init__(dataset)
self.is_api = True if listinstr(['GPT4'], model_name) else False
self.max_pages = 120
concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
self.concat_num = concat_num
self.column_num = column_num
def dump_image(self, origin_line):
os.makedirs(self.img_root, exist_ok=True)
try:
import fitz
except Exception as e:
logging.critical(f'{type(e)}: {e}')
logging.critical('Please use `pip install pymupdf` to parse PDF files.')
line = origin_line.copy()
line['image_path'] = line['image_path'][:self.max_pages]
skip_pdf_parse = True
for im_name in line['image_path']:
path = osp.join(self.img_root, im_name)
if not read_ok(path):
skip_pdf_parse = False
break
        # Just for being compatible with the zipped loop below: zip(line['image'], line['image_path'])
if skip_pdf_parse:
line['image'] = line['image_path']
else:
pdf_data = base64.b64decode(line['image'])
pdf_file = io.BytesIO(pdf_data)
encoded_images = []
with fitz.open(stream=pdf_file, filetype='pdf') as doc:
doc = doc[:self.max_pages]
for page in doc:
image = page.get_pixmap(dpi=144)
image_file = io.BytesIO(image.tobytes(output='png'))
image = Image.open(image_file)
encoded_image = encode_image_to_base64(image)
encoded_images.append(encoded_image)
line['image'] = encoded_images
print('process {}'.format(line['doc_id']))
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(self.img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
if self.concat_num > 0 and not self.is_api:
concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)
old_tgt_path = tgt_path
assert isinstance(old_tgt_path, list)
if self.column_num != -1:
tgt_path = [
'_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
for i in range(len(concatenated_images))
]
else:
tgt_path = [
'_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all_{}.jpg'.format(i)
for i in range(len(concatenated_images))
]
for path, concatenated_image in zip(tgt_path, concatenated_images):
if not read_ok(path):
decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
num_images, image_size = len(old_tgt_path), concatenated_image.size
print('concat {} images to a new one with size {}. save at {}'.format(num_images, image_size, path))
return tgt_path
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
logger = get_logger('Evaluation')
model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
if osp.exists(storage):
logger.warning(f'GPT scoring file {storage} already exists, will reuse it in MMLongBench_eval. ')
else:
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = list()
for model, line in tqdm(tups):
res = MMLongBench_auxeval(model, line)
new_results.append(res)
log_map, res_map, pred_map = {}, {}, {}
all_inds = [line['index'] for line in lines]
for k, v in zip(all_inds, new_results):
log_map[k] = v['log']
res_map[k] = v['res']
pred_map[k] = v['pred']
data['res'] = [res_map[idx] for idx in data['index']]
data['log'] = [log_map[idx] for idx in data['index']]
data['pred'] = [pred_map[idx] for idx in data['index']]
dump(data, storage)
score = MMLongBench_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
dump(score, score_pth)
logger.info(f'MMLongBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
logger.info('Score: ')
logger.info(score)
import re
import json
import sympy as sp
import numpy as np
from sympy import simplify, Eq, sympify, Pow, pi
from sympy.parsing.latex import parse_latex
import sys
import math
import os
import argparse
from .image_base import ImageBaseDataset
from ..utils import track_progress_rich
from ..smp import load, dump
class AutoScoringJudge:
def __init__(self):
# Map of special symbols to their replacements
self.special_signal_map = {
"\\left": "",
"\\right": "",
"厘米":"",
# "∶": ":",
",": ",",
"$": "",
"(":"(",
")":")",
"\\infty":"oo",
"\\colon ":":",
# "\\approx": "=",
# "\\simeq": "=",
# "\\sim": "=",
# "^\\prime": "'",
# "^{\\prime}": "'",
"+":"+",
"\\, ": "",
"\\,":"",
"^\\circ": "",
"^{\\circ}": "",
# "%": "",
}
self.pi = parse_latex("\\pi")
# MM-Math default precision
self.precision = 1e-2
def trans_greater_sign_to_interval(self, expr:str):
expr_tmp = expr.split("<")
return "(" + expr_tmp[0] + ", " + expr_tmp[-1] + ")"
def split_by_comma(self, expr: str):
# Splits expressions by commas outside of brackets
in_bracket_num = 0
splitted_expr = []
start_idx = 0
for i, char in enumerate(expr):
if char in ["(", "["]:
in_bracket_num += 1
elif char in [")", "]"]:
in_bracket_num -= 1
elif char == "," and in_bracket_num == 0:
splitted_expr.append(expr[start_idx:i].strip())
start_idx = i + 1
if start_idx < len(expr):
splitted_expr.append(expr[start_idx:].strip())
return splitted_expr
def trans_plus_minus_sign(self, expr_list: list):
# Translates plus-minus signs into separate expressions
new_expr_list = []
for expr in expr_list:
if "\\pm" in expr:
new_expr_list.append(expr.replace("\\pm", "+"))
new_expr_list.append(expr.replace("\\pm", "-"))
else:
new_expr_list.append(expr)
return new_expr_list
def judge(self, expression1, expression2, precision=1e-2):
# Judge if two expressions are equal (expression1 is considered as the Ground Truth)
# Default precision is a list for supporting multiple expressions
precision = precision if isinstance(precision, list) else [precision]
try:
expression1, expression2 = self.preprocess(expression1, expression2)
except:
return False
if expression1 == expression2:
# print("Exactly equal")
return True
# Remove Chinese characters from the string, as answers like "yes" or "no" in Chinese have been considered
expression1 = expression1 if re.fullmatch(r"[\u4e00-\u9fff]+", expression1) else re.sub(r'[\u4e00-\u9fff]+', '', expression1) # noqa: E501
expression2 = expression2 if re.fullmatch(r'[\u4e00-\u9fff]+', expression2) else re.sub(r'[\u4e00-\u9fff]+', '', expression2) # noqa: E501
# Check if two < or > in expression
if self.is_two_greater_sign(expression1):
expression1 = self.trans_greater_sign_to_interval(expression1)
if self.is_two_greater_sign(expression2):
expression2 = self.trans_greater_sign_to_interval(expression2)
expression1 = self.split_by_comma(expression1)
expression2 = self.split_by_comma(expression2)
temp_list1 = self.trans_plus_minus_sign(expression1)
temp_list2 = self.trans_plus_minus_sign(expression2)
# Set up a list for allowed errors
if len(precision) <= 1:
precision = precision * len(temp_list1)
if len(temp_list1) != len(temp_list2):
return False
# Check if elements in both lists can be paired and are equal
idx = -1
while len(temp_list1) != 0:
idx = (idx + 1) % len(temp_list1)
item1 = temp_list1[idx]
self.precision = precision[idx]
for item2 in temp_list2:
if self.is_equal(item1, item2):
temp_list1.remove(item1)
temp_list2.remove(item2)
precision.remove(self.precision)
break
else:
# If no match was found, return False
return False
# If all elements are matched, return True
return True
def is_interval(self, expr):
# Checks if an expression is an interval
return expr.startswith(("(", "[")) and expr.endswith((")", "]"))
def is_two_greater_sign(self, expr):
match = re.findall(r'<', expr)
return len(match) == 2
def sympy_sub_pi(self, expression_sympy):
# Replaces the symbol for pi in sympy expressions with its numerical value
return expression_sympy.subs(self.pi, math.pi)
def is_equal(self, expression1, expression2):
# Default first expression is ground truth. Check if expressions are equal in different aspects
if expression1 == expression2 and expression1 != "" and expression2 != "":
# print("Equivalent natively")
return True
# First check if both are intervals
if self.is_interval(expression1) and self.is_interval(expression2):
try:
if self.interval_equal(expression1, expression2):
# print("Interval equivalent")
return True
except:
return False
# Then check for numerical equality
try:
if self.numerical_equal(expression1, expression2):
# print("Numerically equivalent")
return True
except:
pass
# Then check if expressions are mathematically equal
try:
if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2):
# print("Expression equivalent")
return True
except:
pass
# Lastly, check for equation equality
try:
if self.equation_equal(expression1, expression2):
# print("Equation equivalent")
return True
except:
pass
return False
def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True):
# Check if two numerical values are equal within an allowed error range
# Includes possible percentage cases
reference = float(expression1)
prediction = float(expression2)
if include_percentage:
gt_result = [reference / 100, reference, reference * 100]
else:
gt_result = [reference]
for item in gt_result:
if abs(item - prediction) <= self.precision * 1.01:
return True
return False
def expression_equal(self, exp1, exp2):
# Check if two expressions are mathematically equivalent
# Extract expression and use sympy for equivalence checking
def extract_expression(expression):
if "=" in expression:
expression = expression.split("=")[1]
return expression.strip()
exp1 = extract_expression(exp1)
exp2 = extract_expression(exp2)
exp_too_long = len(exp1) > 300 or len(exp2) > 300
expr1_sym = sympify(parse_latex(exp1))
expr2_sym = sympify(parse_latex(exp2))
if expr1_sym == expr2_sym:
return True
else:
expr1_sym = self.sympy_sub_pi(expr1_sym)
expr2_sym = self.sympy_sub_pi(expr2_sym)
if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or \
(not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)):
return False
elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol):
try:
if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)):
print("These two numbers cannot be calculated by the current computer for: "
f"\"{str(expr1_sym)}\" and \"{str(expr2_sym)}\"")
return False
if exp_too_long:
print(f'Expression {exp1} or {exp2} is too long to compute. ')
return False
if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01:
return True
else:
return False
except:
return False
elif exp_too_long:
print(f'Expression {exp1} or {exp2} is too long to compute. ')
return False
else:
try:
simplified_expr = simplify(expr1_sym - expr2_sym)
num_value = simplified_expr.evalf()
return abs(num_value) < 1e-3
except:
return False
def equation_equal(self, expression1, expression2):
# Check if two equations are mathematically equivalent
# Simplify equations and use sympy for equivalence checking
def simplify_equation(latex_eq):
lhs, rhs = latex_eq.split('=')
lhs_expr = parse_latex(lhs)
rhs_expr = parse_latex(rhs)
equation = Eq(lhs_expr, rhs_expr)
simplified_eq = simplify(equation.lhs - equation.rhs)
return simplified_eq
expr1_sym = simplify_equation(expression1)
expr2_sym = simplify_equation(expression2)
division_result_1 = simplify(expr1_sym / expr2_sym)
division_result_2 = simplify(expr2_sym / expr1_sym)
if ((division_result_1.is_Integer and division_result_1 != 0) or # noqa: W504
(division_result_2.is_Integer and division_result_2 != 0)):
return True
else:
return False
def interval_equal(self, expression1, expression2):
# Check if two intervals are mathematically equivalent
def compare_two_interval(inter1, inter2):
if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]:
return False
inter1 = inter1.strip('[]()')
inter2 = inter2.strip('[]()')
items_1 = inter1.split(',')
items_2 = inter2.split(',')
for item_1, item_2 in zip(items_1, items_2):
if not self.expression_equal(item_1, item_2):
return False
return True
interval1 = expression1
interval2 = expression2
if interval1 == interval2:
return True
else:
inter_list1 = interval1.split("\\cup")
inter_list2 = interval2.split("\\cup")
if len(inter_list1) != len(inter_list2):
return False
else:
for inter1, inter2 in zip(inter_list1, inter_list2):
if not compare_two_interval(inter1, inter2):
return False
return True
def preprocess(self, expression1, expression2):
# Preprocess expressions to extract and replace special symbols
def extract_boxed_content(latex_str):
boxed_matches = re.finditer(r'\\boxed{', latex_str)
results = ""
for match in boxed_matches:
start_index = match.end()
end_index = start_index
stack = 1
while stack > 0 and end_index < len(latex_str):
if latex_str[end_index] == '{':
stack += 1
elif latex_str[end_index] == '}':
stack -= 1
end_index += 1
if stack == 0:
content = latex_str[start_index:end_index - 1]
results += content + ","
else:
raise ValueError("Mismatched braces in LaTeX string.")
if results == "":
last_line_ans = latex_str.strip().split("\n")[-1]
dollar_pattern = r"\$(.*?)\$"
answers = re.findall(dollar_pattern, last_line_ans)
if answers:
for ans in answers:
results += ans + ","
else:
results = latex_str
return results
        def special_symbol_replace(expression):
expression = expression.replace("\\text{cm}^2", '').replace("\\text{cm}", "").replace("\\,cm", '').replace("\\text{ cm}", '').replace("cm", '').replace("\\text{分米}^2", '').replace("cm^{2}", '').replace("60 \\text{ cm}^2",'').replace("\\ \\text{m}", "").replace("\\text{米}","").strip() # noqa: E501
expression = re.sub(r"(.+)m$", r"\1", expression)
if "\\in " in expression:
expression = expression.split("\\in ")[1]
for signal in self.special_signal_map:
expression = expression.replace(signal, self.special_signal_map[signal])
expression = re.sub(r'(\\sin|\\cos|\\tan)(\d+)', r'\1((\2/180)\\pi)', expression)
expression = expression.strip("\n,.:;^_=+`!@#%^&*~,。")
pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}'
expression = re.sub(pattern, r'\1', expression)
return expression
exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2)
        exp1, exp2 = special_symbol_replace(exp1), special_symbol_replace(exp2)
return exp1, exp2
def can_compute_power(self, expr):
# Checks if a power expression can be computed
if isinstance(expr, Pow):
base, exp = expr.as_base_exp()
if base.is_number and exp.is_number:
MAX_EXP = 1000 # Adjust based on computing environment
if abs(exp.evalf()) > MAX_EXP:
return False
else:
return True
else:
return False
else:
return True # Not a power expression, can compute
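# Illustrative sketch (hypothetical answers; requires sympy's LaTeX parser and the antlr4
# runtime): AutoScoringJudge.judge extracts \boxed{...} content when present and accepts
# numerically or symbolically equivalent expressions within the default 1e-2 precision.
def _example_auto_scoring_judge():
    judge = AutoScoringJudge()
    return (
        judge.judge('\\boxed{\\frac{1}{2}}', '0.5'),  # True: 1/2 and 0.5 are equivalent
        judge.judge('\\boxed{\\frac{1}{2}}', '0.6'),  # False: differs by more than 1e-2
    )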
class MMMath(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MM-Math': 'https://opencompass.openxlab.space/utils/VLMEval/MM-Math.tsv',
}
DATASET_MD5 = {
'MM-Math': '1f064ed7c4e0e8926a3fa65849419ca5',
}
@classmethod
def evaluate(self, eval_file, **kwargs):
data = load(eval_file)
judger = AutoScoringJudge()
func = judger.judge
tups = [dict(expression1=x, expression2=y) for x, y in zip(data['answer'], data['prediction'])]
res = track_progress_rich(func, tups, nproc=16)
data['hit'] = res
dump(data, eval_file)
score_file = eval_file.replace('.xlsx', '_score.json')
score = {}
score['overall'] = np.mean(data['hit'])
# Results by Difficulty
difficulties = set(data['difficulty'])
for d in difficulties:
score[f'Difficulty-{d}'] = np.mean(data[data['difficulty'] == d]['hit'])
# Results by Year
years = set(data['year'])
for y in years:
score[f'Year-{y}'] = np.mean(data[data['year'] == y]['hit'])
# Results by Knowledge-L1
points = set(data['knowledge_l1'])
for p in points:
score[f'Knowledge-L1-{p}'] = np.mean(data[data['knowledge_l1'] == p]['hit'])
# Results by Knowledge-L2
points = set(data['knowledge_l2'])
for p in points:
score[f'Knowledge-L2-{p}'] = np.mean(data[data['knowledge_l2'] == p]['hit'])
dump(score, score_file)
return score