from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
import os
import json
import base64  # used by the multimodal() helper below
def multimodal(images, text, url, key, temperature=0, max_tokens=1024, history=[]):
if images:
pics = []
for image in images:
with open(image, 'rb') as f:
pic = base64.b64encode(f.read()).decode('utf-8')
pics.append(pic)
data = {'images': pics, 'text': text, 'key': key, 'temperature': temperature, 'max_new_tokens': max_tokens}
else:
data = {'text': text, 'key': key, 'temperature': temperature, 'max_new_tokens': max_tokens}
response = requests.post(url, json=data, headers={'Content-Type': 'application/json'})
response = json.loads(response.text)
return response
class BlueLMWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'BlueLM-V-v3.0',
retry: int = 5,
wait: int = 5,
verbose: bool = True,
temperature: float = 0.0,
system_prompt: str = None,
max_tokens: int = 1024,
key: str = None,
url: str = 'http://api-ai.vivo.com.cn/multimodal',
**kwargs):
self.model = model
self.fail_msg = 'Failed to obtain answer via BlueLM-V API. '
self.max_tokens = max_tokens
self.temperature = temperature
self.url = url
self.key = key
if self.key is None:
self.key = os.environ.get('BLUELM_V_API_KEY', None)
assert self.key is not None, (
'Please set the API Key (to obtain one, contact shuai.ren@vivo.com by email).'
)
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
def message_to_promptimg(self, message, dataset=None):
num_images = len([x for x in message if x['type'] == 'image'])
if num_images == 0:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = None
elif num_images == 1:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = [x['value'] for x in message if x['type'] == 'image']
else:
prompt = '\n'.join([x['value'] if x['type'] == 'text' else '<image>' for x in message])
if dataset == 'BLINK':
image = concat_images_vlmeval(
[x['value'] for x in message if x['type'] == 'image'],
target_size=512)
else:
image = [x['value'] for x in message if x['type'] == 'image']
if dataset in ['MMBench_DEV_EN_V11', 'MMBench_DEV_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11',
'AI2D_TEST', 'AI2D_TEST_TO_MASK', 'MMMU_DEV_VAL']:
prompt = prompt.replace('Please select the correct answer from the options above.',
'Answer with the option’s letter from the given choices directly.')
elif dataset in ['ChartQA_TEST']:
prompt = prompt.replace('Answer the question using a single word or phrase.',
'Answer the question using a single number or phrase.')
elif dataset in ['DocVQA_VAL', 'DocVQA_TEST', ]:
prompt = prompt.replace('Answer the question using a single word or phrase.',
'Give the short answer directly.')
elif dataset in ['TextVQA_VAL']:
prompt = prompt.replace('Answer the question using a single word or phrase.',
'When the provided information is insufficient, respond with ’Unanswerable’. '
'Answer the question using a single word or phrase.')
elif dataset in ['MTVQA_TEST']:
prompt = prompt.replace('\nAnswer the question using a word or phrase in the language of the question.', '')
elif dataset in ['MathVista_MINI']:
if 'Choices:' in prompt:
prompt = prompt.replace('Choices:', 'Options:').replace('Hint:', 'Context:')
for i in range(1, 7): # replace A ~ F
prompt = prompt.replace(f'({chr(64 + i)})', f'{chr(64 + i)}.')
prompt += '\nAnswer with the option’s letter from the given choices directly.'
else:
prompt += '\nAnswer the question using a single word or phrase.'
return prompt, image
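# Illustrative example (added for reference, not from the original source): for a multi-image message such as
#   [dict(type='image', value='a.jpg'), dict(type='image', value='b.jpg'),
#    dict(type='text', value='Which image is brighter?')]
# the multi-image branch above yields prompt = '<image>\n<image>\nWhich image is brighter?' and, unless the
# dataset is BLINK (where the images are concatenated into one), image = ['a.jpg', 'b.jpg'].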
def generate_inner(self, inputs, **kwargs) -> str:
assert isinstance(inputs, str) or isinstance(inputs, list)
pure_text = np.all([x['type'] == 'text' for x in inputs])
assert not pure_text
prompt, image_path = self.message_to_promptimg(inputs, kwargs['dataset'])
try:
response = multimodal(image_path, prompt, self.url, self.key, self.temperature, self.max_tokens)
answer = response['result']
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(err)
self.logger.error(f'The input messages are {inputs}.')
return -1, '', ''
class BlueLM_V_API(BlueLMWrapper):
def generate(self, message, dataset=None):
return super(BlueLM_V_API, self).generate(message, dataset=dataset)
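# A minimal usage sketch (assumes BLUELM_V_API_KEY is exported, the endpoint above is reachable,
# and 'demo.jpg' is a placeholder image path):
#   model = BlueLM_V_API()
#   message = [dict(type='image', value='demo.jpg'),
#              dict(type='text', value='Describe the image.')]
#   answer = model.generate(message, dataset='MMBench_DEV_EN_V11')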
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from time import sleep
import base64
import mimetypes
from PIL import Image
url = 'https://openxlab.org.cn/gw/alles-apin-hub/v1/claude/v1/text/chat'
headers = {
'alles-apin-token': '',
'Content-Type': 'application/json'
}
class Claude_Wrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'claude-3-opus-20240229',
key: str = None,
retry: int = 10,
wait: int = 3,
system_prompt: str = None,
verbose: bool = True,
temperature: float = 0,
max_tokens: int = 1024,
**kwargs):
self.model = model
self.headers = headers
self.temperature = temperature
self.max_tokens = max_tokens
if key is not None:
self.key = key
else:
self.key = os.environ.get('ALLES', '')
self.headers['alles-apin-token'] = self.key
super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)
# inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
# content can be a string or a list of image & text
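# Illustrative shapes (added for reference, values are placeholders):
#   single turn, interleaved: [dict(type='text', value='Describe this.'), dict(type='image', value='cat.jpg')]
#   multi-turn, role-tagged:  [dict(role='user', content=[...]), dict(role='assistant', content=[...]),
#                              dict(role='user', content=[...])]  # the last turn must come from 'user'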
def prepare_itlist(self, inputs):
assert np.all([isinstance(x, dict) for x in inputs])
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text' and msg['value'] != '':
content_list.append(dict(type='text', text=msg['value']))
elif msg['type'] == 'image':
pth = msg['value']
suffix = osp.splitext(pth)[-1].lower()
media_type = mimetypes.types_map.get(suffix, None)
assert media_type is not None
content_list.append(dict(
type='image',
source={
'type': 'base64',
'media_type': media_type,
'data': encode_image_file_to_base64(pth, target_size=4096)
}))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
content_list = [dict(type='text', text=text)]
return content_list
def prepare_inputs(self, inputs):
input_msgs = []
assert isinstance(inputs, list) and isinstance(inputs[0], dict)
assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
if 'role' in inputs[0]:
assert inputs[-1]['role'] == 'user', inputs[-1]
for item in inputs:
input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
else:
input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
return input_msgs
def generate_inner(self, inputs, **kwargs) -> str:
payload = json.dumps({
'model': self.model,
'max_tokens': self.max_tokens,
'messages': self.prepare_inputs(inputs),
'system': self.system_prompt,
**kwargs
})
response = requests.request('POST', url, headers=headers, data=payload)
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
try:
resp_struct = json.loads(response.text)
answer = resp_struct['data']['content'][0]['text'].strip()
except:
pass
return ret_code, answer, response
class Claude3V(Claude_Wrapper):
def generate(self, message, dataset=None):
return super(Claude_Wrapper, self).generate(message)
from ..smp import *
import os
from .base import BaseAPI
class CWWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'cw-congrong-v1.5',
retry: int = 10,
wait: int = 5,
key: str = None,
verbose: bool = True,
system_prompt: str = None,
temperature: float = 0,
timeout: int = 600,
api_base: str = 'http://cwapi-vlm01.cw_rb.azurebot.tk/v1/chat/completions',
max_tokens: int = 1024,
img_size: int = 512,
img_detail: str = 'low',
**kwargs):
self.model = model
self.cur_idx = 0
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
base = os.environ.get('CW_API_BASE', None)
self.api_base = base if base is not None else api_base
env_key = os.environ.get('CW_API_KEY', None)
self.key = env_key if env_key is not None else key
assert self.key is not None, 'API key not provided. Please set CW_API_KEY environment variable or \
pass it to the constructor.'
assert img_size > 0 or img_size == -1
self.img_size = -1  # always send the full-size image
assert img_detail in ['high', 'low']
self.img_detail = img_detail
self.vision = True
self.timeout = timeout
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
# inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
# content can be a string or a list of image & text
def prepare_inputs(self, inputs):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(role='system', content=self.system_prompt))
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text':
content_list.append(dict(type='text', text=msg['value']))
elif msg['type'] == 'image':
from PIL import Image
img = Image.open(msg['value'])
b64 = encode_image_to_base64(img, target_size=self.img_size)
img_struct = dict(url=f'data:image/jpeg;base64,{b64}', detail=self.img_detail)
content_list.append(dict(type='image_url', image_url=img_struct))
input_msgs.append(dict(role='user', content=content_list))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
input_msgs.append(dict(role='user', content=text))
return input_msgs
def generate_inner(self, inputs, **kwargs) -> str:
input_msgs = self.prepare_inputs(inputs)
temperature = kwargs.pop('temperature', self.temperature)
max_tokens = kwargs.pop('max_tokens', self.max_tokens)
if 0 < max_tokens <= 100:
self.logger.warning(
'Less than 100 tokens left, '
'may exceed the context window with some additional meta symbols. '
)
if max_tokens <= 0:
return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. '
headers = {'Content-Type': 'application/json', 'Authorization': f'{self.key}'}
payload = dict(
model=self.model,
messages=input_msgs,
max_tokens=max_tokens,
n=1,
temperature=temperature,
**kwargs)
response = requests.post(self.api_base, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
try:
resp_struct = json.loads(response.text)
answer = resp_struct['choices'][0]['message']['content'].strip()
except:
pass
return ret_code, answer, response
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
headers = 'Content-Type: application/json'
class GeminiWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'gemini-1.0-pro',
retry: int = 5,
wait: int = 5,
key: str = None,
verbose: bool = True,
temperature: float = 0.0,
system_prompt: str = None,
max_tokens: int = 1024,
proxy: str = None,
backend='genai',
project_id='vlmeval',
**kwargs):
assert model in ['gemini-1.0-pro', 'gemini-1.5-pro', 'gemini-1.5-flash']
self.model = model
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
if key is None:
key = os.environ.get('GOOGLE_API_KEY', None)
# Try to load backend from environment variable
be = os.environ.get('GOOGLE_API_BACKEND', None)
if be is not None and be in ['genai', 'vertex']:
backend = be
assert backend in ['genai', 'vertex']
if backend == 'genai':
# We have not evaluated Gemini-1.5 w. GenAI backend
assert key is not None # Vertex does not require API Key
self.backend = backend
self.project_id = project_id
self.api_key = key
if proxy is not None:
proxy_set(proxy)
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
def build_msgs_genai(self, inputs):
messages = [] if self.system_prompt is None else [self.system_prompt]
for inp in inputs:
if inp['type'] == 'text':
messages.append(inp['value'])
elif inp['type'] == 'image':
messages.append(Image.open(inp['value']))
return messages
def build_msgs_vertex(self, inputs):
from vertexai.generative_models import Part, Image
messages = [] if self.system_prompt is None else [self.system_prompt]
for inp in inputs:
if inp['type'] == 'text':
messages.append(inp['value'])
elif inp['type'] == 'image':
messages.append(Part.from_image(Image.load_from_file(inp['value'])))
return messages
def generate_inner(self, inputs, **kwargs) -> str:
if self.backend == 'genai':
import google.generativeai as genai
assert isinstance(inputs, list)
pure_text = np.all([x['type'] == 'text' for x in inputs])
genai.configure(api_key=self.api_key)
if pure_text and self.model == 'gemini-1.0-pro':
model = genai.GenerativeModel('gemini-1.0-pro')
else:
assert self.model in ['gemini-1.5-pro', 'gemini-1.5-flash']
model = genai.GenerativeModel(self.model)
messages = self.build_msgs_genai(inputs)
gen_config = dict(max_output_tokens=self.max_tokens, temperature=self.temperature)
gen_config.update(kwargs)
try:
answer = model.generate_content(
messages,
generation_config=genai.types.GenerationConfig(**gen_config)).text
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(err)
self.logger.error(f'The input messages are {inputs}.')
return -1, '', ''
elif self.backend == 'vertex':
import vertexai
from vertexai.generative_models import GenerativeModel
vertexai.init(project=self.project_id, location='us-central1')
model_name = 'gemini-1.0-pro-vision' if self.model == 'gemini-1.0-pro' else self.model
model = GenerativeModel(model_name=model_name)
messages = self.build_msgs_vertex(inputs)
try:
resp = model.generate_content(messages)
answer = resp.text
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(err)
self.logger.error(f'The input messages are {inputs}.')
return -1, '', ''
class GeminiProVision(GeminiWrapper):
def generate(self, message, dataset=None):
return super(GeminiProVision, self).generate(message)
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from vlmeval.dataset import DATASET_TYPE
from vlmeval.smp.vlm import encode_image_file_to_base64
class GLMVisionWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str,
retry: int = 5,
wait: int = 5,
key: str = None,
verbose: bool = True,
system_prompt: str = None,
max_tokens: int = 1024,
proxy: str = None,
**kwargs):
self.model = model
self.fail_msg = 'Failed to obtain answer via API. '
self.default_params = {
'top_p': 0.6,
'top_k': 2,
'temperature': 0.8,
'repetition_penalty': 1.1,
'best_of': 1,
'do_sample': True,
'stream': False,
'max_tokens': max_tokens
}
if key is None:
key = os.environ.get('GLMV_API_KEY', None)
assert key is not None, (
'Please set the API Key (obtain it here: '
'https://open.bigmodel.cn/dev/howuse/introduction)'
)
self.key = key
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
def image_to_base64(self, image_path):
import base64
with open(image_path, 'rb') as image_file:
encoded_string = base64.b64encode(image_file.read())
return encoded_string.decode('utf-8')
def build_msgs(self, msgs_raw, system_prompt=None, dataset=None):
msgs = cp.deepcopy(msgs_raw)
content = []
text = ''
for i, msg in enumerate(msgs):
if msg['type'] == 'text':
text += msg['value']
elif msg['type'] == 'image':
content.append(dict(type='image_url', image_url=dict(url=encode_image_file_to_base64(msg['value']))))
if dataset is not None and DATASET_TYPE(dataset) in ['MCQ', 'Y/N']:
text += '\nShort Answer.'
content.append(dict(type='text', text=text))
ret = [dict(role='user', content=content)]
return ret
def generate_inner(self, inputs, **kwargs) -> str:
assert isinstance(inputs, str) or isinstance(inputs, list)
inputs = [inputs] if isinstance(inputs, str) else inputs
messages = self.build_msgs(msgs_raw=inputs, dataset=kwargs.get('dataset', None))
url = 'https://api.chatglm.cn/v1/chat/completions'
headers = {
'Content-Type': 'application/json',
'Request-Id': 'remote-test',
'Authorization': f'Bearer {self.key}'
}
payload = {
'model': self.model,
'messages': messages,
**self.default_params
}
response = requests.post(url, headers=headers, data=json.dumps(payload), verify=False)
output = []
try:
assert response.status_code == 200
for line in response.iter_lines():
data = json.loads(line.decode('utf-8').lstrip('data: '))
output.append(data['choices'][0]['message']['content'])
answer = ''.join(output).replace('</s>', '')
if self.verbose:
self.logger.info(f'inputs: {inputs}\nanswer: {answer}')
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(err)
self.logger.error(f'The input messages are {inputs}.')
return -1, self.fail_msg, ''
class GLMVisionAPI(GLMVisionWrapper):
def generate(self, message, dataset=None):
return super(GLMVisionAPI, self).generate(message, dataset=dataset)
from ..smp import *
import os
import sys
from .base import BaseAPI
APIBASES = {
'OFFICIAL': 'https://api.openai.com/v1/chat/completions',
}
def GPT_context_window(model):
length_map = {
'gpt-4': 8192,
'gpt-4-0613': 8192,
'gpt-4-turbo-preview': 128000,
'gpt-4-1106-preview': 128000,
'gpt-4-0125-preview': 128000,
'gpt-4-vision-preview': 128000,
'gpt-4-turbo': 128000,
'gpt-4-turbo-2024-04-09': 128000,
'gpt-3.5-turbo': 16385,
'gpt-3.5-turbo-0125': 16385,
'gpt-3.5-turbo-1106': 16385,
'gpt-3.5-turbo-instruct': 4096,
}
if model in length_map:
return length_map[model]
else:
return 128000
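# For reference: GPT_context_window('gpt-4-0613') -> 8192 and GPT_context_window('gpt-4-turbo') -> 128000;
# any model name missing from the table falls back to 128000.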
class OpenAIWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'gpt-3.5-turbo-0613',
retry: int = 5,
wait: int = 5,
key: str = None,
verbose: bool = True,
system_prompt: str = None,
temperature: float = 0,
timeout: int = 60,
api_base: str = None,
max_tokens: int = 1024,
img_size: int = 512,
img_detail: str = 'low',
use_azure: bool = False,
**kwargs):
self.model = model
self.cur_idx = 0
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
self.use_azure = use_azure
if 'step-1v' in model:
env_key = os.environ.get('STEPAI_API_KEY', '')
if key is None:
key = env_key
elif 'yi-vision' in model:
env_key = os.environ.get('YI_API_KEY', '')
if key is None:
key = env_key
else:
if use_azure:
env_key = os.environ.get('AZURE_OPENAI_API_KEY', None)
assert env_key is not None, 'Please set the environment variable AZURE_OPENAI_API_KEY. '
if key is None:
key = env_key
assert isinstance(key, str), (
'Please set the environment variable AZURE_OPENAI_API_KEY to your openai key. '
)
else:
env_key = os.environ.get('OPENAI_API_KEY', '')
if key is None:
key = env_key
assert isinstance(key, str) and key.startswith('sk-'), (
f'Illegal openai_key {key}. '
'Please set the environment variable OPENAI_API_KEY to your openai key. '
)
self.key = key
assert img_size > 0 or img_size == -1
self.img_size = img_size
assert img_detail in ['high', 'low']
self.img_detail = img_detail
self.timeout = timeout
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
if use_azure:
api_base_template = (
'{endpoint}openai/deployments/{deployment_name}/chat/completions?api-version={api_version}'
)
endpoint = os.getenv('AZURE_OPENAI_ENDPOINT', None)
assert endpoint is not None, 'Please set the environment variable AZURE_OPENAI_ENDPOINT. '
deployment_name = os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME', None)
assert deployment_name is not None, 'Please set the environment variable AZURE_OPENAI_DEPLOYMENT_NAME. '
api_version = os.getenv('OPENAI_API_VERSION', None)
assert api_version is not None, 'Please set the environment variable OPENAI_API_VERSION. '
self.api_base = api_base_template.format(
endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
deployment_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME'),
api_version=os.getenv('OPENAI_API_VERSION')
)
else:
if api_base is None:
if 'OPENAI_API_BASE' in os.environ and os.environ['OPENAI_API_BASE'] != '':
self.logger.info('Environment variable OPENAI_API_BASE is set. Will use it as api_base. ')
api_base = os.environ['OPENAI_API_BASE']
else:
api_base = 'OFFICIAL'
assert api_base is not None
if api_base in APIBASES:
self.api_base = APIBASES[api_base]
elif api_base.startswith('http'):
self.api_base = api_base
else:
self.logger.error('Unknown API Base. ')
sys.exit(-1)
self.logger.info(f'Using API Base: {self.api_base}; API Key: {self.key}')
# inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
# content can be a string or a list of image & text
def prepare_itlist(self, inputs):
assert np.all([isinstance(x, dict) for x in inputs])
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text':
content_list.append(dict(type='text', text=msg['value']))
elif msg['type'] == 'image':
from PIL import Image
img = Image.open(msg['value'])
b64 = encode_image_to_base64(img, target_size=self.img_size)
img_struct = dict(url=f'data:image/jpeg;base64,{b64}', detail=self.img_detail)
content_list.append(dict(type='image_url', image_url=img_struct))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
content_list = [dict(type='text', text=text)]
return content_list
def prepare_inputs(self, inputs):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(role='system', content=self.system_prompt))
assert isinstance(inputs, list) and isinstance(inputs[0], dict)
assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
if 'role' in inputs[0]:
assert inputs[-1]['role'] == 'user', inputs[-1]
for item in inputs:
input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
else:
input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
return input_msgs
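# Illustrative result (values are placeholders): for a single-turn text+image input, prepare_inputs
# produces an OpenAI-style message list such as
#   [{'role': 'user', 'content': [
#       {'type': 'text', 'text': 'Describe the image.'},
#       {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,<...>', 'detail': 'low'}}]}]
# with an optional leading {'role': 'system', 'content': ...} entry when system_prompt is set.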
def generate_inner(self, inputs, **kwargs) -> str:
input_msgs = self.prepare_inputs(inputs)
temperature = kwargs.pop('temperature', self.temperature)
max_tokens = kwargs.pop('max_tokens', self.max_tokens)
context_window = GPT_context_window(self.model)
max_tokens = min(max_tokens, context_window - self.get_token_len(inputs))
if 0 < max_tokens <= 100:
self.logger.warning(
'Less than 100 tokens left, '
'may exceed the context window with some additional meta symbols. '
)
if max_tokens <= 0:
return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. '
# For Azure we send the raw HTTP request directly; unclear how to route this through the openai client
if self.use_azure:
headers = {'Content-Type': 'application/json', 'api-key': self.key}
else:
headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'}
payload = dict(
model=self.model,
messages=input_msgs,
max_tokens=max_tokens,
n=1,
temperature=temperature,
**kwargs)
response = requests.post(
self.api_base,
headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
try:
resp_struct = json.loads(response.text)
answer = resp_struct['choices'][0]['message']['content'].strip()
except:
pass
return ret_code, answer, response
def get_image_token_len(self, img_path, detail='low'):
import math
if detail == 'low':
return 85
im = Image.open(img_path)
width, height = im.size  # PIL's Image.size is (width, height)
if width > 1024 or height > 1024:
if width > height:
height = int(height * 1024 / width)
width = 1024
else:
width = int(width * 1024 / height)
height = 1024
h = math.ceil(height / 512)
w = math.ceil(width / 512)
total = 85 + 170 * h * w
return total
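# Worked example of the estimate above: a 2048x1024 image in 'high' detail is first scaled so its longer
# side is 1024 (-> 1024x512), giving ceil(512/512) * ceil(1024/512) = 1 * 2 tiles and an estimated
# 85 + 170 * 2 = 425 tokens; any image in 'low' detail costs a flat 85 tokens.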
def get_token_len(self, inputs) -> int:
import tiktoken
try:
enc = tiktoken.encoding_for_model(self.model)
except:
enc = tiktoken.encoding_for_model('gpt-4')
assert isinstance(inputs, list)
tot = 0
for item in inputs:
if 'role' in item:
tot += self.get_token_len(item['content'])
elif item['type'] == 'text':
tot += len(enc.encode(item['value']))
elif item['type'] == 'image':
tot += self.get_image_token_len(item['value'], detail=self.img_detail)
return tot
class GPT4V(OpenAIWrapper):
def generate(self, message, dataset=None):
return super(GPT4V, self).generate(message)
import os
import sys
import os.path as osp
import torch
from ..smp import *
def get_gpu_num(model_name):
model_name = model_name.lower()
kws = {
8: ['65b', '70b'],
4: ['30b', '33b', '35b', '40b'],
2: ['13b', '14b', '20b'],
1: ['6b', '7b', 'moss'],
}
for k in [8, 4, 2, 1]:
for keyword in kws[k]:
if keyword in model_name:
return k
return 8
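# For reference: get_gpu_num('Qwen/Qwen-14B-Chat') -> 2 and get_gpu_num('lmsys/vicuna-7b-v1.5') -> 1,
# while names without a recognized size keyword fall back to 8 GPUs.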
validated_llms = [
'internlm/internlm-chat-7b', 'internlm/internlm-chat-7b-8k', 'internlm/internlm-chat-20b',
'Qwen/Qwen-7B-Chat', 'Qwen/Qwen-14B-Chat',
'THUDM/chatglm2-6b', 'THUDM/chatglm2-6b-32k', 'THUDM/chatglm3-6b', 'THUDM/chatglm3-6b-32k',
'baichuan-inc/Baichuan2-7B-Chat', 'baichuan-inc/Baichuan2-13B-Chat',
'lmsys/vicuna-7b-v1.5', 'lmsys/vicuna-13b-v1.5',
'meta-llama/Llama-2-7b-chat-hf'
]
Auto_model = ['chatglm']
class HFChatModel:
def _get_context_length(self, model, model_path):
# By default, we use model.config.seq_length
model_path = model_path.lower()
if 'baichuan' in model_path:
context_window = model.config.model_max_length
elif 'internlm' in model_path or 'llama' in model_path:
context_window = model.config.max_position_embeddings
elif 'vicuna' in model_path:
context_window = model.generation_config.max_length
else:
# chatglm & qwen
context_window = model.config.seq_length
return context_window
def _get_context_length_robust(self, model, model_path):
try:
context_window = self._get_context_length(model, model_path)
return context_window
except:
self.logger.critical(
'Failed to extract context_window information from config / generation_config. '
'Please read the above code and check whether the logic works for your model path. '
)
raise NotImplementedError
def __init__(self,
model_path,
system_prompt: str = None,
**kwargs):
self.logger = get_logger('HFChatModel')
if 'vicuna' in model_path.lower():
try:
from fastchat.model import get_conversation_template
except:
self.logger.critical('Please install fastchat first to use vicuna. ')
sys.exit(-1)
self.explicit_device = kwargs.pop('device', None)
if self.explicit_device is None:
# If CUDA_VISIBLE_DEVICES is not properly set
if 'CUDA_VISIBLE_DEVICES' not in os.environ or os.environ['CUDA_VISIBLE_DEVICES'] == '0,1,2,3,4,5,6,7':
num_gpu = get_gpu_num(model_path)
gpu_offset = kwargs.pop('gpu_offset', 0)
cuda_visible_devices = ','.join([str(i) for i in range(gpu_offset, gpu_offset + num_gpu)])
os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
from transformers.generation import GenerationConfig
if model_path not in validated_llms:
self.logger.warning(f'{model_path} not in validated LLMs, may have inference troubles. ')
self.model_path = model_path
if listinstr(Auto_model, model_path):
LoadModel = AutoModel
else:
LoadModel = AutoModelForCausalLM
assert osp.exists(model_path) or len(model_path.split('/')) == 2
device = self.explicit_device if self.explicit_device else 'auto'
precision = {}
if 'internlm-chat-7b' in model_path:
precision = {'torch_dtype': torch.float16}
elif 'internlm-chat-20b' in model_path:
precision = {'torch_dtype': torch.bfloat16}
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = LoadModel.from_pretrained(model_path, trust_remote_code=True, device_map='cpu', **precision)
model = model.eval()
if device != 'cpu':
model = model.to(f'cuda:{device}' if isinstance(device, int) else 'cuda')
try:
model.generation_config = GenerationConfig.from_pretrained(
model_path, trust_remote_code=True, device_map=device)
except:
pass
torch.cuda.empty_cache()
self.model = model
self.context_length = self._get_context_length_robust(model=model, model_path=model_path)
self.answer_buffer = 192
self.system_prompt = system_prompt
for k, v in kwargs.items():
self.logger.info(f'Following args will be used for generation (If not set specifically), {k}: {v}. ')
self.kwargs = kwargs
def generate_str(self, input, **kwargs):
if 'baichuan' in self.model_path.lower():
messages = []
messages.append({'role': 'user', 'content': input})
resp = self.model.chat(self.tokenizer, messages, **kwargs)
elif 'vicuna' in self.model_path.lower():
from fastchat.model import get_conversation_template
conv = get_conversation_template('vicuna')
conv.append_message(conv.roles[0], input)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
inputs = self.tokenizer([prompt], return_tensors='pt')
if torch.cuda.is_available():
for k in inputs:
inputs[k] = inputs[k].cuda()
params = dict(do_sample=True, temperature=0.7, repetition_penalty=1.0, max_new_tokens=512)
params.update(self.kwargs)
params.update(kwargs)
outputs = self.model.generate(**inputs, **params)
resp = self.tokenizer.decode(
outputs[0][len(inputs['input_ids'][0]):],
skip_special_tokens=True,
spaces_between_special_tokens=False)
else:
params = self.kwargs.copy()  # copy so per-call kwargs do not leak into self.kwargs
params.update(kwargs)
resp, _ = self.model.chat(self.tokenizer, input, history=[], **params)
return resp
def length_ok(self, inputs):
tot = len(self.tokenizer.encode(self.system_prompt)) if self.system_prompt is not None else 0
for s in inputs:
tot += len(self.tokenizer.encode(s))
return tot + self.answer_buffer < self.context_length
def generate_list(self, full_inputs, offset=0, **kwargs):
assert isinstance(full_inputs, list)
inputs = full_inputs[offset:]
if not self.length_ok(inputs):
return self.generate_list(full_inputs, offset + 1, **kwargs)  # drop the earliest turn and retry
model_path = self.model_path.lower()
if sum([x in model_path for x in ['baichuan']]):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(role='user', content=self.system_prompt))
if len(inputs):
assert isinstance(inputs, list) and isinstance(inputs[0], str)
roles = ['user', 'assistant'] if len(inputs) % 2 == 1 else ['assistant', 'user']
roles = roles * len(inputs)
for role, msg in zip(roles, inputs):
input_msgs.append(dict(role=role, content=msg))
response = self.model.chat(self.tokenizer, input_msgs)
elif sum([x in model_path for x in ['vicuna']]):
from fastchat.model import get_conversation_template
conv = get_conversation_template('vicuna')
assert isinstance(inputs, list) and isinstance(inputs[0], str)
if len(inputs) % 2 == 1:
if self.system_prompt is not None:
conv.append_message(conv.roles[0], self.system_prompt)
for i in range(len(inputs) // 2):
conv.append_message(conv.roles[0], inputs[2 * i])
conv.append_message(conv.roles[1], inputs[2 * i + 1])
else:
assert self.system_prompt is not None
conv.append_message(conv.roles[0], self.system_prompt)
conv.append_message(conv.roles[1], inputs[0])
for i in range(len(inputs) // 2 - 1):
conv.append_message(conv.roles[0], inputs[2 * i + 1])
conv.append_message(conv.roles[1], inputs[2 * i + 2])
conv.append_message(conv.roles[0], inputs[-1])
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
inputs = self.tokenizer([prompt], return_tensors='pt')
if torch.cuda.is_available():
for k in inputs:
inputs[k] = inputs[k].cuda()
params = dict(do_sample=True, temperature=0.7, repetition_penalty=1.0, max_new_tokens=512)
params.update(self.kwargs)
params.update(kwargs)
outputs = self.model.generate(**inputs, **params)
response = self.tokenizer.decode(
outputs[0][len(inputs['input_ids'][0]):],
skip_special_tokens=True,
spaces_between_special_tokens=False)
response = response.lstrip('\n')
else:
# The default option, support internlm, chatglm, qwen
history, msg = [], None
if len(inputs) % 2 == 1:
if self.system_prompt is not None:
history = [(self.system_prompt, '')]
for i in range(len(inputs) // 2):
history.append((inputs[2 * i], inputs[2 * i + 1]))
else:
assert self.system_prompt is not None
history = [(self.system_prompt, inputs[0])]
for i in range(len(inputs) // 2 - 1):
history.append((inputs[2 * i + 1], inputs[2 * i + 2]))
msg = inputs[-1]
params = self.kwargs.copy()  # copy so per-call kwargs do not leak into self.kwargs
params.update(kwargs)
response, _ = self.model.chat(self.tokenizer, msg, history=history, **params)
return response, offset
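# Note (added for clarity, not in the original): generate_list truncates turns from the front whenever the
# conversation no longer fits the context window, and returns both the response and the offset it ended up
# using, e.g.:
#   response, used_offset = model.generate_list(['Q1', 'A1', 'Q2'])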
def generate(self, inputs, **kwargs):
if isinstance(inputs, str):
return self.generate_str(inputs, **kwargs)
elif isinstance(inputs, list):
return self.generate_list(inputs, **kwargs)
from vlmeval.smp import *
import os
import sys
from vlmeval.api.base import BaseAPI
class HunyuanWrapper(BaseAPI):
is_api: bool = True
_apiVersion = '2023-09-01'
_service = 'hunyuan'
def __init__(self,
model: str = 'hunyuan-vision',
retry: int = 5,
wait: int = 5,
secret_key: str = None,
secret_id: str = None,
verbose: bool = True,
system_prompt: str = None,
temperature: float = 0,
timeout: int = 60,
api_base: str = 'hunyuan.tencentcloudapi.com',
**kwargs):
self.model = model
self.cur_idx = 0
self.fail_msg = 'Failed to obtain answer via API. '
self.temperature = temperature
warnings.warn('You may need to set the env variable HUNYUAN_SECRET_ID & HUNYUAN_SECRET_KEY to use Hunyuan. ')
secret_key = os.environ.get('HUNYUAN_SECRET_KEY', secret_key)
assert secret_key is not None, 'Please set the environment variable HUNYUAN_SECRET_KEY. '
secret_id = os.environ.get('HUNYUAN_SECRET_ID', secret_id)
assert secret_id is not None, 'Please set the environment variable HUNYUAN_SECRET_ID. '
self.model = model
self.endpoint = api_base
self.secret_id = secret_id
self.secret_key = secret_key
self.timeout = timeout
try:
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.hunyuan.v20230901 import hunyuan_client
except ImportError:
warnings.warn('Please install tencentcloud-sdk-python to use Hunyuan API. ')
exit(-1)
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
cred = credential.Credential(self.secret_id, self.secret_key)
httpProfile = HttpProfile()
httpProfile.endpoint = self.endpoint
clientProfile = ClientProfile()
clientProfile.httpProfile = httpProfile
self.client = hunyuan_client.HunyuanClient(cred, 'ap-beijing', clientProfile)
self.logger.info(
f'Using Endpoint: {self.endpoint}; API Secret ID: {self.secret_id}; API Secret Key: {self.secret_key}'
)
# inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
# content can be a string or a list of image & text
def prepare_itlist(self, inputs):
assert np.all([isinstance(x, dict) for x in inputs])
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text':
content_list.append(dict(Type='text', Text=msg['value']))
elif msg['type'] == 'image':
from PIL import Image
img = Image.open(msg['value'])
b64 = encode_image_to_base64(img)
img_struct = dict(Url=f'data:image/jpeg;base64,{b64}')
content_list.append(dict(Type='image_url', ImageUrl=img_struct))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
content_list = [dict(Type='text', Text=text)]
return content_list
def prepare_inputs(self, inputs):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(Role='system', Content=self.system_prompt))
assert isinstance(inputs, list) and isinstance(inputs[0], dict)
assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
if 'role' in inputs[0]:
assert inputs[-1]['role'] == 'user', inputs[-1]
for item in inputs:
input_msgs.append(dict(Role=item['role'], Contents=self.prepare_itlist(item['content'])))
else:
input_msgs.append(dict(Role='user', Contents=self.prepare_itlist(inputs)))
return input_msgs
def generate_inner(self, inputs, **kwargs) -> str:
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.hunyuan.v20230901 import models
input_msgs = self.prepare_inputs(inputs)
temperature = kwargs.pop('temperature', self.temperature)
payload = dict(
Model=self.model,
Messages=input_msgs,
Temperature=temperature,
**kwargs)
retry_counter = 0
while retry_counter < 3:
try:
req = models.ChatCompletionsRequest()
req.from_json_string(json.dumps(payload))
resp = self.client.ChatCompletions(req)
resp = json.loads(resp.to_json_string())
answer = resp['Choices'][0]['Message']['Content']
return 0, answer, resp
except TencentCloudSDKException as e:
self.logger.error(f'Got error code: {e.get_code()}')
if e.get_code() == 'ClientNetworkError':
return -1, self.fail_msg + e.get_code(), None
elif e.get_code() in ['InternalError', 'ServerNetworkError']:
if retry_counter == 3:
return -1, self.fail_msg + e.get_code(), None
retry_counter += 1
continue
elif e.get_code() in ['LimitExceeded']:
time.sleep(5)
if retry_counter == 3:
return -1, self.fail_msg + e.get_code(), None
retry_counter += 1
continue
else:
return -1, self.fail_msg + str(e), None
return -1, self.fail_msg, None
class HunyuanVision(HunyuanWrapper):
def generate(self, message, dataset=None):
return super(HunyuanVision, self).generate(message)
from http import HTTPStatus
import os
from vlmeval.api.base import BaseAPI
from vlmeval.smp import *
# Note: This is a pure language model API.
class QwenAPI(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'qwen-max-1201',
retry: int = 5,
wait: int = 5,
verbose: bool = True,
seed: int = 2680,
temperature: float = 0.0,
system_prompt: str = None,
key: str = None,
max_tokens: int = 1024,
proxy: str = None,
**kwargs):
assert model in ['qwen-turbo', 'qwen-plus', 'qwen-max', 'qwen-max-1201', 'qwen-max-longcontext']
self.model = model
import dashscope
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
self.seed = seed
if key is None:
key = os.environ.get('DASHSCOPE_API_KEY', None)
assert key is not None, (
'Please set the API Key (obtain it here: '
'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
)
dashscope.api_key = key
if proxy is not None:
proxy_set(proxy)
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
@staticmethod
def build_msgs(msgs_raw, system_prompt=None):
msgs = cp.deepcopy(msgs_raw)
ret = []
if system_prompt is not None:
ret.append(dict(role='system', content=system_prompt))
for i, msg in enumerate(msgs):
role = 'user' if i % 2 == 0 else 'assistant'
ret.append(dict(role=role, content=msg))
return ret
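# For reference, an assumed two-turn history ['Hi', 'Hello!', 'What is 2+2?'] becomes
#   [{'role': 'user', 'content': 'Hi'}, {'role': 'assistant', 'content': 'Hello!'},
#    {'role': 'user', 'content': 'What is 2+2?'}]
# with an optional leading system message when system_prompt is given.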
def generate_inner(self, inputs, **kwargs) -> str:
from dashscope import MultiModalConversation
assert isinstance(inputs, str) or isinstance(inputs, list)
inputs = [inputs] if isinstance(inputs, str) else inputs
messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt)
import dashscope
response = dashscope.Generation.call(
model=self.model,
messages=messages,
seed=self.seed,
temperature=self.temperature,
max_tokens=self.max_tokens,
result_format='message', # set the result to be "message" format.
)
if response.status_code != HTTPStatus.OK:
return -1, 'Error: Bad Response Status Code. ', f'The response status code is {response.status_code}. '
try:
return 0, response['output']['choices'][0]['message']['content'].strip(), 'Succeeded! '
except Exception as err:
return -1, f'Error: Failed to parse the response. {err}', response
from __future__ import annotations
import os
import warnings
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from vlmeval.vlm.qwen2_vl.prompt import Qwen2VLPromptMixin
def ensure_image_url(image: str) -> str:
prefixes = ['http://', 'https://', 'file://', 'data:image;']
if any(image.startswith(prefix) for prefix in prefixes):
return image
if os.path.exists(image):
return 'file://' + image
raise ValueError(f'Invalid image: {image}')
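# For reference: ensure_image_url('https://x.test/cat.jpg') returns the URL unchanged, while a local path
# such as '/tmp/cat.jpg' becomes 'file:///tmp/cat.jpg' (provided the file exists); anything else raises
# ValueError.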
class Qwen2VLAPI(Qwen2VLPromptMixin, BaseAPI):
is_api: bool = True
def __init__(
self,
model: str = 'qwen-vl-max-0809',
key: str | None = None,
min_pixels: int | None = None,
max_pixels: int | None = None,
max_length=2048,
top_p=0.001,
top_k=1,
temperature=0.01,
repetition_penalty=1.0,
presence_penalty=0.0,
seed=3407,
use_custom_prompt: bool = True,
**kwargs,
):
import dashscope
self.model = model
self.min_pixels = min_pixels
self.max_pixels = max_pixels
self.generate_kwargs = dict(
max_length=max_length,
top_p=top_p,
top_k=top_k,
temperature=temperature,
repetition_penalty=repetition_penalty,
presence_penalty=presence_penalty,
seed=seed,
)
key = os.environ.get('DASHSCOPE_API_KEY', None) if key is None else key
assert key is not None, (
'Please set the API Key (obtain it here: '
'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
)
dashscope.api_key = key
super().__init__(use_custom_prompt=use_custom_prompt, **kwargs)
def _prepare_content(self, inputs: list[dict[str, str]], dataset: str | None = None) -> list[dict[str, str]]:
"""
inputs list[dict[str, str]], each dict has keys: ['type', 'value']
"""
content = []
for s in inputs:
if s['type'] == 'image':
item = {'type': 'image', 'image': ensure_image_url(s['value'])}
if dataset == 'OCRBench':
item['min_pixels'] = 10 * 10 * 28 * 28
warnings.warn(f"OCRBench dataset uses custom min_pixels={item['min_pixels']}")
if self.max_pixels is not None:
item['max_pixels'] = self.max_pixels
else:
if self.min_pixels is not None:
item['min_pixels'] = self.min_pixels
if self.max_pixels is not None:
item['max_pixels'] = self.max_pixels
elif s['type'] == 'text':
item = {'type': 'text', 'text': s['value']}
else:
raise ValueError(f"Invalid message type: {s['type']}, {s}")
content.append(item)
return content
def generate_inner(self, inputs, **kwargs) -> str:
import dashscope
messages = []
if self.system_prompt is not None:
messages.append({'role': 'system', 'content': self.system_prompt})
messages.append(
{'role': 'user', 'content': self._prepare_content(inputs, dataset=kwargs.get('dataset', None))}
)
if self.verbose:
print(f'\033[31m{messages}\033[0m')
# generate
generation_kwargs = self.generate_kwargs.copy()
kwargs.pop('dataset', None)
generation_kwargs.update(kwargs)
try:
response = dashscope.MultiModalConversation.call(
model=self.model,
messages=messages,
**generation_kwargs,
)
if self.verbose:
print(response)
answer = response.output.choices[0]['message']['content'][0]['text']
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(err)
self.logger.error(f'The input messages are {inputs}.')
return -1, '', ''
class QwenVLWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'qwen-vl-plus',
retry: int = 5,
wait: int = 5,
key: str = None,
verbose: bool = True,
temperature: float = 0.0,
system_prompt: str = None,
max_tokens: int = 1024,
proxy: str = None,
**kwargs):
assert model in ['qwen-vl-plus', 'qwen-vl-max']
self.model = model
import dashscope
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
if key is None:
key = os.environ.get('DASHSCOPE_API_KEY', None)
assert key is not None, (
'Please set the API Key (obtain it here: '
'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
)
dashscope.api_key = key
if proxy is not None:
proxy_set(proxy)
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
# inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
# content can be a string or a list of image & text
def prepare_itlist(self, inputs):
assert np.all([isinstance(x, dict) for x in inputs])
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text':
content_list.append(dict(text=msg['value']))
elif msg['type'] == 'image':
content_list.append(dict(image='file://' + msg['value']))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
content_list = [dict(text=text)]
return content_list
def prepare_inputs(self, inputs):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(role='system', content=self.system_prompt))
assert isinstance(inputs, list) and isinstance(inputs[0], dict)
assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
if 'role' in inputs[0]:
assert inputs[-1]['role'] == 'user', inputs[-1]
for item in inputs:
input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
else:
input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
return input_msgs
def generate_inner(self, inputs, **kwargs) -> str:
from dashscope import MultiModalConversation
assert isinstance(inputs, str) or isinstance(inputs, list)
if 'type' in inputs[0]:
pure_text = np.all([x['type'] == 'text' for x in inputs])
else:
pure_text = True
for inp in inputs:
if not np.all([x['type'] == 'text' for x in inp['content']]):
pure_text = False
break
assert not pure_text
messages = self.prepare_inputs(inputs)
gen_config = dict(max_output_tokens=self.max_tokens, temperature=self.temperature)
gen_config.update(kwargs)
try:
response = MultiModalConversation.call(model=self.model, messages=messages)
if self.verbose:
print(response)
answer = response.output.choices[0]['message']['content'][0]['text']
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(err)
self.logger.error(f'The input messages are {inputs}.')
return -1, '', ''
class QwenVLAPI(QwenVLWrapper):
def generate(self, message, dataset=None):
return super(QwenVLAPI, self).generate(message)
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from time import sleep
import mimetypes
class Reka_Wrapper(BaseAPI):
is_api: bool = True
INTERLEAVE: bool = False
def __init__(self,
model: str = 'reka-flash-20240226',
key: str = None,
retry: int = 10,
wait: int = 3,
system_prompt: str = None,
verbose: bool = True,
temperature: float = 0,
max_tokens: int = 1024,
**kwargs):
try:
import reka
except ImportError:
raise ImportError('Please install reka by running "pip install reka-api"')
self.model = model
default_kwargs = dict(temperature=temperature, request_output_len=max_tokens)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
if key is not None:
self.key = key
else:
self.key = os.environ.get('REKA_API_KEY', '')
super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)
def generate_inner(self, inputs, **kwargs) -> str:
import reka
reka.API_KEY = self.key
dataset = kwargs.pop('dataset', None)
prompt, image_path = self.message_to_promptimg(inputs, dataset=dataset)
image_b64 = encode_image_file_to_base64(image_path)
response = reka.chat(
model_name=self.model,
human=prompt,
media_url=f'data:image/jpeg;base64,{image_b64}',
**self.kwargs)
try:
return 0, response['text'], response
except:
return -1, self.fail_msg, response
class Reka(Reka_Wrapper):
def generate(self, message, dataset=None):
return super(Reka_Wrapper, self).generate(message)
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from vlmeval.dataset import img_root_map
from vlmeval.dataset import DATASET_TYPE
class SenseChatVisionWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'SenseChat-5-Vision',
retry: int = 5,
wait: int = 5,
ak: str = None,
sk: str = None,
verbose: bool = True,
system_prompt: str = None,
max_tokens: int = 1024,
proxy: str = None,
**kwargs):
self.model = model
self.fail_msg = 'Failed to obtain answer via API. '
self.ak = os.environ.get('SENSECHAT_AK', None) if ak is None else ak
self.sk = os.environ.get('SENSECHAT_SK', None) if sk is None else sk
assert self.ak is not None and self.sk is not None
self.max_new_tokens = max_tokens
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
def dump_image(self, line, dataset):
"""Dump the image(s) of the input line to the corresponding dataset folder.
Args:
line (line of pd.DataFrame): The raw input line.
dataset (str): The name of the dataset.
Returns:
str | list[str]: The paths of the dumped images.
"""
ROOT = LMUDataRoot()
assert isinstance(dataset, str)
img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset)
os.makedirs(img_root, exist_ok=True)
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
def image_to_base64(self, image_path):
import base64
with open(image_path, 'rb') as image_file:
encoded_string = base64.b64encode(image_file.read())
return encoded_string.decode('utf-8')
def encode_jwt_token(self, ak, sk):
import jwt
headers = {'alg': 'HS256', 'typ': 'JWT'}
payload = {
'iss': ak,
'exp': int(time.time()) + 1800,  # desired expiry time; here, current time + 30 minutes
'nbf': int(time.time()) - 5,  # desired start of validity; here, current time - 5 seconds
}
token = jwt.encode(payload, sk, headers=headers)
return token
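# Illustrative use of the token above (mirrors generate_inner below; requires the PyJWT package):
#   token = self.encode_jwt_token(self.ak, self.sk)
#   headers = {'Content-type': 'application/json', 'Authorization': 'Bearer ' + token}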
def use_custom_prompt(self, dataset):
return True
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += '\n请直接回答选项字母。' if cn_string(
prompt) else "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
return prompt
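# Illustrative example (hypothetical line contents): for a row with question 'Which is larger?',
# hint 'Compare the animals.', A='Cat', B='Whale', the prompt becomes
#   'Compare the animals.\nWhich is larger?\nA. Cat\nB. Whale'
# followed by the English or Chinese "answer with the option's letter" instruction, depending on whether
# the prompt contains Chinese characters.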
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if dataset is not None and listinstr(['MME'], dataset):
question = line['question']
prompt = question + ' Answer the question using a single word or phrase.'
elif dataset is not None and listinstr(['HallusionBench'], dataset):
question = line['question']
prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ' and 'MMMU' not in dataset:
prompt = self.build_multi_choice_prompt(line, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
if 'MathVista' in dataset:
prompt = line['question']
elif listinstr(['LLaVABench'], dataset):
question = line['question']
prompt = question + '\nAnswer this question in detail.'
elif listinstr(['MMVet'], dataset):
prompt = line['question']
else:
question = line['question']
prompt = question + '\nAnswer the question using a single word or phrase.'
elif dataset is not None and 'MMMU' in dataset:
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = {
'multiple-choice': 'You are an expert in {}. Please solve the university-level {} examination question, which includes interleaved images and text. Your output should be divided into two parts: First, reason about the correct answer. Then write the answer in the following format where X is exactly one of the choices given by the problem: "ANSWER: X". If you are uncertain of the correct answer, guess the most likely one.', # noqa: E501
'open': 'You are an expert in {}. Please solve the university-level {} examination question, which includes interleaved images and text. Your output should be divided into two parts: First, reason about the correct answer. Then write the answer in the following format where X is only the answer and nothing else: "ANSWER: X"' # noqa: E501
}
subject = '_'.join(line['id'].split('_')[1:-1])
prompt = prompt[line['question_type']].format(subject, subject) + '\n' + question
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def message_to_promptimg(self, message, dataset=None):
if dataset is None or listinstr(['MMMU', 'BLINK'], dataset):
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = [[x['value'] for x in message if x['type'] == 'image'][0]]
else:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = [x['value'] for x in message if x['type'] == 'image']
return prompt, image
def generate_inner(self, inputs, **kwargs) -> str:
assert isinstance(inputs, str) or isinstance(inputs, list)
inputs = [inputs] if isinstance(inputs, str) else inputs
dataset = kwargs.get('dataset', None)
if dataset is not None and listinstr(['ChartQA_TEST'], dataset):
self.max_num = 12
elif dataset is not None and listinstr(['DocVQA_VAL', 'DocVQA_TEST'], dataset):
self.max_num = 18
elif dataset is not None and listinstr(['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench'], dataset):
self.max_num = 24
else:
self.max_num = 6
if dataset is None:
pass
elif listinstr(['AI2D_TEST'], dataset):
self.max_new_tokens = 10
elif 'MMMU' in dataset:
self.max_new_tokens = 1024
elif 'MMBench' in dataset:
self.max_new_tokens = 100
prompt, image = self.message_to_promptimg(message=inputs, dataset=dataset)
url = 'https://api.sensenova.cn/v1/llm/chat-completions'
api_secret_key = self.encode_jwt_token(self.ak, self.sk)
content = [{
'image_base64': self.image_to_base64(item),
'image_file_id': '',
'image_url': '',
'text': '',
'type': 'image_base64'
} for item in image]
content.append({
'image_base64': '',
'image_file_id': '',
'image_url': '',
'text': prompt,
'type': 'text'
})
message = [{'content': content, 'role': 'user'}]
data = {
'messages': message,
'max_new_tokens': self.max_new_tokens,
'model': self.model,
'stream': False,
}
headers = {
'Content-type': 'application/json',
'Authorization': 'Bearer ' + api_secret_key
}
response = requests.post(
url,
headers=headers,
json=data,
)
request_id = response.headers['x-request-id']
time.sleep(1)
try:
assert response.status_code == 200
response = response.json()['data']['choices'][0]['message'].strip()
if dataset is not None and 'MMMU' in dataset:
response = response.split('ANSWER: ')[-1].strip()
if self.verbose:
self.logger.info(f'inputs: {inputs}\nanswer: {response}')
return 0, response, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error('---------------------------ERROR---------------------------')
self.logger.error(response.json())
self.logger.error(err)
self.logger.error('---------------------------request_id---------------------------' + request_id)
self.logger.error(
'api error' + response.json()['error']['message']
+ str([input['value'] if input['type'] == 'image' else None for input in inputs])
)
self.logger.error(f'The input messages are {inputs}.')
return -1, response.json()['error']['message'], ''
class SenseChatVisionAPI(SenseChatVisionWrapper):
def generate(self, message, dataset=None):
return super(SenseChatVisionAPI, self).generate(message, dataset=dataset)
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
url = 'https://api.stepfun.com/v1/chat/completions'
headers = {
'Content-Type': 'application/json',
'Authorization': 'Bearer {}',
}
class StepAPI_INT(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'step-1v-8k',
retry: int = 10,
wait: int = 3,
key: str = None,
temperature: float = 0,
max_tokens: int = 300,
verbose: bool = True,
system_prompt: str = None,
**kwargs):
self.model = model
self.fail_msg = 'Failed to obtain answer via API. '
self.headers = headers
self.temperature = temperature
self.max_tokens = max_tokens
self.system_prompt = system_prompt
if key is not None:
self.key = key
else:
self.key = os.environ.get('STEPAI_API_KEY', '')
headers['Authorization'] = headers['Authorization'].format(self.key)
super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)
@staticmethod
def build_msgs(msgs_raw):
messages = []
message = {'role': 'user', 'content': []}
for msg in msgs_raw:
if msg['type'] == 'image':
image_b64 = encode_image_file_to_base64(msg['value'])
message['content'].append({
'image_url': {'url': 'data:image/webp;base64,%s' % (image_b64)},
'type': 'image_url'
})
elif msg['type'] == 'text':
message['content'].append({
'text': msg['value'],
'type': 'text'
})
messages.append(message)
return messages
def generate_inner(self, inputs, **kwargs) -> str:
print(inputs, '\n')
payload = dict(
model=self.model,
max_tokens=self.max_tokens,
temperature=self.temperature,
messages=self.build_msgs(msgs_raw=inputs),
**kwargs)
response = requests.post(url, headers=headers, data=json.dumps(payload))
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
try:
resp_struct = json.loads(response.text)
answer = resp_struct['choices'][0]['message']['content'].strip()
except:
pass
return ret_code, answer, response
class Step1V_INT(StepAPI_INT):
def generate(self, message, dataset=None):
return super(StepAPI_INT, self).generate(message)
from vlmeval.vlm import *
from vlmeval.api import *
from functools import partial
PandaGPT_ROOT = None
MiniGPT4_ROOT = None
TransCore_ROOT = None
Yi_ROOT = None
OmniLMM_ROOT = None
Mini_Gemini_ROOT = None
VXVERSE_ROOT = None
VideoChat2_ROOT = None
VideoChatGPT_ROOT = None
PLLaVA_ROOT = None
RBDash_ROOT = None
LLAVA_V1_7B_MODEL_PTH = 'Please set your local path to LLaVA-7B-v1.1 here, the model weight is obtained by merging LLaVA delta weight based on vicuna-7b-v1.1 in https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md with vicuna-7b-v1.1. '
video_models = {
'Video-LLaVA-7B':partial(VideoLLaVA, model_path='LanguageBind/Video-LLaVA-7B'),
'Video-LLaVA-7B-HF':partial(VideoLLaVA_HF, model_path='LanguageBind/Video-LLaVA-7B-hf'),
'VideoChat2-HD':partial(VideoChat2_HD, model_path='OpenGVLab/VideoChat2_HD_stage4_Mistral_7B', root=VideoChat2_ROOT, config_file='./vlmeval/vlm/video_llm/configs/videochat2_hd.json'),
'Chat-UniVi-7B': partial(Chatunivi, model_path="Chat-UniVi/Chat-UniVi"),
'Chat-UniVi-7B-v1.5': partial(Chatunivi, model_path="Chat-UniVi/Chat-UniVi-7B-v1.5"),
'LLaMA-VID-7B': partial(LLaMAVID, model_path='YanweiLi/llama-vid-7b-full-224-video-fps-1'),
'Video-ChatGPT': partial(VideoChatGPT, model_path='MBZUAI/Video-ChatGPT-7B', dir_root=VideoChatGPT_ROOT),
'PLLaVA-7B': partial(PLLaVA, model_path='ermu2001/pllava-7b', dir_root=PLLaVA_ROOT),
'PLLaVA-13B': partial(PLLaVA, model_path='ermu2001/pllava-13b', dir_root=PLLaVA_ROOT),
'PLLaVA-34B': partial(PLLaVA, model_path='ermu2001/pllava-34b', dir_root=PLLaVA_ROOT),
}
ungrouped = {
'TransCore_M': partial(TransCoreM, root=TransCore_ROOT),
'PandaGPT_13B': partial(PandaGPT, name='PandaGPT_13B', root=PandaGPT_ROOT),
'flamingov2': partial(OpenFlamingo, name='v2', mpt_pth='anas-awadalla/mpt-7b', ckpt_pth='openflamingo/OpenFlamingo-9B-vitl-mpt7b'),
'VisualGLM_6b': partial(VisualGLM, model_path='THUDM/visualglm-6b'),
'mPLUG-Owl2': partial(mPLUG_Owl2, model_path='MAGAer13/mplug-owl2-llama2-7b'),
'mPLUG-Owl3': partial(mPLUG_Owl3, model_path='mPLUG/mPLUG-Owl3-7B-240728'),
'emu2_chat': partial(Emu, model_path='BAAI/Emu2-Chat'),
'OmniLMM_12B': partial(OmniLMM12B, model_path='openbmb/OmniLMM-12B', root=OmniLMM_ROOT),
'MGM_7B': partial(Mini_Gemini, model_path='YanweiLi/MGM-7B-HD', root=Mini_Gemini_ROOT),
'Bunny-llama3-8B': partial(BunnyLLama3, model_path='BAAI/Bunny-v1_1-Llama-3-8B-V'),
'VXVERSE': partial(VXVERSE, model_name='XVERSE-V-13B', root=VXVERSE_ROOT),
'paligemma-3b-mix-448': partial(PaliGemma, model_path='google/paligemma-3b-mix-448'),
'360VL-70B': partial(QH_360VL, model_path='qihoo360/360VL-70B'),
'Llama-3-MixSenseV1_1': partial(LLama3Mixsense, model_path='Zero-Vision/Llama-3-MixSenseV1_1'),
'Parrot': partial(Parrot, model_path='AIDC-AI/Parrot-7B'),
'OmChat': partial(OmChat, model_path='omlab/omchat-v2.0-13B-single-beta_hf'),
'RBDash_72b': partial(RBDash, model_path='RBDash-Team/RBDash-v1.2-72b', root=RBDash_ROOT),
'Pixtral-12B': partial(Pixtral, model_path="mistralai/Pixtral-12B-2409")
}
api_models = {
# GPT
'GPT4V': partial(GPT4V, model='gpt-4-1106-vision-preview', temperature=0, img_size=512, img_detail='low', retry=10),
'GPT4V_HIGH': partial(GPT4V, model='gpt-4-1106-vision-preview', temperature=0, img_size=-1, img_detail='high', retry=10),
'GPT4V_20240409': partial(GPT4V, model='gpt-4-turbo-2024-04-09', temperature=0, img_size=512, img_detail='low', retry=10),
'GPT4V_20240409_HIGH': partial(GPT4V, model='gpt-4-turbo-2024-04-09', temperature=0, img_size=-1, img_detail='high', retry=10),
'GPT4o': partial(GPT4V, model='gpt-4o-2024-05-13', temperature=0, img_size=512, img_detail='low', retry=10),
'GPT4o_HIGH': partial(GPT4V, model='gpt-4o-2024-05-13', temperature=0, img_size=-1, img_detail='high', retry=10),
'GPT4o_20240806': partial(GPT4V, model='gpt-4o-2024-08-06', temperature=0, img_size=-1, img_detail='high', retry=10),
'GPT4o_MINI': partial(GPT4V, model='gpt-4o-mini-2024-07-18', temperature=0, img_size=-1, img_detail='high', retry=10),
# Gemini
'GeminiPro1-0': partial(GeminiProVision, model='gemini-1.0-pro', temperature=0, retry=10), # now GeminiPro1-0 is only supported by vertex backend
'GeminiPro1-5': partial(GeminiProVision, model='gemini-1.5-pro', temperature=0, retry=10),
'GeminiFlash1-5': partial(GeminiProVision, model='gemini-1.5-flash', temperature=0, retry=10),
# Qwen-VL
'QwenVLPlus': partial(QwenVLAPI, model='qwen-vl-plus', temperature=0, retry=10),
'QwenVLMax': partial(QwenVLAPI, model='qwen-vl-max', temperature=0, retry=10),
# Reka
'RekaEdge': partial(Reka, model='reka-edge-20240208'),
'RekaFlash': partial(Reka, model='reka-flash-20240226'),
'RekaCore': partial(Reka, model='reka-core-20240415'),
# Step1V
'Step1V': partial(GPT4V, model='step-1v-8k', api_base="https://api.stepfun.com/v1/chat/completions", temperature=0, retry=10, img_detail='high'),
# Yi-Vision
'Yi-Vision': partial(GPT4V, model='yi-vision', api_base="https://api.lingyiwanwu.com/v1/chat/completions", temperature=0, retry=10),
# Claude
'Claude3V_Opus': partial(Claude3V, model='claude-3-opus-20240229', temperature=0, retry=10),
'Claude3V_Sonnet': partial(Claude3V, model='claude-3-sonnet-20240229', temperature=0, retry=10),
'Claude3V_Haiku': partial(Claude3V, model='claude-3-haiku-20240307', temperature=0, retry=10),
'Claude3-5V_Sonnet': partial(Claude3V, model='claude-3-5-sonnet-20240620', temperature=0, retry=10),
# GLM4V
'GLM4V': partial(GLMVisionAPI, model='glm4v-biz-eval', temperature=0, retry=10),
# CongRong
'CloudWalk': partial(CWWrapper, model='cw-congrong-v1.5', temperature=0, retry=10),
# SenseChat-V
'SenseChat-5-Vision': partial(SenseChatVisionAPI, model='SenseChat-5-Vision', temperature=0, retry=10),
'HunYuan-Vision': partial(HunyuanVision, model='hunyuan-vision', temperature=0, retry=10),
# BlueLM-V
"BlueLM_V": partial(BlueLM_V_API, model='BlueLM-VL-v3.0', temperature=0, retry=10)
}
mmalaya_series = {
'MMAlaya': partial(MMAlaya, model_path='DataCanvas/MMAlaya'),
'MMAlaya2': partial(MMAlaya2, model_path='DataCanvas/MMAlaya2'),
}
minicpm_series = {
'MiniCPM-V': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V'),
'MiniCPM-V-2': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V-2'),
'MiniCPM-Llama3-V-2_5': partial(MiniCPM_Llama3_V, model_path='openbmb/MiniCPM-Llama3-V-2_5'),
'MiniCPM-V-2_6': partial(MiniCPM_V_2_6, model_path='openbmb/MiniCPM-V-2_6'),
}
xtuner_series = {
'llava-internlm2-7b': partial(LLaVA_XTuner, llm_path='internlm/internlm2-chat-7b', llava_path='xtuner/llava-internlm2-7b', visual_select_layer=-2, prompt_template='internlm2_chat'),
'llava-internlm2-20b': partial(LLaVA_XTuner, llm_path='internlm/internlm2-chat-20b', llava_path='xtuner/llava-internlm2-20b', visual_select_layer=-2, prompt_template='internlm2_chat'),
'llava-internlm-7b': partial(LLaVA_XTuner, llm_path='internlm/internlm-chat-7b', llava_path='xtuner/llava-internlm-7b', visual_select_layer=-2, prompt_template='internlm_chat'),
'llava-v1.5-7b-xtuner': partial(LLaVA_XTuner, llm_path='lmsys/vicuna-7b-v1.5', llava_path='xtuner/llava-v1.5-7b-xtuner', visual_select_layer=-2, prompt_template='vicuna'),
'llava-v1.5-13b-xtuner': partial(LLaVA_XTuner, llm_path='lmsys/vicuna-13b-v1.5', llava_path='xtuner/llava-v1.5-13b-xtuner', visual_select_layer=-2, prompt_template='vicuna'),
'llava-llama-3-8b': partial(LLaVA_XTuner, llm_path='xtuner/llava-llama-3-8b-v1_1', llava_path='xtuner/llava-llama-3-8b-v1_1', visual_select_layer=-2, prompt_template='llama3_chat'),
}
qwen_series = {
'qwen_base': partial(QwenVL, model_path='Qwen/Qwen-VL'),
'qwen_chat': partial(QwenVLChat, model_path='Qwen/Qwen-VL-Chat'),
'monkey': partial(Monkey, model_path='echo840/Monkey'),
'monkey-chat': partial(MonkeyChat, model_path='echo840/Monkey-Chat'),
'minimonkey': partial(MiniMonkey, model_path='mx262/MiniMokney')
}
llava_series = {
'llava_v1.5_7b': partial(LLaVA, model_path='liuhaotian/llava-v1.5-7b'),
'llava_v1.5_13b': partial(LLaVA, model_path='liuhaotian/llava-v1.5-13b'),
'llava_v1_7b': partial(LLaVA, model_path=LLAVA_V1_7B_MODEL_PTH),
'sharegpt4v_7b': partial(LLaVA, model_path='Lin-Chen/ShareGPT4V-7B'),
'sharegpt4v_13b': partial(LLaVA, model_path='Lin-Chen/ShareGPT4V-13B'),
'llava_next_vicuna_7b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-vicuna-7b-hf'),
'llava_next_vicuna_13b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-vicuna-13b-hf'),
'llava_next_mistral_7b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-mistral-7b-hf'),
'llava_next_yi_34b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-34b-hf'),
'llava_next_llama3': partial(LLaVA_Next, model_path='llava-hf/llama3-llava-next-8b-hf'),
'llava_next_72b': partial(LLaVA_Next, model_path='llava-hf/llava-next-72b-hf'),
'llava_next_110b': partial(LLaVA_Next, model_path='llava-hf/llava-next-110b-hf'),
'llava_next_qwen_32b': partial(LLaVA_Next2, model_path='lmms-lab/llava-next-qwen-32b'),
'llava_next_interleave_7b': partial(LLaVA_Next, model_path='llava-hf/llava-interleave-qwen-7b-hf'),
'llava_next_interleave_7b_dpo': partial(LLaVA_Next, model_path='llava-hf/llava-interleave-qwen-7b-dpo-hf'),
'llava_onevision_qwen2_0.5b_si': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-0.5b-si'),
'llava_onevision_qwen2_7b_si': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-7b-si'),
'llava_onevision_qwen2_72b_si': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-72b-si'),
'llava_onevision_qwen2_0.5b_ov': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-0.5b-ov'),
'llava_onevision_qwen2_7b_ov': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-7b-ov'),
'llava_onevision_qwen2_72b_ov': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-72b-ov'),
}
internvl_series = {
'InternVL-Chat-V1-1': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-1', version='V1.1'),
'InternVL-Chat-V1-2': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-2', version='V1.2'),
'InternVL-Chat-V1-2-Plus': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-2-Plus', version='V1.2'),
'InternVL-Chat-V1-5': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-5', version='V1.5'),
'Mini-InternVL-Chat-2B-V1-5': partial(InternVLChat, model_path='OpenGVLab/Mini-InternVL-Chat-2B-V1-5', version='V1.5'),
'Mini-InternVL-Chat-4B-V1-5': partial(InternVLChat, model_path='OpenGVLab/Mini-InternVL-Chat-4B-V1-5', version='V1.5'),
# InternVL2 series
'InternVL2-1B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-1B', version='V2.0'),
'InternVL2-2B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-2B', version='V2.0'),
'InternVL2-4B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-4B', version='V2.0'),
'InternVL2-8B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-8B', version='V2.0'),
'InternVL2-26B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-26B', version='V2.0'),
'InternVL2-40B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-40B', version='V2.0', load_in_8bit=True),
'InternVL2-76B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-Llama3-76B', version='V2.0'),
}
yivl_series = {
'Yi_VL_6B': partial(Yi_VL, model_path='01-ai/Yi-VL-6B', root=Yi_ROOT),
'Yi_VL_34B': partial(Yi_VL, model_path='01-ai/Yi-VL-34B', root=Yi_ROOT),
}
xcomposer_series = {
'XComposer': partial(XComposer, model_path='internlm/internlm-xcomposer-vl-7b'),
'sharecaptioner': partial(ShareCaptioner, model_path='Lin-Chen/ShareCaptioner'),
'XComposer2': partial(XComposer2, model_path='internlm/internlm-xcomposer2-vl-7b'),
'XComposer2_1.8b': partial(XComposer2, model_path='internlm/internlm-xcomposer2-vl-1_8b'),
'XComposer2_4KHD': partial(XComposer2_4KHD, model_path='internlm/internlm-xcomposer2-4khd-7b'),
'XComposer2d5': partial(XComposer2d5, model_path='internlm/internlm-xcomposer2d5-7b'),
}
minigpt4_series = {
'MiniGPT-4-v2': partial(MiniGPT4, mode='v2', root=MiniGPT4_ROOT),
'MiniGPT-4-v1-7B': partial(MiniGPT4, mode='v1_7b', root=MiniGPT4_ROOT),
'MiniGPT-4-v1-13B': partial(MiniGPT4, mode='v1_13b', root=MiniGPT4_ROOT),
}
idefics_series = {
'idefics_9b_instruct': partial(IDEFICS, model_path='HuggingFaceM4/idefics-9b-instruct'),
'idefics_80b_instruct': partial(IDEFICS, model_path='HuggingFaceM4/idefics-80b-instruct'),
'idefics2_8b': partial(IDEFICS2, model_path='HuggingFaceM4/idefics2-8b'),
# Idefics3 follows Idefics2 Pattern
'Idefics3-8B-Llama3': partial(IDEFICS2, model_path='HuggingFaceM4/Idefics3-8B-Llama3'),
}
instructblip_series = {
'instructblip_7b': partial(InstructBLIP, name='instructblip_7b'),
'instructblip_13b': partial(InstructBLIP, name='instructblip_13b'),
}
deepseekvl_series = {
'deepseek_vl_7b': partial(DeepSeekVL, model_path='deepseek-ai/deepseek-vl-7b-chat'),
'deepseek_vl_1.3b': partial(DeepSeekVL, model_path='deepseek-ai/deepseek-vl-1.3b-chat'),
}
cogvlm_series = {
'cogvlm-grounding-generalist': partial(CogVlm, model_path='THUDM/cogvlm-grounding-generalist-hf', tokenizer_name='lmsys/vicuna-7b-v1.5'),
'cogvlm-chat': partial(CogVlm, model_path='THUDM/cogvlm-chat-hf', tokenizer_name='lmsys/vicuna-7b-v1.5'),
'cogvlm2-llama3-chat-19B': partial(CogVlm, model_path='THUDM/cogvlm2-llama3-chat-19B'),
'glm-4v-9b': partial(GLM4v, model_path='THUDM/glm-4v-9b')
}
wemm_series = {
'WeMM': partial(WeMM, model_path='feipengma/WeMM'),
}
cambrian_series = {
'cambrian_8b': partial(Cambrian, model_path='nyu-visionx/cambrian-8b'),
'cambrian_13b': partial(Cambrian, model_path='nyu-visionx/cambrian-13b'),
'cambrian_34b': partial(Cambrian, model_path='nyu-visionx/cambrian-34b'),
}
chameleon_series = {
'chameleon_7b': partial(Chameleon, model_path='facebook/chameleon-7b'),
'chameleon_30b': partial(Chameleon, model_path='facebook/chameleon-30b'),
}
vila_series = {
'VILA1.5-3b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-3b'),
'Llama-3-VILA1.5-8b': partial(VILA, model_path='Efficient-Large-Model/Llama-3-VILA1.5-8b'),
'VILA1.5-13b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-13b'),
'VILA1.5-40b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-40b'),
}
ovis_series = {
'Ovis1.5-Llama3-8B': partial(Ovis, model_path='AIDC-AI/Ovis1.5-Llama3-8B'),
'Ovis1.5-Gemma2-9B': partial(Ovis, model_path='AIDC-AI/Ovis1.5-Gemma2-9B'),
'Ovis1.6-Gemma2-9B': partial(Ovis1_6, model_path='AIDC-AI/Ovis1.6-Gemma2-9B')
}
mantis_series = {
'Mantis-8B-siglip-llama3': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-siglip-llama3'),
'Mantis-8B-clip-llama3': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-clip-llama3'),
'Mantis-8B-Idefics2': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-Idefics2'),
'Mantis-8B-Fuyu': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-Fuyu')
}
phi3_series = {
'Phi-3-Vision': partial(Phi3Vision, model_path='microsoft/Phi-3-vision-128k-instruct'),
'Phi-3.5-Vision': partial(Phi3_5Vision, model_path='microsoft/Phi-3.5-vision-instruct')
}
xgen_mm_series = {
'xgen-mm-phi3-interleave-r-v1.5': partial(XGenMM, model_path='Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5'),
'xgen-mm-phi3-dpo-r-v1.5': partial(XGenMM, model_path='Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5'),
}
qwen2vl_series = {
'Qwen-VL-Max-0809': partial(Qwen2VLAPI, model='qwen-vl-max-0809', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen-VL-Plus-0809': partial(Qwen2VLAPI, model='qwen-vl-plus-0809', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct-AWQ': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-AWQ', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct-GPTQ-Int4': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct-GPTQ-Int8': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-2B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-2B-Instruct-AWQ': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct-AWQ', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-2B-Instruct-GPTQ-Int4': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-2B-Instruct-GPTQ-Int8': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8', min_pixels=1280*28*28, max_pixels=16384*28*28),
}
slime_series = {
'Slime-7B': partial(SliME, model_path='yifanzhang114/SliME-vicuna-7B'),
'Slime-8B': partial(SliME, model_path='yifanzhang114/SliME-Llama3-8B'),
'Slime-13B': partial(SliME, model_path='yifanzhang114/SliME-vicuna-13B'),
}
eagle_series={
'Eagle-X4-8B-Plus': partial(Eagle, model_path='NVEagle/Eagle-X4-8B-Plus'),
'Eagle-X4-13B-Plus': partial(Eagle, model_path='NVEagle/Eagle-X4-13B-Plus'),
'Eagle-X5-7B': partial(Eagle, model_path='NVEagle/Eagle-X5-7B'),
'Eagle-X5-13B': partial(Eagle, model_path='NVEagle/Eagle-X5-13B'),
'Eagle-X5-13B-Chat': partial(Eagle, model_path='NVEagle/Eagle-X5-13B-Chat'),
'Eagle-X5-34B-Chat': partial(Eagle, model_path='NVEagle/Eagle-X5-34B-Chat'),
'Eagle-X5-34B-Plus': partial(Eagle, model_path='NVEagle/Eagle-X5-34B-Plus'),
}
moondream_series={
'Moondream1': partial(Moondream1, model_path='vikhyatk/moondream1'),
'Moondream2': partial(Moondream2, model_path='vikhyatk/moondream2'),
}
supported_VLM = {}
model_groups = [
ungrouped, api_models,
xtuner_series, qwen_series, llava_series, internvl_series, yivl_series,
xcomposer_series, minigpt4_series, idefics_series, instructblip_series,
deepseekvl_series, minicpm_series, cogvlm_series, wemm_series,
cambrian_series, chameleon_series, video_models, ovis_series, vila_series,
mantis_series, mmalaya_series, phi3_series, xgen_mm_series, qwen2vl_series,
slime_series, eagle_series, moondream_series
]
for grp in model_groups:
supported_VLM.update(grp)
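# Usage sketch (names are illustrative): every entry in supported_VLM is a partial constructor, so a
# model can be instantiated and queried as, e.g.
#   model = supported_VLM['llava_v1.5_7b']()
#   answer = model.generate([dict(type='image', value='demo.jpg'), dict(type='text', value='Describe the image.')])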
import warnings
from .image_base import img_root_map, ImageBaseDataset
from .image_caption import ImageCaptionDataset
from .image_yorn import ImageYORNDataset
from .image_mcq import (
ImageMCQDataset, MMMUDataset, CustomMCQDataset, MUIRDataset, GMAIMMBenchDataset, MMERealWorld, HRBenchDataset
)
from .image_mt import MMDUDataset
from .image_vqa import (
ImageVQADataset, MathVision, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
CustomVQADataset, CRPE, MathVerse
)
from .vcr import VCRDataset
from .mmlongbench import MMLongBench
from .dude import DUDE
from .slidevqa import SlideVQA
from .mmbench_video import MMBenchVideo
from .text_mcq import CustomTextMCQDataset, TextMCQDataset
from .videomme import VideoMME
from .mvbench import MVBench, MVBench_MP4
from .utils import *
from ..smp import *
class ConcatDataset(ImageBaseDataset):
    # This dataset takes multiple dataset names as input and aggregates them into a single dataset.
    # None of the sub-datasets should have a field named `SUB_DATASET`.
DATASET_SETS = {
'MMMB': ['MMMB_ar', 'MMMB_cn', 'MMMB_en', 'MMMB_pt', 'MMMB_ru', 'MMMB_tr'],
'MTL_MMBench_DEV': [
'MMBench_dev_ar', 'MMBench_dev_cn', 'MMBench_dev_en',
'MMBench_dev_pt', 'MMBench_dev_ru', 'MMBench_dev_tr'
]
}
def __init__(self, dataset):
datasets = self.DATASET_SETS[dataset]
self.dataset_map = {}
        # The name of the compilation
self.dataset_name = dataset
self.datasets = datasets
for dname in datasets:
dataset = build_dataset(dname)
assert dataset is not None, dataset
self.dataset_map[dname] = dataset
TYPES = [x.TYPE for x in self.dataset_map.values()]
MODALITIES = [x.MODALITY for x in self.dataset_map.values()]
assert np.all([x == TYPES[0] for x in TYPES]), (datasets, TYPES)
assert np.all([x == MODALITIES[0] for x in MODALITIES]), (datasets, MODALITIES)
self.TYPE = TYPES[0]
self.MODALITY = MODALITIES[0]
data_all = []
for dname in datasets:
data = self.dataset_map[dname].data
data['SUB_DATASET'] = [dname] * len(data)
data_new = localize_df(data, dname, nproc=16)
data_all.append(data_new)
data = pd.concat(data_all)
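        # Keep each record's per-dataset index as `original_index` and assign a fresh global `index`,
        # so build_prompt / evaluate can route a row back to the sub-dataset it came from.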
data['original_index'] = data.pop('index')
data['index'] = np.arange(len(data))
self.data = data
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
idx = line['original_index']
dname = line['SUB_DATASET']
org_data = self.dataset_map[dname].data
org_line = cp.deepcopy(org_data[org_data['index'] == idx]).iloc[0]
return self.dataset_map[dname].build_prompt(org_line)
def dump_image(self, line):
# Assert all images are pre-dumped
assert 'image' not in line
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
@classmethod
def supported_datasets(cls):
return list(cls.DATASET_SETS)
def evaluate(self, eval_file, **judge_kwargs):
suffix = eval_file.split('.')[-1]
# First, split the eval_file by dataset
data_all = load(eval_file)
for dname in self.datasets:
tgt = eval_file.replace(self.dataset_name, dname)
data_sub = data_all[data_all['SUB_DATASET'] == dname]
data_sub.pop('index')
data_sub['index'] = data_sub.pop('original_index')
data_sub.pop('SUB_DATASET')
dump(data_sub, tgt)
# Then, evaluate each dataset separately
results_all = []
for dname in self.datasets:
tgt = eval_file.replace(self.dataset_name, dname)
res = self.dataset_map[dname].evaluate(tgt, **judge_kwargs)
assert isinstance(res, pd.DataFrame)
res['DATASET'] = [dname] * len(res)
results_all.append(res)
result = pd.concat(results_all)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(result, score_file)
return result
# Add new supported dataset class here
IMAGE_DATASET = [
ImageCaptionDataset, ImageYORNDataset, ImageMCQDataset, ImageVQADataset, MathVision,
MMMUDataset, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset,
GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, CRPE, MathVerse
]
VIDEO_DATASET = [
MMBenchVideo, VideoMME, MVBench, MVBench_MP4
]
TEXT_DATASET = [
TextMCQDataset
]
CUSTOM_DATASET = [
CustomMCQDataset, CustomVQADataset, CustomTextMCQDataset
]
DATASET_COLLECTION = [ConcatDataset]
DATASET_CLASSES = IMAGE_DATASET + VIDEO_DATASET + TEXT_DATASET + CUSTOM_DATASET + DATASET_COLLECTION
SUPPORTED_DATASETS = []
for DATASET_CLS in DATASET_CLASSES:
SUPPORTED_DATASETS.extend(DATASET_CLS.supported_datasets())
def DATASET_TYPE(dataset, *, default: str = 'MCQ') -> str:
for cls in DATASET_CLASSES:
if dataset in cls.supported_datasets():
if hasattr(cls, 'TYPE'):
return cls.TYPE
# Have to add specific routine to handle ConcatDataset
if dataset in ConcatDataset.DATASET_SETS:
dataset_list = ConcatDataset.DATASET_SETS[dataset]
TYPES = [DATASET_TYPE(dname) for dname in dataset_list]
assert np.all([x == TYPES[0] for x in TYPES]), (dataset_list, TYPES)
return TYPES[0]
if 'openended' in dataset.lower():
return 'VQA'
warnings.warn(f'Dataset {dataset} is a custom one and not annotated as `openended`, will treat as {default}. ')
return default
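# Example (sketch): DATASET_TYPE('MMBench_DEV_EN') resolves to 'MCQ' via ImageMCQDataset, a custom name
# containing 'openended' falls back to 'VQA', and any other unknown name returns `default`.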
def build_dataset(dataset_name, **kwargs):
for cls in DATASET_CLASSES:
if dataset_name in cls.supported_datasets():
return cls(dataset=dataset_name, **kwargs)
warnings.warn(f'Dataset {dataset_name} is not officially supported. ')
data_file = osp.join(LMUDataRoot(), f'{dataset_name}.tsv')
if not osp.exists(data_file):
warnings.warn(f'Data file {data_file} does not exist. Dataset building failed. ')
return None
data = load(data_file)
if 'question' not in [x.lower() for x in data.columns]:
warnings.warn(f'Data file {data_file} does not have a `question` column. Dataset building failed. ')
return None
if 'A' in data and 'B' in data:
if 'image' in data or 'image_path' in data:
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom MCQ dataset. ')
return CustomMCQDataset(dataset=dataset_name, **kwargs)
else:
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom Text MCQ dataset. ')
return CustomTextMCQDataset(dataset=dataset_name, **kwargs)
else:
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom VQA dataset. ')
return CustomVQADataset(dataset=dataset_name, **kwargs)
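# Usage sketch: build_dataset('MMBench_DEV_EN') returns an ImageMCQDataset instance; an unrecognized name
# falls back to a local TSV under LMUDataRoot() and is wrapped as a custom MCQ / text-MCQ / VQA dataset.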
__all__ = [
'build_dataset', 'img_root_map', 'build_judge', 'extract_answer_from_item', 'prefetch_answer', 'DEBUG_MESSAGE'
] + [cls.__name__ for cls in DATASET_CLASSES]
import math
from typing import List
from .utils.judge_util import build_judge
from .image_base import ImageBaseDataset
from .mmlongbench import concat_images, MMLongBench_auxeval, anls_compute
from ..smp import *
FAIL_MSG = 'Failed to obtain answer via API.'
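# DUDE is scored with ANLS (Average Normalized Levenshtein Similarity): NaN answers are mapped to
# 'Not answerable', both answer and prediction are lower-cased, and the per-sample scores are averaged.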
def DUDE_acc(result_file):
data = load(result_file)
overall_score = 0.0
score_list = list()
for i in range(len(data)):
item = data.iloc[i]
if isinstance(item['answer'], float) and math.isnan(item['answer']):
item['answer'] = 'Not answerable'
item['answer'] = item['answer'].lower()
item['pred'] = item['pred'].lower()
score = anls_compute(item['answer'], item['pred'])
score_list.append(score)
overall_score += score
data['score'] = score_list
dump(data, result_file)
res = dict()
res['category'], res['num'], res['avg_score'] = ['anls'], [len(data)], [overall_score / len(data)]
res = pd.DataFrame(res)
return res
class DUDE(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'DUDE': 'https://opencompass.openxlab.space/utils/VLMEval/DUDE.tsv',
'DUDE_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/DUDE_MINI.tsv',
}
DATASET_MD5 = {
'DUDE': '130d860d08206e1e407cd77150c10d88',
'DUDE_MINI': 'e0c0d998114f0cca7516d12039d2b538',
}
SUPPORTED_MODELS = {
'GPT4': (1, 1),
'GPT4V': (1, 1),
'GPT4V_HIGH': (1, 1),
'GPT4o': (1, 1),
'GPT4o_HIGH': (1, 1),
'GPT4o_MINI': (1, 1),
'XComposer2d5': (1, -1),
'XComposer2_4KHD': (1, -1),
'MiniCPM-Llama3-V-2_5': (1, 5),
'InternVL-Chat-V1-5': (5, 2),
}
def __init__(self, dataset, **kwargs):
self.model_list = list(self.SUPPORTED_MODELS.keys())
model_name = kwargs['model']
if not listinstr(self.model_list, model_name):
raise AssertionError("{} doesn't support the evaluation on DUDE.".format(model_name))
super(DUDE, self).__init__(dataset)
self.is_api = True if listinstr(['GPT4'], model_name) else False
self.max_pages = 120
concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
self.concat_num = concat_num
self.column_num = column_num
def prepare_tsv(self, url, file_md5=None):
data_root = LMUDataRoot()
os.makedirs(data_root, exist_ok=True)
file_name = url.split('/')[-1]
data_path = osp.join(data_root, file_name)
if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
pass
else:
warnings.warn('The dataset tsv is not downloaded')
download_file(url, data_path)
return load(data_path)
def dump_image(self, origin_line):
os.makedirs(self.img_root, exist_ok=True)
try:
import fitz
        except ImportError:
warnings.warn('Please use `pip install pymupdf` to parse PDF files.')
line = origin_line.copy()
if not isinstance(line['image_path'], List):
line['image_path'] = [line['image_path']]
line['image_path'] = line['image_path'][:self.max_pages]
skip_pdf_parse = True
for im_name in line['image_path']:
path = osp.join(self.img_root, im_name)
if not read_ok(path):
skip_pdf_parse = False
break
        # Just for being compatible with the zipped loop: zip(line['image'], line['image_path'])
if skip_pdf_parse:
line['image'] = line['image_path']
else:
pdf_data = base64.b64decode(line['image'])
pdf_file = io.BytesIO(pdf_data)
encoded_images = []
with fitz.open(stream=pdf_file, filetype='pdf') as doc:
doc = doc[:self.max_pages]
for page in doc:
image = page.get_pixmap(dpi=144)
image_file = io.BytesIO(image.tobytes(output='png'))
image = Image.open(image_file)
encoded_image = encode_image_to_base64(image)
encoded_images.append(encoded_image)
line['image'] = encoded_images
print('process {}'.format(line['doc_id']))
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(self.img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
if self.concat_num > 0 and not self.is_api:
concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)
old_tgt_path = tgt_path
assert isinstance(old_tgt_path, list)
if self.column_num != -1:
tgt_path = [
'_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
for i in range(len(concatenated_images))
]
else:
tgt_path = ['_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all.jpg']
for path, concatenated_image in zip(tgt_path, concatenated_images):
if not read_ok(path):
decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
num_images, image_size = len(old_tgt_path), concatenated_image.size
print('concat {} images to a new one with size {}. save at {}'.format(num_images, image_size, path))
return tgt_path
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
logger = get_logger('Evaluation')
model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
if osp.exists(storage):
logger.warning(f'GPT scoring file {storage} already exists, will reuse it in DUDE_eval. ')
else:
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = list()
for model, line in tqdm(tups):
res = MMLongBench_auxeval(model, line)
new_results.append(res)
log_map, res_map, pred_map = {}, {}, {}
all_inds = [line['index'] for line in lines]
for k, v in zip(all_inds, new_results):
log_map[k] = v['log']
res_map[k] = v['res']
pred_map[k] = v['pred']
data['res'] = [res_map[idx] for idx in data['index']]
data['log'] = [log_map[idx] for idx in data['index']]
data['pred'] = [pred_map[idx] for idx in data['index']]
dump(data, storage)
score = DUDE_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
dump(score, score_pth)
logger.info(f'DUDE successfully finished evaluating {eval_file}, results saved in {score_pth}')
logger.info('Score: ')
logger.info(score)
import pandas as pd
from abc import abstractmethod
from ..smp import *
def img_root_map(dataset):
if 'CRPE' in dataset:
return 'CRPE'
if 'OCRVQA' in dataset:
return 'OCRVQA'
if 'COCO_VAL' == dataset:
return 'COCO'
if 'MMMU' in dataset:
return 'MMMU'
mmbench_root_map = {
'MMBench_DEV_EN': 'MMBench', 'MMBench_TEST_EN': 'MMBench',
'MMBench_DEV_CN': 'MMBench', 'MMBench_TEST_CN': 'MMBench',
'MMBench': 'MMBench', 'MMBench_CN': 'MMBench',
'MMBench_DEV_EN_V11': 'MMBench_V11', 'MMBench_TEST_EN_V11': 'MMBench_V11',
'MMBench_DEV_CN_V11': 'MMBench_V11', 'MMBench_TEST_CN_V11': 'MMBench_V11',
'MMBench_V11': 'MMBench', 'MMBench_CN_V11': 'MMBench',
}
if dataset in mmbench_root_map:
return mmbench_root_map[dataset]
return dataset
class ImageBaseDataset:
MODALITY = 'IMAGE'
DATASET_URL = {}
DATASET_MD5 = {}
def __init__(self, dataset='MMBench', skip_noimg=True):
ROOT = LMUDataRoot()
# You can override this variable to save image files to a different directory
self.dataset_name = dataset
self.img_root = osp.join(ROOT, 'images', img_root_map(dataset))
data = self.load_data(dataset)
self.skip_noimg = skip_noimg
if skip_noimg and 'image' in data:
data = data[~pd.isna(data['image'])]
data['index'] = [str(x) for x in data['index']]
self.meta_only = True
# The image field can store the base64 encoded image or another question index (for saving space)
if 'image' in data:
data['image'] = [str(x) for x in data['image']]
image_map = {x: y for x, y in zip(data['index'], data['image'])}
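            # Values shorter than 64 characters are treated as references to another index rather than
            # base64 data (space-saving deduplication); resolve them to the actual encoded image.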
for k in image_map:
if len(image_map[k]) <= 64:
idx = image_map[k]
assert idx in image_map and len(image_map[idx]) > 64
image_map[k] = image_map[idx]
images = [toliststr(image_map[k]) for k in data['index']]
data['image'] = [x[0] if len(x) == 1 else x for x in images]
self.meta_only = False
if 'image_path' in data:
paths = [toliststr(x) for x in data['image_path']]
data['image_path'] = [x[0] if len(x) == 1 else x for x in paths]
if np.all([istype(x, int) for x in data['index']]):
data['index'] = [int(x) for x in data['index']]
self.data = data
self.post_build(dataset)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
return dict(self.data.iloc[idx])
def prepare_tsv(self, url, file_md5=None):
data_root = LMUDataRoot()
os.makedirs(data_root, exist_ok=True)
update_flag = False
file_name = url.split('/')[-1]
data_path = osp.join(data_root, file_name)
if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
pass
else:
warnings.warn('The dataset tsv is not downloaded')
download_file(url, data_path)
update_flag = True
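        # TSVs larger than 1 GB are converted to a localized copy via vlmeval.tools.LOCALIZE; the
        # conversion reruns when FORCE_LOCAL is set or the file was just (re)downloaded.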
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
def dump_image(self, line):
os.makedirs(self.img_root, exist_ok=True)
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(self.img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
def display(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
assert isinstance(line, pd.Series) or isinstance(line, dict)
mmqa_display(line)
# Return a list of dataset names that are supported by this class, can override
@classmethod
def supported_datasets(cls):
return list(cls.DATASET_URL)
# Given the dataset name, return the dataset as a pandas dataframe, can override
def load_data(self, dataset):
url = self.DATASET_URL[dataset]
file_md5 = self.DATASET_MD5[dataset] if dataset in self.DATASET_MD5 else None
return self.prepare_tsv(url, file_md5)
    # Post-build hook, called after the dataset is built, can override
def post_build(self, dataset):
pass
# Given one data record, return the built prompt (a multi-modal message), can override
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=question))
return msgs
# Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
@abstractmethod
def evaluate(self, eval_file, **judge_kwargs):
pass
from .image_base import ImageBaseDataset
from ..smp import *
class COCO_Caption_Scorer():
def __init__(self, ref, gt):
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
self.ref = ref
self.gt = gt
print('setting up scorers...')
self.scorers = [
(Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']),
(Rouge(), 'ROUGE_L'),
(Cider(), 'CIDEr'),
]
def compute_scores(self):
total_scores = {}
for scorer, method in self.scorers:
print('computing %s score...' % (scorer.method()))
score, scores = scorer.compute_score(self.gt, self.ref)
if isinstance(method, list):
for sc, scs, m in zip(score, scores, method):
print('%s: %0.3f' % (m, sc * 100))
total_scores['Bleu'] = [x * 100 for x in score]
else:
print('%s: %0.3f' % (method, score * 100))
total_scores[method] = score * 100
print('*****DONE*****')
for key, value in total_scores.items():
print('{}:{}'.format(key, value))
return total_scores
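# Usage sketch (hypothetical captions): both `ref` (predictions) and `gt` (references) map an id to a
# list of strings, e.g.
#   COCO_Caption_Scorer(ref={'0': ['a dog on the grass']}, gt={'0': ['a dog runs on the grass']}).compute_scores()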
class ImageCaptionDataset(ImageBaseDataset):
TYPE = 'Caption'
DATASET_URL = {
'COCO_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/COCO_VAL.tsv',
}
DATASET_MD5 = {
'COCO_VAL': '72a5079dead060269ac222c5aa5128af',
}
def load_data(self, dataset):
data = super().load_data(dataset)
if 'question' not in data:
data['question'] = [(
'Please describe this image in general. Directly provide the description, '
'do not include prefix like "This image depicts". '
)] * len(data)
return data
# It returns a dictionary of scores
@classmethod
def evaluate(self, eval_file, **kwargs):
data = load(eval_file)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
ref, gt = {}, {}
for i, line in enumerate(lines):
ref[str(i)] = [str(line['prediction'])]
gt[str(i)] = eval(line['answer'])
scorer = COCO_Caption_Scorer(ref, gt)
coco_caption_score_dict = scorer.compute_scores()
score_pth = eval_file.replace('.xlsx', '_score.json')
dump(coco_caption_score_dict, score_pth)
return coco_caption_score_dict
import warnings
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..smp import *
MMMB_URLS = {
'MMMB_ar': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_ar.tsv',
'MMMB_cn': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_cn.tsv',
'MMMB_en': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_en.tsv',
'MMMB_pt': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_pt.tsv',
'MMMB_ru': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_ru.tsv',
'MMMB_tr': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_tr.tsv',
}
MTL_MMBench_URLS = {
'MMBench_dev_ar': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_ar.tsv',
'MMBench_dev_cn': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_cn.tsv',
'MMBench_dev_en': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_en.tsv',
'MMBench_dev_pt': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_pt.tsv',
'MMBench_dev_tr': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_tr.tsv',
'MMBench_dev_ru': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_ru.tsv',
}
MMMB_MD5 = {
'MMMB_ar': 'f3a18b6385f1d9701840aa42de27aead', 'MMMB_cn': '13ed82fa89730037292fcaa27f08f430',
'MMMB_en': '1cd781a71ec5a2983c090b84105d6a01', 'MMMB_pt': '548ea2b3bb2da991790386f0015d30d1',
'MMMB_ru': 'ce1cc8a0533425ab0d86b326ebfc2984', 'MMMB_tr': '0733739d43090327975294292bc5cd67'
}
MTL_MMBench_MD5 = {
'MMBench_dev_ar': '4271b4a0d0200e1a86380a878e0d64a4', 'MMBench_dev_cn': '2ed5135326fed02c8e51ea50dda8222f',
'MMBench_dev_en': 'd9ab776fc018b3d45785e9a5c23431c2', 'MMBench_dev_pt': '4ddfbcd27ef12444b908c03831cd0295',
'MMBench_dev_tr': '4fab39d501389d3d6cc90264bb708f11', 'MMBench_dev_ru': '5ba1171ff2e68f80637bf78349e402a5'
}
class ImageMCQDataset(ImageBaseDataset):
TYPE = 'MCQ'
DATASET_URL = {
# MMBench v1.0
'MMBench_DEV_EN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_EN.tsv',
'MMBench_TEST_EN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_EN.tsv',
'MMBench_DEV_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_CN.tsv',
'MMBench_TEST_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_CN.tsv',
'MMBench': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench.tsv', # Internal Only
'MMBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN.tsv', # Internal Only
# MMBench v1.1
'MMBench_DEV_EN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_EN_V11.tsv',
'MMBench_TEST_EN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_EN_V11.tsv',
'MMBench_DEV_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_CN_V11.tsv',
'MMBench_TEST_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_CN_V11.tsv',
'MMBench_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_V11.tsv', # Internal Only
'MMBench_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN_V11.tsv', # Internal Only
# SEEDBench Series
'SEEDBench_IMG': 'https://opencompass.openxlab.space/utils/VLMEval/SEEDBench_IMG.tsv',
'SEEDBench2': 'https://huggingface.co/datasets/VLMEval/SEEDBench2/resolve/main/SEEDBench2.tsv',
'SEEDBench2_Plus': 'https://opencompass.openxlab.space/utils/VLMEval/SEEDBench2_Plus.tsv',
# ScienceQA Series
'ScienceQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/ScienceQA_VAL.tsv',
'ScienceQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/ScienceQA_TEST.tsv',
# MMT-Bench
'MMT-Bench_ALL_MI': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_ALL_MI.tsv',
'MMT-Bench_ALL': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_ALL.tsv',
'MMT-Bench_VAL_MI': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_VAL_MI.tsv',
'MMT-Bench_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_VAL.tsv',
# AesBench
'AesBench_VAL': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_VAL.tsv',
'AesBench_TEST': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_TEST.tsv',
# Q-Bench1
'Q-Bench1_VAL': 'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_VAL.tsv',
'Q-Bench1_TEST': 'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_TEST.tsv',
# A-Bench
'A-Bench_VAL': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_VAL.tsv',
'A-Bench_TEST': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_TEST.tsv',
# Other Benchmarks
'CCBench': 'https://opencompass.openxlab.space/utils/VLMEval/CCBench.tsv',
'AI2D_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST.tsv',
'AI2D_TEST_NO_MASK': 'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST_NO_MASK.tsv',
'MMStar': 'https://opencompass.openxlab.space/utils/VLMEval/MMStar.tsv',
'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv',
'MLLMGuard_DS': 'https://opencompass.openxlab.space/utils/VLMEval/MLLMGuard_DS.tsv',
'BLINK': 'https://opencompass.openxlab.space/utils/VLMEval/BLINK.tsv',
'TaskMeAnything_v1_imageqa_random': (
'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random/'
'resolve/main/TaskMeAnything-v1-imageqa-random.tsv'
)
}
DATASET_MD5 = {
# MMBench v1.0
'MMBench_DEV_EN': 'b6caf1133a01c6bb705cf753bb527ed8',
'MMBench_TEST_EN': '6939fadb0ce626fefc0bdc9c64efc528',
'MMBench_DEV_CN': '08b8fc3324a5ed74155350f57be69fbd',
'MMBench_TEST_CN': '7e1239baf0ee4c8b513e19705a0f317e',
'MMBench': '4115aea3383f3dd0083be6a633e0f820', # Internal Only
'MMBench_CN': '2e053ffc90ea598b1feae13c36dc13ee', # Internal Only
# MMBench v1.1
'MMBench_DEV_EN_V11': '30c05be8f2f347a50be25aa067248184',
'MMBench_TEST_EN_V11': '26f0f15381a21720255091d3e0316ce6',
'MMBench_DEV_CN_V11': '593f9b5f6bea453d870a798b34ae4f37',
'MMBench_TEST_CN_V11': '74bbe4556dac745613c7cbe5ad787050',
'MMBench_V11': 'b9276414f57af1308dcc4d0cd9b42e7c', # Internal Only
'MMBench_CN_V11': '95f6980dd1b4de38e3cbffe0305a3f25', # Internal Only
# SEEDBench
'SEEDBench_IMG': '68017231464752261a2526d6ca3a10c0',
'SEEDBench2': '4ec15cf864c4f16274112284f531813e',
'SEEDBench2_Plus': 'e32d3216dc4f452b0fe497a52015d1fd',
# ScienceQA
'ScienceQA_VAL': '96320d05e142e585e7204e72affd29f3',
'ScienceQA_TEST': 'e42e9e00f9c59a80d8a5db35bc32b71f',
# MMT-Bench
'MMT-Bench_ALL_MI': '5272157097e19cdd7cb41e412ab3b7c7',
'MMT-Bench_ALL': 'b273a2f4c596fe4f2605de0494cd632f',
'MMT-Bench_VAL_MI': 'c7d7b998eb5cd9aa36c7d4f721472462',
'MMT-Bench_VAL': '8dd4b730f53dbf9c3aed90ca31c928e0',
# AesBench
'AesBench_VAL': '3edb0c319e9187aa0b97fe7a11700a8c',
'AesBench_TEST': '58b1f7ba2cc32e1d68896d6ee716bbf8',
# Q-Bench1
'Q-Bench1_VAL': '837bdb6cd2da571713543462815187b7',
'Q-Bench1_TEST': '15e759bfd58c9d5f30b23a317d347153',
# A-Bench
'A-Bench_VAL': '218563ec50d34bb336c814143a5bb9c1',
'A-Bench_TEST': '567013fb033a20cf23f51d8e865bd16c',
# Other Benchmarks
'CCBench': 'f5dde47f24dc5a6fb6e595b409b466ac',
'AI2D_TEST': '0f593e0d1c7df9a3d69bf1f947e71975',
'AI2D_TEST_NO_MASK': 'fd8f463634d4fe9fbd23b876e8eea5be',
'MMStar': 'e1ecd2140806c1b1bbf54b43372efb9e',
'RealWorldQA': '92321028d2bc29040284b6674721e48f',
'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f',
'BLINK': '3b6649b6a662184ea046908e5506260e',
'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889'
}
DATASET_URL.update(MMMB_URLS)
DATASET_URL.update(MTL_MMBench_URLS)
DATASET_MD5.update(MMMB_MD5)
DATASET_MD5.update(MTL_MMBench_MD5)
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
prompt += 'Please select the correct answer from the options above. \n'
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import report_acc, report_acc_MMT, mcq_circular_eval, mcq_vanilla_eval
# assert dataset is not None
dataset_map = {
'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11',
'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11'
}
dataset = self.dataset_name
if dataset in dataset_map:
dataset = dataset_map[dataset]
nproc = judge_kwargs.pop('nproc', 4)
circular = False
if listinstr(['mmbench', 'ccbench'], dataset.lower()):
data = load(eval_file)
data['index'] = [int(x) for x in data['index']]
dump(data, eval_file)
circular = True
suffix = eval_file.split('.')[-1]
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
        # Lower-case all column names except the single-letter choice labels (A, B, ...)
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
meta = self.data
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
)
if circular:
data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name)
else:
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
# load split
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
# May have different report acc functions for different datasets
if 'MMT' in dataset:
acc = report_acc_MMT(data)
else:
acc = report_acc(data)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(acc, score_file)
if dataset == 'AesBench_VAL':
            warnings.warn(
                'Note that AesBench VAL is just a toy version of AesBench TEST. For full results, '
                'please evaluate on AesBench TEST. The AesBench TEST dataset is more than 20 times '
                'larger than the VAL dataset and the leaderboard results are based on AesBench TEST.'
            )
return acc
class MMMUDataset(ImageMCQDataset):
DATASET_URL = {
'MMMU_DEV_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv',
'MMMU_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_TEST.tsv',
}
DATASET_MD5 = {
'MMMU_DEV_VAL': '521afc0f3bf341e6654327792781644d',
'MMMU_TEST': 'c19875d11a2d348d07e5eb4bdf33166d',
}
@staticmethod
def split_MMMU(msgs):
text, images = None, []
for s in msgs:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
assert text is None
text = s['value']
text_segs = text.split('<image ')
if len(text_segs) == 1:
return msgs
segs = [dict(type='text', value=text_segs[0])]
for i, seg in enumerate(text_segs):
if i == 0:
continue
assert istype(seg[0], int) and seg[1] == '>'
image_idx = int(seg[0]) - 1
segs.append(dict(type='image', value=images[image_idx]))
segs.append(dict(type='text', value=seg[2:]))
return segs
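    # split_MMMU example (sketch): a prompt 'Compare <image 1> and <image 2>.' is rebuilt as
    # [text 'Compare ', image_1, text ' and ', image_2, text '.'], interleaving each image at its marker.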
def build_prompt(self, line):
msgs = super().build_prompt(line)
msgs = self.split_MMMU(msgs)
return msgs
class MUIRDataset(ImageMCQDataset):
DATASET_URL = {
'MUIRBench': 'http://opencompass.openxxlab.com/utils/VLMEval/MUIRBench.tsv'
}
DATASET_MD5 = {
'MUIRBench': '2e5e6fd7699761b08a7cb3ab8c0c2ec8'
}
@staticmethod
def split_MUIR(msgs):
text, images = None, []
# Separate images and text from msgs
for s in msgs:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
assert text is None # Ensure only one text entry is expected
text = s['value']
# Split text by <image> tags
text_segs = text.split('<image>')
# Initialize the segments list
segs = []
# Iterate through the text segments and images
for i, seg in enumerate(text_segs):
# Append the image if this is not the first segment and there are still images left
if i > 0 and i - 1 < len(images):
segs.append(dict(type='image', value=images[i - 1]))
# Append the text segment (if it's non-empty)
if len(seg) > 0:
segs.append(dict(type='text', value=seg))
return segs
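    # split_MUIR example (sketch): 'What differs between <image> and <image>?' becomes
    # [text 'What differs between ', image_1, text ' and ', image_2, text '?'], one image per <image> tag.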
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
# options_prompt = ''
options_prompt = '\n'.join([f'{key}. {item}' for key, item in options.items()])
# for key, item in options.items():
# options_prompt += f'{key}. {item}\n'
prompt = ''
prompt += f'{question}\n'
if len(options):
prompt += options_prompt
prompt += "\nAnswer with the option's letter from the given choices directly."
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
msgs = self.split_MUIR(msgs)
return msgs
class GMAIMMBenchDataset(ImageMCQDataset):
DATASET_URL = {
'GMAI-MMBench_VAL': 'https://huggingface.co/datasets/VLMEval/GMAI-MMBench/resolve/main/GMAI-MMBench_VAL.tsv'
}
DATASET_MD5 = {
'GMAI-MMBench_VAL': '254bd581627866f1c499d3d6b4422324'
}
def report_acc_by_groups(self, df, group_column):
res = defaultdict(list)
# Check for the 'split' column
if 'split' in df:
splits = list(set(df['split']))
res['split'] = splits
else:
df['split'] = ['none'] * len(df)
res['split'] = ['none']
res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']]
if group_column not in df:
raise ValueError(f"Column '{group_column}' not found in dataframe.")
abilities = list(set(df[group_column]))
abilities = ['None' if isinstance(ab, float) and pd.isna(ab) else ab for ab in abilities]
abilities.sort()
for ab in abilities:
ab_name = ab
sub_df = df[df[group_column] == ab]
res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]
return pd.DataFrame(res)
def evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import report_acc, mcq_vanilla_eval
nproc = judge_kwargs.pop('nproc', 4)
suffix = eval_file.split('.')[-1]
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
        # Lower-case all column names except the single-letter choice labels (A, B, ...)
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
meta = self.data
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
)
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
# load split
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
acc = report_acc(data)
for group_col in ['clinical vqa task', 'department', 'perceptual granularity']:
acc_grouped = self.report_acc_by_groups(data, group_col)
score_file_grouped = eval_file.replace(f'.{suffix}', f'_{group_col}_acc.csv')
dump(acc_grouped, score_file_grouped)
return acc
class MMERealWorld(ImageMCQDataset):
TYPE = 'MMERealWorld'
DATASET_MD5 = {
'MME-RealWorld': '271c33ec814c39533c467ec6fb8a6f36',
'MME-RealWorld-CN': 'daaa763d52a760a38606d5dedb3fe444',
}
SYS = {
'MME-RealWorld': (
'Select the best answer to the above multiple-choice question based on the image. '
'Respond with only the letter (A, B, C, D, or E) of the correct option. \n'
'The best answer is:'
),
'MME-RealWorld-CN': (
'根据图像选择上述多项选择题的最佳答案。只需回答正确选项的字母(A, B, C, D 或 E)。\n'
'最佳答案为:'
),
}
@classmethod
def supported_datasets(cls):
return ['MME-RealWorld', 'MME-RealWorld-CN']
def load_data(self, dataset='MME-RealWorld', repo_id='yifanzhang114/MME-RealWorld-Base64'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset}.tsv')
if not os.path.exists(data_file):
return False
if md5(data_file) != self.DATASET_MD5[dataset]:
return False
return True
def generate_tsv(pth):
tsv_file = os.path.join(pth, f'{dataset}.tsv')
if os.path.exists(tsv_file):
print(f'{tsv_file} already exists.')
return
json_dir = os.path.join(pth, dataset)
json_files = [f for f in os.listdir(json_dir) if f.endswith('.json')]
data_list = []
for json_file in json_files:
with open(os.path.join(json_dir, json_file), 'r') as f:
data = json.load(f)
for item in tqdm(data):
choice_prompt = 'The choices are listed below:\n' if dataset == 'MME-RealWorld' else '选项如下所示:\n'
data_list.append({
'index': item['index'],
'image': item['image'],
'question': item['question'],
'multi-choice options': choice_prompt + '\n'.join(item['multi-choice options']),
'A': item['multi-choice options'][0][4:],
'B': item['multi-choice options'][1][4:],
'C': item['multi-choice options'][2][4:],
'D': item['multi-choice options'][3][4:],
'E': item['multi-choice options'][4][4:],
'answer': item['answer'],
'category': item['category'],
'l2-category': item['l2-category']
})
df = pd.DataFrame(data_list)
df.to_csv(tsv_file, sep='\t', index=False)
print(f'TSV file saved to {tsv_file}')
# Check if dataset is cached and has integrity
update_flag = False
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
print(f'Using cached dataset from {cache_path}')
else:
from huggingface_hub import snapshot_download
# Download or find the dataset path
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
generate_tsv(dataset_path)
update_flag = True
data_path = os.path.join(dataset_path, f'{dataset}.tsv')
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
from vlmeval.tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
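    # A minimal usage sketch (constructor signature assumed from the ImageMCQDataset
    # parent; load_data is normally invoked by the base-class constructor):
    #   ds = MMERealWorld('MME-RealWorld')
    #   df = ds.data   # DataFrame built from the TSV generated / cached above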
def post_build(self, dataset):
self.TYPE = 'MMERealWorld'
    # Given one data record, return the built prompt (a multi-modal message); subclasses may override this.
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
choice_prompt = line['multi-choice options'] + '\n'
question += ' ' + choice_prompt + self.SYS[self.dataset_name]
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=question))
return msgs
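    # Illustrative shape of the list returned by build_prompt above (values are
    # made up, not taken from the dataset):
    #   [{'type': 'image', 'value': '/path/to/xxx.jpg'},
    #    {'type': 'text', 'value': 'What is ...? The choices are listed below:\n(A) ... The best answer is:'}]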
    # Returns a dictionary of aggregated ratings (see get_dimension_rating)
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import extract_characters_regex, get_dimension_rating
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
FAIL_MSG = 'Failed to obtain answer via API.'
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
tgt_file = eval_file.replace('.xlsx', '_rating.json')
score_file = eval_file.replace('.xlsx', '_score.xlsx')
if not osp.exists(score_file):
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if FAIL_MSG not in v}
data = load(eval_file)
cnt_rejected = 0
data_un = data[~pd.isna(data['prediction'])]
for idx in data['index']:
ans = data.loc[data['index'] == idx, 'answer'].values[0]
pred = data.loc[data['index'] == idx, 'prediction'].values[0]
extract_pred = extract_characters_regex(pred)
if extract_pred == '':
cnt_rejected += 1
data.loc[data['index'] == idx, 'score'] = 0
else:
data.loc[data['index'] == idx, 'score'] = int(extract_pred == ans)
print(
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
f'failed to obtain the score for another {cnt_rejected} questions. '
f'Those questions will be counted as 0 score in ALL rating.'
)
dump(data, score_file)
rating = get_dimension_rating(score_file)
dump(rating, tgt_file)
return rating
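        # Files produced by this evaluation (paths derived from eval_file):
        #   *_score.xlsx  -- per-question 0/1 scores; missing or unparsable predictions count as 0
        #   *_rating.json -- aggregated ratings returned by get_dimension_rating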
class HRBenchDataset(ImageMCQDataset):
DATASET_URL = {
'HRBench4K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_4k.tsv',
'HRBench8K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_8k.tsv',
}
DATASET_MD5 = {
'HRBench4K': 'f6b041b03d49543494b8a56d2e35be65',
'HRBench8K': '274c9c7f89329b804a4723178a00219c',
}
def evaluate(self, eval_file, **judge_kwargs):
assert os.path.exists(eval_file), '{} does not exist!'.format(eval_file)
from .utils.multiple_choice import mcq_vanilla_eval
from .utils.hrbench import report_acc_hrbench
nproc = judge_kwargs.pop('nproc', 4)
suffix = eval_file.split('.')[-1]
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
        # Lowercase all column names except single-letter choice labels (A, B, C, ...)
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
meta = self.data
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
if osp.exists(score_file):
acc = load(score_file)
return acc
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
acc = report_acc_hrbench(data)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(acc, score_file)
return acc
class CustomMCQDataset(ImageMCQDataset):
def load_data(self, dataset):
data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
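    # A minimal usage sketch (the path is an assumption: LMUDataRoot() typically
    # resolves to ~/LMUData unless overridden, so a custom set named 'MyMCQ' is
    # read from ~/LMUData/MyMCQ.tsv):
    #   ds = CustomMCQDataset('MyMCQ')
    #   print(ds.data.iloc[0]['question'])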
from .image_base import ImageBaseDataset
from .utils.judge_util import build_judge
from ..smp import *
from ..utils import track_progress_rich
class ImageMTDataset(ImageBaseDataset):
TYPE = 'MT'
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
questions = toliststr(line['question'])
if 'answer' in line:
answers = toliststr(line['answer'])
else:
answers = [''] * len(questions)
assert len(questions) == len(answers)
dlgs, pics_number = [], 0
for i in range(len(questions)):
q, a = questions[i], answers[i]
if '<ImageHere>' in q:
content = []
tag_number = q.count('<ImageHere>')
images = tgt_path[pics_number: pics_number + tag_number]
pics_number += tag_number
q_split = q.split('<ImageHere>')
                for j in range(tag_number):
                    qsp, im = q_split[j], images[j]
if qsp != '':
content.append(dict(type='text', value=qsp))
content.append(dict(type='image', value=im))
if q_split[-1] != '':
content.append(dict(type='text', value=q_split[-1]))
else:
content = [dict(type='text', value=q)]
dlgs.append(dict(role='user', content=content))
assert '<ImageHere>' not in a, 'We currently do not support images in the answer. '
content = [dict(type='text', value=a)]
dlgs.append(dict(role='assistant', content=content))
return dlgs
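    # Illustrative shape of the dialog list returned above for a one-turn question
    # 'Describe <ImageHere> briefly.' (values are made up):
    #   [{'role': 'user', 'content': [{'type': 'text', 'value': 'Describe '},
    #                                 {'type': 'image', 'value': '/path/to/img.jpg'},
    #                                 {'type': 'text', 'value': ' briefly.'}]},
    #    {'role': 'assistant', 'content': [{'type': 'text', 'value': '<answer text>'}]}]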
class MMDUDataset(ImageMTDataset):
DATASET_URL = {'MMDU': 'https://opencompass.openxlab.space/utils/VLMEval/MMDU.tsv'}
DATASET_MD5 = {'MMDU': '848b635a88a078f49aebcc6e39792061'}
DIMS = [
'Creativity', 'Richness', 'Visual Perception', 'Logical Coherence',
'Answer Accuracy', 'Image Relationship Understanding', 'Overall Score'
]
def calculat_metric(self, ans):
all = defaultdict(lambda: 0)
tot = defaultdict(lambda: 0)
valid = defaultdict(lambda: 0)
        for key in ans:
            res = ans[key]['res']
assert isinstance(res, pd.DataFrame)
lt = len(res)
for i in range(lt):
line = res.iloc[i]
for k in self.DIMS:
tot[k] += 1
if k in line and line[k] is not None:
try:
score = int(line[k])
score = np.clip(score, 0, 10)
all[k] += score
valid[k] += 1
except Exception as e:
print(f'Failed to parse the score: {str(e)}')
sp1 = {'set': 'all'}
sp1.update({k: all[k] / tot[k] * 10 for k in self.DIMS})
sp2 = {'set': 'valid'}
sp2.update({k: all[k] / valid[k] * 10 for k in self.DIMS})
return pd.DataFrame([sp1, sp2])
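    # Illustrative output of calculat_metric (numbers are made up): 'all' averages over
    # every judged answer, 'valid' only over answers whose score parsed; per-sample
    # scores are clipped to 0-10, so the * 10 rescales the averages to a 0-100 range.
    #        set  Creativity  Richness  ...  Overall Score
    #   0    all        62.5      58.0  ...           60.3
    #   1  valid        66.1      61.2  ...           63.8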
def evaluate(self, eval_file, **judge_kwargs):
suffix = eval_file.split('.')[-1]
        model = judge_kwargs.get('model', 'gpt-4o')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
        nproc = judge_kwargs.pop('nproc', 4)
        data = load(eval_file)
        judge_kwargs.pop('model', None)
        judge_model = build_judge(model=model, **judge_kwargs)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(judge_model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
from .utils.mmdu import mmdu_score
if len(indices):
new_results = track_progress_rich(
mmdu_score,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,)
ans = load(tmp_file)
for k, v in zip(indices, new_results):
assert k in ans
metric = self.calculat_metric(ans)
dump(metric, score_file)
return metric