from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
import os
import re
import json
from PIL import Image
import base64
from io import BytesIO
class ChatResponse(dict):
def __getattr__(self, name):
value = self.get(name)
if isinstance(value, dict):
return ChatResponse(value)  # if the value is a dict, recursively wrap it so attribute access keeps working
elif isinstance(value, list):
return [ChatResponse(v) if isinstance(v, dict) else v for v in value]  # if the value is a list, wrap any nested dicts
return value
def __setattr__(self, name, value):
self[name] = value
def __delattr__(self, name):
del self[name]
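# Usage sketch: ChatResponse allows attribute-style access into nested response dicts, e.g.
#   ChatResponse({'choices': [{'message': {'content': 'hi'}}]}).choices[0].message.content  # -> 'hi'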
from ..dataset import DATASET_TYPE
class TaichuVLWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'Taichu-VL-2B',
retry: int = 5,
wait: int = 5,
verbose: bool = True,
temperature: float = 0.0,
system_prompt: str = None,
max_tokens: int = 4096,
key: str = None,
url: str = None,
**kwargs):
self.model = model
self.kwargs = kwargs
self.max_tokens = max_tokens
self.system_prompt = '[sys]You are a helpful assistant.[/sys]'
self.hint_prompt = '|<Hint>|'
self.mcq_prompt = '|<MCQ>|'
self.datasets_use_system = ['MMVet']
self.datasets_use_multichoice = [
'MathVista', 'MathVision']
openai_key = os.environ.get('OPENAI_API_KEY', None)
# Env vars are strings, so parse truthy / falsy values explicitly
use_openai = str(os.environ.get('USE_OPENAI_EVAL', True)).lower() not in ('0', 'false', 'no')
self.use_openai_evaluate = isinstance(openai_key, str) and openai_key.startswith('sk-') and use_openai
self.api_key = os.environ.get('TAICHU_API_KEY', key)
self.api_url = url
assert self.api_key is not None, 'Please set the API Key'
super().__init__(wait=wait, retry=retry, system_prompt=self.system_prompt, verbose=verbose, **kwargs)
def set_dump_image(self, dump_image_func):
self.dump_image_func = dump_image_func
def dump_image(self, line, dataset):
return self.dump_image_func(line)
def use_custom_prompt(self, dataset):
if listinstr(['MCQ', 'VQA'], DATASET_TYPE(dataset)):
return True
elif dataset is not None and listinstr(['HallusionBench'], dataset):
return True
return False
def clear_prompt(self, prompt):
prompt = re.sub(r"Hint:.*?Question:", "", prompt, flags=re.S).strip()
prompt = re.sub(r"\nChoices:\n.*", "", prompt, flags=re.S).strip()
return prompt
def encode_image(self, pil_image):
buffer = BytesIO()
pil_image.save(buffer, format='PNG')
base64_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
return base64_str
def build_prompt(self, line, dataset=None):
if isinstance(line, int):
line = self.data.iloc[line]
tgt_path = self.dump_image(line, dataset)
question = line['question']
hint = None
if listinstr(self.datasets_use_system, dataset):
system_prompt = self.system_prompt
else:
system_prompt = ''
mcq = False
if DATASET_TYPE(dataset) == 'MCQ' or listinstr(self.datasets_use_multichoice, dataset):
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
if listinstr(self.datasets_use_multichoice, dataset):
options = {}
if not pd.isna(line['choices']):
for i, c in enumerate(eval(line['choices'])):
options[string.ascii_uppercase[i]] = c
question = self.clear_prompt(question)
# Use a Chinese options header for Chinese dataset variants
if listinstr(['_CN', '_cn'], dataset):
options_prompt = '\n选项:\n'
else:
options_prompt = '\nOPTIONS:\n'
options_prompt += '\n'.join(f"{key}:{value}" for key, value in options.items())
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
mcq = True if len(options) else False
if len(options):
prompt = question + options_prompt
else:
prompt = question
else:
prompt = question
msgs = []
if system_prompt:
msgs.append(dict(type='text', value=system_prompt))
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs.append(dict(type='image', value=tgt_path))
if hint:
prompt = 'Hint: ' + hint + '\n' + prompt
msgs.append(dict(type='text', value=prompt))
if mcq:
msgs.append(dict(type='text', value=self.mcq_prompt))
return msgs
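# For reference, build_prompt returns interleaved VLMEvalKit messages such as the
# following sketch (paths and text are illustrative):
#   [dict(type='image', value='/path/to/img_0.jpg'),
#    dict(type='text', value='Hint: ...\nQuestion ...\nOPTIONS:\nA:...'),
#    dict(type='text', value='|<MCQ>|')]   # marker consumed by prompt_to_request_messages below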
def prompt_to_request_messages(self, inputs):
messages = [
{'role': 'user', 'content': []}
]
is_mcq = False
for x in inputs:
if x['type'] == 'text':
if x['value'] == self.system_prompt:
messages = [{'role': 'system', 'content': [{"type": "text", "text": x['value']}]}] + messages
elif self.mcq_prompt == x['value']:
is_mcq = True
else:
messages[-1]['content'].append(
{"type": "text", "text": x['value']},
)
if x['type'] == 'image':
_url = self.encode_image(Image.open(x['value']))
messages[-1]['content'].append(
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{_url}"}},
)
else:
continue
return messages, is_mcq
def generate_inner(self, inputs, **kwargs) -> str:
messages, is_mcq = self.prompt_to_request_messages(inputs)
data = {
"model": self.model,
"messages": messages,
"max_tokens": self.max_tokens,
"temperature": 0,
"top_p": 0.8,
"stream": False,
"extra_body": {
"repetition_penalty": 1
}
}
headers = {
'Authorization': self.api_key,
'Content-Type': 'application/json'
}
try:
chat_response = requests.post(self.api_url, json=data, headers=headers)
response = ChatResponse(json.loads(chat_response.content))
result = response.choices[0].message.content
# When the ChatGPT judge is unavailable for MCQ answer extraction, fall back to
# exact matching: take the first character of the response as the option letter.
if not self.use_openai_evaluate and is_mcq:
try:
result = result[0]
except Exception:
result = 'A'
return 0, result, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(f'{type(err)}: {err}')
self.logger.error(f'The input messages are {inputs}.')
return -1, '', ''
class TaichuVLAPI(TaichuVLWrapper):
def generate(self, message, dataset=None):
return super(TaichuVLAPI, self).generate(message, dataset=dataset)
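# A minimal end-to-end sketch (not part of the evaluation pipeline). It assumes
# TAICHU_API_KEY is exported and that 'demo.jpg' is a locally available image;
# the endpoint URL mirrors the one registered for 'Taichu-VL-2B' in the config below.
if __name__ == '__main__':
    api = TaichuVLAPI(
        model='Taichu-VL-2B',
        url='https://platform.wair.ac.cn/api/v1/infer/10381/v1/chat/completions',
    )
    demo_msgs = [
        dict(type='image', value='demo.jpg'),
        dict(type='text', value='Describe this image briefly.'),
    ]
    print(api.generate(demo_msgs))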
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from vlmeval.dataset import DATASET_TYPE, img_root_map
class TaiyiWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'taiyi',
retry: int = 5,
wait: int = 5,
key: str = None,
verbose: bool = False,
system_prompt: str = None,
temperature: float = 0,
timeout: int = 60,
url: str = "https://taiyi.megvii.com/v1/chat/completions",
max_tokens: int = 1024,
**kwargs):
self.model = model
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
if key is None:
key = os.environ.get('TAIYI_API_KEY', None)
assert key is not None, ('Please set the API Key ')
self.key = key
self.timeout = timeout
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
assert url is not None, ('Please set the url ')
self.url = url
self.logger.info(f'Using url: {self.url}; API Key: {self.key}')
def use_custom_prompt(self, dataset):
if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'VQA':
return True
return False
def prepare_inputs(self, inputs):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(role='system', content=self.system_prompt))
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text':
content_list.append(dict(type='text', text=msg['value']))
elif msg['type'] == 'image':
imgbytes = open(msg['value'],'rb').read()
b64 = base64.b64encode(imgbytes).decode('ascii')
img_struct = dict(url=f'data:image/jpeg;base64,{b64}')
content_list.append(dict(type='image_url', image_url=img_struct))
input_msgs.append(dict(role='user', content=content_list))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
input_msgs.append(dict(role='user', content=text))
return input_msgs
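# Shape reference (sketch): for a mixed input such as
#   [dict(type='text', value='What is shown?'), dict(type='image', value='demo.jpg')]
# prepare_inputs produces OpenAI-style chat messages:
#   [{'role': 'user', 'content': [
#       {'type': 'text', 'text': 'What is shown?'},
#       {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,...'}}]}]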
def set_dump_image(self, dump_image_func):
self.dump_image_func = dump_image_func
def dump_image(self, line, dataset):
return self.dump_image_func(line)
def image_first(self, msgs):
nr_img = 0
for s in msgs:
if s['type'] == 'image':
nr_img += 1
if nr_img == 1:
new_msgs = []
img_msg = None
for s in msgs:
if s['type'] == 'text':
new_msgs.append(s)
else:
img_msg = s
new_msgs.insert(0, img_msg)
else:
new_msgs = msgs
return new_msgs
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += '\n请直接回答选项字母。' if cn_string(
prompt) else "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
return prompt
def build_yorn_prompt(self, line, dataset=None):
if listinstr(['HallusionBench'], dataset):
pre_prompt = 'Read the following question carefully, think and solve it step by step.\n\n'
else:
pre_prompt = ''
prompt = pre_prompt + line['question'] + ' Please answer yes or no as the final answer.'
return prompt
def build_vqa_prompt(self, line, dataset=None):
if listinstr(['OCRBench'], dataset):
pre_prompt = 'Carefully identify the text in the image and answer the question.\n\n'
else:
pre_prompt = ''
if listinstr(['MMVet'], dataset):
post_prompt = '\nAnswer this question in detail.'
else:
post_prompt = ''
prompt = pre_prompt + line['question'] + post_prompt
return prompt
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
elif DATASET_TYPE(dataset) == 'Y/N':
prompt = self.build_yorn_prompt(line, dataset)
elif DATASET_TYPE(dataset) == 'VQA':
prompt = self.build_vqa_prompt(line, dataset)
else:
raise RuntimeError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')
message = []
message.extend([dict(type='image', value=s) for s in tgt_path])
message.extend([dict(type='text', value=prompt)])
# Interleaved datasets (MMMU): split the interleaved content, then move the image to the front
if dataset.startswith('MMMU_'):
from .. import MMMUDataset
message = MMMUDataset.split_MMMU(message)
message = self.image_first(message)
return message
def generate_inner(self, inputs, **kwargs) -> str:
input_msgs = self.prepare_inputs(inputs)
temperature = kwargs.pop('temperature', self.temperature)
headers = {'Authorization': f'Bearer {self.key}'}
payload = dict(
model=self.model,
messages=input_msgs,
n=1,
temperature=temperature,
**kwargs)
response = requests.post(self.url, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
try:
resp_struct = json.loads(response.text)
answer = resp_struct['choices'][0]['message']['content'].strip()
except:
pass
return ret_code, answer, response
class TaiyiAPI(TaiyiWrapper):
def generate(self, message, dataset=None):
return super(TaiyiAPI, self).generate(message)
from vlmeval.vlm import *
from vlmeval.api import *
from functools import partial
PandaGPT_ROOT = None
MiniGPT4_ROOT = None
TransCore_ROOT = None
Yi_ROOT = None
OmniLMM_ROOT = None
Mini_Gemini_ROOT = None
VXVERSE_ROOT = None
VideoChat2_ROOT = None
VideoChatGPT_ROOT = None
PLLaVA_ROOT = None
RBDash_ROOT = None
LLAVA_V1_7B_MODEL_PTH = 'Please set your local path to LLaVA-7B-v1.1 here; the model weight is obtained by merging the LLaVA delta weight (https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md) with vicuna-7b-v1.1.'
video_models = {
'Video-LLaVA-7B':partial(VideoLLaVA, model_path='LanguageBind/Video-LLaVA-7B'),
'Video-LLaVA-7B-HF':partial(VideoLLaVA_HF, model_path='LanguageBind/Video-LLaVA-7B-hf'),
'VideoChat2-HD':partial(VideoChat2_HD, model_path='OpenGVLab/VideoChat2_HD_stage4_Mistral_7B', root=VideoChat2_ROOT, config_file='./vlmeval/vlm/video_llm/configs/videochat2_hd.json'),
'Chat-UniVi-7B': partial(Chatunivi, model_path="Chat-UniVi/Chat-UniVi"),
'Chat-UniVi-7B-v1.5': partial(Chatunivi, model_path="Chat-UniVi/Chat-UniVi-7B-v1.5"),
'LLaMA-VID-7B': partial(LLaMAVID, model_path='YanweiLi/llama-vid-7b-full-224-video-fps-1'),
'Video-ChatGPT': partial(VideoChatGPT, model_path='MBZUAI/Video-ChatGPT-7B', dir_root=VideoChatGPT_ROOT),
'PLLaVA-7B': partial(PLLaVA, model_path='ermu2001/pllava-7b', dir_root=PLLaVA_ROOT),
'PLLaVA-13B': partial(PLLaVA, model_path='ermu2001/pllava-13b', dir_root=PLLaVA_ROOT),
'PLLaVA-34B': partial(PLLaVA, model_path='ermu2001/pllava-34b', dir_root=PLLaVA_ROOT),
}
ungrouped = {
'TransCore_M': partial(TransCoreM, root=TransCore_ROOT),
'PandaGPT_13B': partial(PandaGPT, name='PandaGPT_13B', root=PandaGPT_ROOT),
'flamingov2': partial(OpenFlamingo, name='v2', mpt_pth='anas-awadalla/mpt-7b', ckpt_pth='openflamingo/OpenFlamingo-9B-vitl-mpt7b'),
'VisualGLM_6b': partial(VisualGLM, model_path='THUDM/visualglm-6b'),
'mPLUG-Owl2': partial(mPLUG_Owl2, model_path='MAGAer13/mplug-owl2-llama2-7b'),
'mPLUG-Owl3': partial(mPLUG_Owl3, model_path='mPLUG/mPLUG-Owl3-7B-240728'),
'emu2_chat': partial(Emu, model_path='BAAI/Emu2-Chat'),
'OmniLMM_12B': partial(OmniLMM12B, model_path='openbmb/OmniLMM-12B', root=OmniLMM_ROOT),
'MGM_7B': partial(Mini_Gemini, model_path='YanweiLi/MGM-7B-HD', root=Mini_Gemini_ROOT),
'Bunny-llama3-8B': partial(BunnyLLama3, model_path='BAAI/Bunny-v1_1-Llama-3-8B-V'),
'VXVERSE': partial(VXVERSE, model_name='XVERSE-V-13B', root=VXVERSE_ROOT),
'paligemma-3b-mix-448': partial(PaliGemma, model_path='google/paligemma-3b-mix-448'),
'360VL-70B': partial(QH_360VL, model_path='qihoo360/360VL-70B'),
'Llama-3-MixSenseV1_1': partial(LLama3Mixsense, model_path='Zero-Vision/Llama-3-MixSenseV1_1'),
'Parrot': partial(Parrot, model_path='AIDC-AI/Parrot-7B'),
'OmChat': partial(OmChat, model_path='omlab/omchat-v2.0-13B-single-beta_hf'),
'RBDash_72b': partial(RBDash, model_path='RBDash-Team/RBDash-v1.5', root=RBDash_ROOT),
'Pixtral-12B': partial(Pixtral, model_path='mistralai/Pixtral-12B-2409'),
'Falcon2-VLM-11B': partial(Falcon2VLM, model_path='tiiuae/falcon-11B-vlm')
}
api_models = {
# GPT
'GPT4V': partial(GPT4V, model='gpt-4-1106-vision-preview', temperature=0, img_size=512, img_detail='low', retry=10, verbose=False),
'GPT4V_HIGH': partial(GPT4V, model='gpt-4-1106-vision-preview', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
'GPT4V_20240409': partial(GPT4V, model='gpt-4-turbo-2024-04-09', temperature=0, img_size=512, img_detail='low', retry=10, verbose=False),
'GPT4V_20240409_HIGH': partial(GPT4V, model='gpt-4-turbo-2024-04-09', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
'GPT4o': partial(GPT4V, model='gpt-4o-2024-05-13', temperature=0, img_size=512, img_detail='low', retry=10, verbose=False),
'GPT4o_HIGH': partial(GPT4V, model='gpt-4o-2024-05-13', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
'GPT4o_20240806': partial(GPT4V, model='gpt-4o-2024-08-06', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
'GPT4o_20241120': partial(GPT4V, model='gpt-4o-2024-11-20', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
'GPT4o_MINI': partial(GPT4V, model='gpt-4o-mini-2024-07-18', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
# Gemini
'GeminiPro1-0': partial(GeminiProVision, model='gemini-1.0-pro', temperature=0, retry=10), # now GeminiPro1-0 is only supported by vertex backend
'GeminiPro1-5': partial(GeminiProVision, model='gemini-1.5-pro', temperature=0, retry=10),
'GeminiFlash1-5': partial(GeminiProVision, model='gemini-1.5-flash', temperature=0, retry=10),
'GeminiFlash2-0': partial(GeminiProVision, model='gemini-2.0-flash-exp', temperature=0, retry=10),
'GeminiPro1-5-002': partial(GPT4V, model='gemini-1.5-pro-002', temperature=0, retry=10), # Internal Use Only
'GeminiFlash1-5-002': partial(GPT4V, model='gemini-1.5-flash-002', temperature=0, retry=10), # Internal Use Only
# Qwen-VL
'QwenVLPlus': partial(QwenVLAPI, model='qwen-vl-plus', temperature=0, retry=10),
'QwenVLMax': partial(QwenVLAPI, model='qwen-vl-max', temperature=0, retry=10),
# Reka
'RekaEdge': partial(Reka, model='reka-edge-20240208'),
'RekaFlash': partial(Reka, model='reka-flash-20240226'),
'RekaCore': partial(Reka, model='reka-core-20240415'),
# Step1V
'Step1V': partial(GPT4V, model='step-1v-32k', api_base="https://api.stepfun.com/v1/chat/completions", temperature=0, retry=10, img_size=-1, img_detail='high'),
'Step1.5V-mini': partial(GPT4V, model='step-1.5v-mini', api_base="https://api.stepfun.com/v1/chat/completions", temperature=0, retry=10, img_size=-1, img_detail='high'),
# Yi-Vision
'Yi-Vision': partial(GPT4V, model='yi-vision', api_base="https://api.lingyiwanwu.com/v1/chat/completions", temperature=0, retry=10),
# Claude
'Claude3V_Opus': partial(Claude3V, model='claude-3-opus-20240229', temperature=0, retry=10, verbose=False),
'Claude3V_Sonnet': partial(Claude3V, model='claude-3-sonnet-20240229', temperature=0, retry=10, verbose=False),
'Claude3V_Haiku': partial(Claude3V, model='claude-3-haiku-20240307', temperature=0, retry=10, verbose=False),
'Claude3-5V_Sonnet': partial(Claude3V, model='claude-3-5-sonnet-20240620', temperature=0, retry=10, verbose=False),
'Claude3-5V_Sonnet_20241022': partial(Claude3V, model='claude-3-5-sonnet-20241022', temperature=0, retry=10, verbose=False),
# GLM4V
'GLM4V': partial(GLMVisionAPI, model='glm4v-biz-eval', temperature=0, retry=10),
'GLM4V_PLUS': partial(GLMVisionAPI, model='cogvlm-evaluation-241203', temperature=0, retry=10),
# MiniMax abab
'abab6.5s': partial(GPT4V, model='abab6.5s-chat', api_base='https://api.minimax.chat/v1/chat/completions', temperature=0, retry=10),
'abab7-preview': partial(GPT4V, model='abab7-chat-preview', api_base='https://api.minimax.chat/v1/chat/completions', temperature=0, retry=10),
# CongRong
'CloudWalk': partial(CWWrapper, model='cw-congrong-v1.5', temperature=0, retry=10),
# SenseChat-V
'SenseChat-Vision': partial(SenseChatVisionAPI, model='SenseChat-Vision', temperature=0, retry=10),
'HunYuan-Vision': partial(HunyuanVision, model='hunyuan-vision', temperature=0, retry=10),
'bailingMM': partial(bailingMMAPI, model='bailingMM-mini', temperature=0, retry=10),
# BlueLM-V
"BlueLM_V": partial(BlueLM_V_API, model='BlueLM-VL-v3.0', temperature=0, retry=10),
# JiuTian-VL
"JTVL": partial(JTVLChatAPI, model='jt-vl-chat', temperature=0, retry=10),
"Taiyi": partial(TaiyiAPI, model='taiyi', temperature=0, retry=10),
# TeleMM
'TeleMM': partial(TeleMMAPI, model='TeleAI/TeleMM', temperature=0, retry=10),
# lmdeploy api
'lmdeploy': partial(LMDeployAPI, api_base='http://0.0.0.0:23333/v1/chat/completions', temperature=0, retry=10),
# Taichu-VL
'Taichu-VL-2B': partial(TaichuVLAPI, model='Taichu-VL-2B', url='https://platform.wair.ac.cn/api/v1/infer/10381/v1/chat/completions'),
}
mmalaya_series = {
'MMAlaya': partial(MMAlaya, model_path='DataCanvas/MMAlaya'),
'MMAlaya2': partial(MMAlaya2, model_path='DataCanvas/MMAlaya2'),
}
minicpm_series = {
'MiniCPM-V': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V'),
'MiniCPM-V-2': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V-2'),
'MiniCPM-Llama3-V-2_5': partial(MiniCPM_Llama3_V, model_path='openbmb/MiniCPM-Llama3-V-2_5'),
'MiniCPM-V-2_6': partial(MiniCPM_V_2_6, model_path='openbmb/MiniCPM-V-2_6'),
}
xtuner_series = {
'llava-internlm2-7b': partial(LLaVA_XTuner, llm_path='internlm/internlm2-chat-7b', llava_path='xtuner/llava-internlm2-7b', visual_select_layer=-2, prompt_template='internlm2_chat'),
'llava-internlm2-20b': partial(LLaVA_XTuner, llm_path='internlm/internlm2-chat-20b', llava_path='xtuner/llava-internlm2-20b', visual_select_layer=-2, prompt_template='internlm2_chat'),
'llava-internlm-7b': partial(LLaVA_XTuner, llm_path='internlm/internlm-chat-7b', llava_path='xtuner/llava-internlm-7b', visual_select_layer=-2, prompt_template='internlm_chat'),
'llava-v1.5-7b-xtuner': partial(LLaVA_XTuner, llm_path='lmsys/vicuna-7b-v1.5', llava_path='xtuner/llava-v1.5-7b-xtuner', visual_select_layer=-2, prompt_template='vicuna'),
'llava-v1.5-13b-xtuner': partial(LLaVA_XTuner, llm_path='lmsys/vicuna-13b-v1.5', llava_path='xtuner/llava-v1.5-13b-xtuner', visual_select_layer=-2, prompt_template='vicuna'),
'llava-llama-3-8b': partial(LLaVA_XTuner, llm_path='xtuner/llava-llama-3-8b-v1_1', llava_path='xtuner/llava-llama-3-8b-v1_1', visual_select_layer=-2, prompt_template='llama3_chat'),
}
qwen_series = {
'qwen_base': partial(QwenVL, model_path='Qwen/Qwen-VL'),
'qwen_chat': partial(QwenVLChat, model_path='Qwen/Qwen-VL-Chat'),
'monkey': partial(Monkey, model_path='echo840/Monkey'),
'monkey-chat': partial(MonkeyChat, model_path='echo840/Monkey-Chat'),
'minimonkey': partial(MiniMonkey, model_path='mx262/MiniMonkey')
}
llava_series = {
'llava_v1.5_7b': partial(LLaVA, model_path='liuhaotian/llava-v1.5-7b'),
'llava_v1.5_13b': partial(LLaVA, model_path='liuhaotian/llava-v1.5-13b'),
'llava_v1_7b': partial(LLaVA, model_path=LLAVA_V1_7B_MODEL_PTH),
'sharegpt4v_7b': partial(LLaVA, model_path='Lin-Chen/ShareGPT4V-7B'),
'sharegpt4v_13b': partial(LLaVA, model_path='Lin-Chen/ShareGPT4V-13B'),
'llava_next_vicuna_7b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-vicuna-7b-hf'),
'llava_next_vicuna_13b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-vicuna-13b-hf'),
'llava_next_mistral_7b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-mistral-7b-hf'),
'llava_next_yi_34b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-34b-hf'),
'llava_next_llama3': partial(LLaVA_Next, model_path='llava-hf/llama3-llava-next-8b-hf'),
'llava_next_72b': partial(LLaVA_Next, model_path='llava-hf/llava-next-72b-hf'),
'llava_next_110b': partial(LLaVA_Next, model_path='llava-hf/llava-next-110b-hf'),
'llava_next_qwen_32b': partial(LLaVA_Next2, model_path='lmms-lab/llava-next-qwen-32b'),
'llava_next_interleave_7b': partial(LLaVA_Next, model_path='llava-hf/llava-interleave-qwen-7b-hf'),
'llava_next_interleave_7b_dpo': partial(LLaVA_Next, model_path='llava-hf/llava-interleave-qwen-7b-dpo-hf'),
'llava-onevision-qwen2-0.5b-ov-hf': partial(LLaVA_OneVision_HF, model_path='llava-hf/llava-onevision-qwen2-0.5b-ov-hf'),
'llava-onevision-qwen2-0.5b-si-hf': partial(LLaVA_OneVision_HF, model_path='llava-hf/llava-onevision-qwen2-0.5b-si-hf'),
'llava-onevision-qwen2-7b-ov-hf': partial(LLaVA_OneVision_HF, model_path='llava-hf/llava-onevision-qwen2-7b-ov-hf'),
'llava-onevision-qwen2-7b-si-hf': partial(LLaVA_OneVision_HF, model_path='llava-hf/llava-onevision-qwen2-7b-si-hf'),
'llava_onevision_qwen2_0.5b_si': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-0.5b-si'),
'llava_onevision_qwen2_7b_si': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-7b-si'),
'llava_onevision_qwen2_72b_si': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-72b-si'),
'llava_onevision_qwen2_0.5b_ov': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-0.5b-ov'),
'llava_onevision_qwen2_7b_ov': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-7b-ov'),
'llava_onevision_qwen2_72b_ov': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-72b-ov-sft'),
'Aquila-VL-2B': partial(LLaVA_OneVision, model_path='BAAI/Aquila-VL-2B-llava-qwen'),
'llava_video_qwen2_7b':partial(LLaVA_OneVision, model_path='lmms-lab/LLaVA-Video-7B-Qwen2'),
'llava_video_qwen2_72b':partial(LLaVA_OneVision, model_path='lmms-lab/LLaVA-Video-72B-Qwen2'),
'varco-vision-hf':partial(LLaVA_OneVision_HF, model_path='NCSOFT/VARCO-VISION-14B-HF'),
}
internvl_series = {
'InternVL-Chat-V1-1': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-1', version='V1.1'),
'InternVL-Chat-V1-2': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-2', version='V1.2'),
'InternVL-Chat-V1-2-Plus': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-2-Plus', version='V1.2'),
# InternVL1.5 series
'InternVL-Chat-V1-5': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-5', version='V1.5'),
'Mini-InternVL-Chat-2B-V1-5': partial(InternVLChat, model_path='OpenGVLab/Mini-InternVL-Chat-2B-V1-5', version='V1.5'),
'Mini-InternVL-Chat-4B-V1-5': partial(InternVLChat, model_path='OpenGVLab/Mini-InternVL-Chat-4B-V1-5', version='V1.5'),
# InternVL2 series
'InternVL2-1B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-1B', version='V2.0'),
'InternVL2-2B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-2B', version='V2.0'),
'InternVL2-4B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-4B', version='V2.0'),
'InternVL2-8B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-8B', version='V2.0'),
'InternVL2-26B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-26B', version='V2.0'),
'InternVL2-40B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-40B', version='V2.0'),
'InternVL2-76B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-Llama3-76B', version='V2.0'),
# InternVL2 MPO series
'InternVL2-8B-MPO': partial(InternVLChat, model_path='OpenGVLab/InternVL2-8B-MPO', version='V2.0'),
'InternVL2-8B-MPO-CoT': partial(InternVLChat, model_path='OpenGVLab/InternVL2-8B-MPO', version='V2.0', use_mpo_prompt=True),
# InternVL2.5 series
'InternVL2_5-1B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-1B', version='V2.0'),
'InternVL2_5-2B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-2B', version='V2.0'),
'InternVL2_5-4B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-4B', version='V2.0'),
'InternVL2_5-8B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-8B', version='V2.0'),
'InternVL2_5-26B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-26B', version='V2.0'),
'InternVL2_5-38B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-38B', version='V2.0'),
'InternVL2_5-78B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-78B', version='V2.0'),
}
sail_series = {
'SAIL-VL-2B': partial(SailVL, model_path='BytedanceDouyinContent/SAIL-VL-2B')
}
yivl_series = {
'Yi_VL_6B': partial(Yi_VL, model_path='01-ai/Yi-VL-6B', root=Yi_ROOT),
'Yi_VL_34B': partial(Yi_VL, model_path='01-ai/Yi-VL-34B', root=Yi_ROOT),
}
xcomposer_series = {
'XComposer': partial(XComposer, model_path='internlm/internlm-xcomposer-vl-7b'),
'sharecaptioner': partial(ShareCaptioner, model_path='Lin-Chen/ShareCaptioner'),
'XComposer2': partial(XComposer2, model_path='internlm/internlm-xcomposer2-vl-7b'),
'XComposer2_1.8b': partial(XComposer2, model_path='internlm/internlm-xcomposer2-vl-1_8b'),
'XComposer2_4KHD': partial(XComposer2_4KHD, model_path='internlm/internlm-xcomposer2-4khd-7b'),
'XComposer2d5': partial(XComposer2d5, model_path='internlm/internlm-xcomposer2d5-7b'),
}
minigpt4_series = {
'MiniGPT-4-v2': partial(MiniGPT4, mode='v2', root=MiniGPT4_ROOT),
'MiniGPT-4-v1-7B': partial(MiniGPT4, mode='v1_7b', root=MiniGPT4_ROOT),
'MiniGPT-4-v1-13B': partial(MiniGPT4, mode='v1_13b', root=MiniGPT4_ROOT),
}
idefics_series = {
'idefics_9b_instruct': partial(IDEFICS, model_path='HuggingFaceM4/idefics-9b-instruct'),
'idefics_80b_instruct': partial(IDEFICS, model_path='HuggingFaceM4/idefics-80b-instruct'),
'idefics2_8b': partial(IDEFICS2, model_path='HuggingFaceM4/idefics2-8b'),
# Idefics3 follows Idefics2 Pattern
'Idefics3-8B-Llama3': partial(IDEFICS2, model_path='HuggingFaceM4/Idefics3-8B-Llama3'),
}
smolvlm_series = {
'SmolVLM': partial(SmolVLM, model_path='HuggingFaceTB/SmolVLM-Instruct'),
'SmolVLM-DPO': partial(SmolVLM, model_path='HuggingFaceTB/SmolVLM-Instruct-DPO'),
'SmolVLM-Synthetic': partial(SmolVLM, model_path='HuggingFaceTB/SmolVLM-Instruct'),
}
instructblip_series = {
'instructblip_7b': partial(InstructBLIP, name='instructblip_7b'),
'instructblip_13b': partial(InstructBLIP, name='instructblip_13b'),
}
deepseekvl_series = {
'deepseek_vl_7b': partial(DeepSeekVL, model_path='deepseek-ai/deepseek-vl-7b-chat'),
'deepseek_vl_1.3b': partial(DeepSeekVL, model_path='deepseek-ai/deepseek-vl-1.3b-chat'),
}
janus_series = {
'Janus-1.3B': partial(Janus, model_path='deepseek-ai/Janus-1.3B')
}
cogvlm_series = {
'cogvlm-grounding-generalist': partial(CogVlm, model_path='THUDM/cogvlm-grounding-generalist-hf', tokenizer_name='lmsys/vicuna-7b-v1.5'),
'cogvlm-chat': partial(CogVlm, model_path='THUDM/cogvlm-chat-hf', tokenizer_name='lmsys/vicuna-7b-v1.5'),
'cogvlm2-llama3-chat-19B': partial(CogVlm, model_path='THUDM/cogvlm2-llama3-chat-19B'),
'glm-4v-9b': partial(GLM4v, model_path='THUDM/glm-4v-9b')
}
wemm_series = {
'WeMM': partial(WeMM, model_path='feipengma/WeMM'),
}
cambrian_series = {
'cambrian_8b': partial(Cambrian, model_path='nyu-visionx/cambrian-8b'),
'cambrian_13b': partial(Cambrian, model_path='nyu-visionx/cambrian-13b'),
'cambrian_34b': partial(Cambrian, model_path='nyu-visionx/cambrian-34b'),
}
chameleon_series = {
'chameleon_7b': partial(Chameleon, model_path='facebook/chameleon-7b'),
'chameleon_30b': partial(Chameleon, model_path='facebook/chameleon-30b'),
}
vila_series = {
'VILA1.5-3b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-3b'),
'Llama-3-VILA1.5-8b': partial(VILA, model_path='Efficient-Large-Model/Llama-3-VILA1.5-8b'),
'VILA1.5-13b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-13b'),
'VILA1.5-40b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-40b'),
}
ovis_series = {
'Ovis1.5-Llama3-8B': partial(Ovis, model_path='AIDC-AI/Ovis1.5-Llama3-8B'),
'Ovis1.5-Gemma2-9B': partial(Ovis, model_path='AIDC-AI/Ovis1.5-Gemma2-9B'),
'Ovis1.6-Gemma2-9B': partial(Ovis1_6, model_path='AIDC-AI/Ovis1.6-Gemma2-9B'),
'Ovis1.6-Llama3.2-3B': partial(Ovis1_6, model_path='AIDC-AI/Ovis1.6-Llama3.2-3B'),
'Ovis1.6-Gemma2-27B': partial(Ovis1_6_Plus, model_path='AIDC-AI/Ovis1.6-Gemma2-27B')
}
mantis_series = {
'Mantis-8B-siglip-llama3': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-siglip-llama3'),
'Mantis-8B-clip-llama3': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-clip-llama3'),
'Mantis-8B-Idefics2': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-Idefics2'),
'Mantis-8B-Fuyu': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-Fuyu')
}
phi3_series = {
'Phi-3-Vision': partial(Phi3Vision, model_path='microsoft/Phi-3-vision-128k-instruct'),
'Phi-3.5-Vision': partial(Phi3_5Vision, model_path='microsoft/Phi-3.5-vision-instruct')
}
xgen_mm_series = {
'xgen-mm-phi3-interleave-r-v1.5': partial(XGenMM, model_path='Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5'),
'xgen-mm-phi3-dpo-r-v1.5': partial(XGenMM, model_path='Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5'),
}
qwen2vl_series = {
'Qwen-VL-Max-0809': partial(Qwen2VLAPI, model='qwen-vl-max-0809', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen-VL-Plus-0809': partial(Qwen2VLAPI, model='qwen-vl-plus-0809', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-72B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-72B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct-AWQ': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-AWQ', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct-GPTQ-Int4': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct-GPTQ-Int8': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-2B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-2B-Instruct-AWQ': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct-AWQ', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-2B-Instruct-GPTQ-Int4': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-2B-Instruct-GPTQ-Int8': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8', min_pixels=1280*28*28, max_pixels=16384*28*28),
'XinYuan-VL-2B-Instruct': partial(Qwen2VLChat, model_path='Cylingo/Xinyuan-VL-2B', min_pixels=1280*28*28, max_pixels=16384*28*28),
}
slime_series = {
'Slime-7B': partial(SliME, model_path='yifanzhang114/SliME-vicuna-7B'),
'Slime-8B': partial(SliME, model_path='yifanzhang114/SliME-Llama3-8B'),
'Slime-13B': partial(SliME, model_path='yifanzhang114/SliME-vicuna-13B'),
}
eagle_series={
'Eagle-X4-8B-Plus': partial(Eagle, model_path='NVEagle/Eagle-X4-8B-Plus'),
'Eagle-X4-13B-Plus': partial(Eagle, model_path='NVEagle/Eagle-X4-13B-Plus'),
'Eagle-X5-7B': partial(Eagle, model_path='NVEagle/Eagle-X5-7B'),
'Eagle-X5-13B': partial(Eagle, model_path='NVEagle/Eagle-X5-13B'),
'Eagle-X5-13B-Chat': partial(Eagle, model_path='NVEagle/Eagle-X5-13B-Chat'),
'Eagle-X5-34B-Chat': partial(Eagle, model_path='NVEagle/Eagle-X5-34B-Chat'),
'Eagle-X5-34B-Plus': partial(Eagle, model_path='NVEagle/Eagle-X5-34B-Plus'),
}
moondream_series={
'Moondream1': partial(Moondream1, model_path='vikhyatk/moondream1'),
'Moondream2': partial(Moondream2, model_path='vikhyatk/moondream2'),
}
llama_series={
'Llama-3.2-11B-Vision-Instruct': partial(llama_vision, model_path='meta-llama/Llama-3.2-11B-Vision-Instruct'),
'LLaVA-CoT': partial(llama_vision, model_path='Xkev/Llama-3.2V-11B-cot'),
'Llama-3.2-90B-Vision-Instruct': partial(llama_vision, model_path='meta-llama/Llama-3.2-90B-Vision-Instruct'),
}
molmo_series={
'molmoE-1B-0924': partial(molmo, model_path='allenai/MolmoE-1B-0924'),
'molmo-7B-D-0924': partial(molmo, model_path='allenai/Molmo-7B-D-0924'),
'molmo-7B-O-0924': partial(molmo, model_path='allenai/Molmo-7B-O-0924'),
'molmo-72B-0924': partial(molmo, model_path='allenai/Molmo-72B-0924'),
}
kosmos_series={
'Kosmos2': partial(Kosmos2, model_path='microsoft/kosmos-2-patch14-224')
}
points_series = {
'POINTS-Yi-1.5-9B-Chat': partial(POINTS, model_path='WePOINTS/POINTS-Yi-1-5-9B-Chat'),
'POINTS-Qwen-2.5-7B-Chat': partial(POINTS, model_path='WePOINTS/POINTS-Qwen-2-5-7B-Chat'),
'POINTSV15-Qwen-2.5-7B-Chat': partial(POINTSV15, model_path='WePOINTS/POINTS-1-5-Qwen-2-5-7B-Chat'),
}
nvlm_series = {
'NVLM': partial(NVLM, model_path='nvidia/NVLM-D-72B'),
}
vintern_series = {
'Vintern-3B-beta': partial(VinternChat, model_path='5CD-AI/Vintern-3B-beta'),
'Vintern-1B-v2': partial(VinternChat, model_path='5CD-AI/Vintern-1B-v2'),
}
aria_series = {
"Aria": partial(Aria, model_path='rhymes-ai/Aria')
}
h2ovl_series = {
'h2ovl-mississippi-2b': partial(H2OVLChat, model_path='h2oai/h2ovl-mississippi-2b'),
'h2ovl-mississippi-1b': partial(H2OVLChat, model_path='h2oai/h2ovl-mississippi-800m'),
}
valley_series = {
'valley_eagle': partial(ValleyEagleChat, model_path='bytedance-research/Valley-Eagle-7B'),
}
supported_VLM = {}
model_groups = [
ungrouped, api_models,
xtuner_series, qwen_series, llava_series, internvl_series, yivl_series,
xcomposer_series, minigpt4_series, idefics_series, instructblip_series,
deepseekvl_series, janus_series, minicpm_series, cogvlm_series, wemm_series,
cambrian_series, chameleon_series, video_models, ovis_series, vila_series,
mantis_series, mmalaya_series, phi3_series, xgen_mm_series, qwen2vl_series,
slime_series, eagle_series, moondream_series, llama_series, molmo_series,
kosmos_series, points_series, nvlm_series, vintern_series, h2ovl_series, aria_series,
smolvlm_series, sail_series, valley_series
]
for grp in model_groups:
supported_VLM.update(grp)
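# Lookup sketch: entries in supported_VLM are functools.partial constructors, so a
# model is instantiated by calling the stored partial (weights / API keys for the
# chosen entry are assumed to be available in the environment):
#   model = supported_VLM['Taichu-VL-2B']()
#   answer = model.generate([dict(type='text', value='Hello')])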
import warnings
from .image_base import img_root_map, ImageBaseDataset
from .image_caption import ImageCaptionDataset
from .image_yorn import ImageYORNDataset
from .image_mcq import (
ImageMCQDataset, MMMUDataset, CustomMCQDataset, MUIRDataset, GMAIMMBenchDataset, MMERealWorld, HRBenchDataset,
NaturalBenchDataset
)
from .image_mt import MMDUDataset
from .image_vqa import (
ImageVQADataset, MathVision, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
CustomVQADataset, CRPE, MathVerse, OlympiadBench, QSpatial, VizWiz, MMNIAH
)
from .text_mcq import CustomTextMCQDataset, TextMCQDataset
from .vcr import VCRDataset
from .mmlongbench import MMLongBench
from .dude import DUDE
from .slidevqa import SlideVQA
from .mmbench_video import MMBenchVideo
from .videomme import VideoMME
from .mvbench import MVBench, MVBench_MP4
from .mlvu import MLVU, MLVU_MCQ, MLVU_OpenEnded
from .tempcompass import TempCompass, TempCompass_Captioning, TempCompass_MCQ, TempCompass_YorN
from .longvideobench import LongVideoBench
from .video_concat_dataset import ConcatVideoDataset
from .mmgenbench import MMGenBench
from .miabench import MIABench
from .cmmmu import CMMMU
from .wildvision import WildVision
from .mmmath import MMMath
from .dynamath import Dynamath
from .utils import *
from .video_dataset_config import *
from ..smp import *
class ConcatDataset(ImageBaseDataset):
# This dataset takes multiple dataset names as input and aggregate them into a single dataset.
# Each single dataset should not have a field named `SUB_DATASET`
DATASET_SETS = {
'MMMB': ['MMMB_ar', 'MMMB_cn', 'MMMB_en', 'MMMB_pt', 'MMMB_ru', 'MMMB_tr'],
'MTL_MMBench_DEV': [
'MMBench_dev_ar', 'MMBench_dev_cn', 'MMBench_dev_en',
'MMBench_dev_pt', 'MMBench_dev_ru', 'MMBench_dev_tr'
]
}
def __init__(self, dataset):
datasets = self.DATASET_SETS[dataset]
self.dataset_map = {}
# The name of the compilation
self.dataset_name = dataset
self.datasets = datasets
for dname in datasets:
dataset = build_dataset(dname)
assert dataset is not None, dataset
self.dataset_map[dname] = dataset
TYPES = [x.TYPE for x in self.dataset_map.values()]
MODALITIES = [x.MODALITY for x in self.dataset_map.values()]
assert np.all([x == TYPES[0] for x in TYPES]), (datasets, TYPES)
assert np.all([x == MODALITIES[0] for x in MODALITIES]), (datasets, MODALITIES)
self.TYPE = TYPES[0]
self.MODALITY = MODALITIES[0]
data_all = []
for dname in datasets:
data = self.dataset_map[dname].data
data['SUB_DATASET'] = [dname] * len(data)
data_new = localize_df(data, dname, nproc=16)
data_all.append(data_new)
data = pd.concat(data_all)
data['original_index'] = data.pop('index')
data['index'] = np.arange(len(data))
self.data = data
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
idx = line['original_index']
dname = line['SUB_DATASET']
org_data = self.dataset_map[dname].data
org_line = cp.deepcopy(org_data[org_data['index'] == idx]).iloc[0]
return self.dataset_map[dname].build_prompt(org_line)
def dump_image(self, line):
# Assert all images are pre-dumped
assert 'image' not in line
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
@classmethod
def supported_datasets(cls):
return list(cls.DATASET_SETS)
def evaluate(self, eval_file, **judge_kwargs):
suffix = eval_file.split('.')[-1]
# First, split the eval_file by dataset
data_all = load(eval_file)
for dname in self.datasets:
tgt = eval_file.replace(self.dataset_name, dname)
data_sub = data_all[data_all['SUB_DATASET'] == dname]
data_sub.pop('index')
data_sub['index'] = data_sub.pop('original_index')
data_sub.pop('SUB_DATASET')
dump(data_sub, tgt)
# Then, evaluate each dataset separately
results_all = []
for dname in self.datasets:
tgt = eval_file.replace(self.dataset_name, dname)
res = self.dataset_map[dname].evaluate(tgt, **judge_kwargs)
assert isinstance(res, pd.DataFrame)
res['DATASET'] = [dname] * len(res)
results_all.append(res)
result = pd.concat(results_all)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(result, score_file)
return result
# Add new supported dataset class here
IMAGE_DATASET = [
ImageCaptionDataset, ImageYORNDataset, ImageMCQDataset, ImageVQADataset, MathVision,
MMMUDataset, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset,
GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, CRPE, MathVerse, NaturalBenchDataset,
MIABench, OlympiadBench, WildVision, MMMath, QSpatial, Dynamath, MMGenBench, VizWiz, MMNIAH,
CMMMU
]
VIDEO_DATASET = [
MMBenchVideo, VideoMME, MVBench, MVBench_MP4, LongVideoBench,
MLVU, MLVU_MCQ, MLVU_OpenEnded,
TempCompass, TempCompass_MCQ, TempCompass_Captioning, TempCompass_YorN
]
TEXT_DATASET = [
TextMCQDataset
]
CUSTOM_DATASET = [
CustomMCQDataset, CustomVQADataset, CustomTextMCQDataset
]
DATASET_COLLECTION = [ConcatDataset, ConcatVideoDataset]
DATASET_CLASSES = IMAGE_DATASET + VIDEO_DATASET + TEXT_DATASET + CUSTOM_DATASET + DATASET_COLLECTION
SUPPORTED_DATASETS = []
for DATASET_CLS in DATASET_CLASSES:
SUPPORTED_DATASETS.extend(DATASET_CLS.supported_datasets())
def DATASET_TYPE(dataset, *, default: str = 'MCQ') -> str:
for cls in DATASET_CLASSES:
if dataset in cls.supported_datasets():
if hasattr(cls, 'TYPE'):
return cls.TYPE
# Special-case ConcatDataset collections: derive the type from their member datasets
if dataset in ConcatDataset.DATASET_SETS:
dataset_list = ConcatDataset.DATASET_SETS[dataset]
TYPES = [DATASET_TYPE(dname) for dname in dataset_list]
assert np.all([x == TYPES[0] for x in TYPES]), (dataset_list, TYPES)
return TYPES[0]
if 'openended' in dataset.lower():
return 'VQA'
warnings.warn(f'Dataset {dataset} is a custom one and not annotated as `openended`, will treat as {default}. ')
return default
def DATASET_MODALITY(dataset, *, default: str = 'IMAGE') -> str:
if dataset is None:
warnings.warn(f'Dataset is not specified, will treat modality as {default}. ')
return default
for cls in DATASET_CLASSES:
if dataset in cls.supported_datasets():
if hasattr(cls, 'MODALITY'):
return cls.MODALITY
# Special-case ConcatDataset collections: derive the modality from their member datasets
if dataset in ConcatDataset.DATASET_SETS:
dataset_list = ConcatDataset.DATASET_SETS[dataset]
MODALITIES = [DATASET_MODALITY(dname) for dname in dataset_list]
assert np.all([x == MODALITIES[0] for x in MODALITIES]), (dataset_list, MODALITIES)
return MODALITIES[0]
if 'video' in dataset.lower():
return 'VIDEO'
elif 'image' in dataset.lower():
return 'IMAGE'
warnings.warn(f'Dataset {dataset} is a custom one, will treat modality as {default}. ')
return default
def build_dataset(dataset_name, **kwargs):
for cls in DATASET_CLASSES:
if dataset_name in supported_video_datasets:
return supported_video_datasets[dataset_name](**kwargs)
elif dataset_name in cls.supported_datasets():
return cls(dataset=dataset_name, **kwargs)
warnings.warn(f'Dataset {dataset_name} is not officially supported. ')
data_file = osp.join(LMUDataRoot(), f'{dataset_name}.tsv')
if not osp.exists(data_file):
warnings.warn(f'Data file {data_file} does not exist. Dataset building failed. ')
return None
data = load(data_file)
if 'question' not in [x.lower() for x in data.columns]:
warnings.warn(f'Data file {data_file} does not have a `question` column. Dataset building failed. ')
return None
if 'A' in data and 'B' in data:
if 'image' in data or 'image_path' in data:
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom MCQ dataset. ')
return CustomMCQDataset(dataset=dataset_name, **kwargs)
else:
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom Text MCQ dataset. ')
return CustomTextMCQDataset(dataset=dataset_name, **kwargs)
else:
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom VQA dataset. ')
return CustomVQADataset(dataset=dataset_name, **kwargs)
__all__ = [
'build_dataset', 'img_root_map', 'build_judge', 'extract_answer_from_item', 'prefetch_answer', 'DEBUG_MESSAGE'
] + [cls.__name__ for cls in DATASET_CLASSES]
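# Usage sketch (dataset TSVs are downloaded to LMUDataRoot() on first use;
# 'MMVet' is just an example name from the supported list):
#   dataset = build_dataset('MMVet')
#   print(DATASET_TYPE('MMVet'), DATASET_MODALITY('MMVet'))
#   msgs = dataset.build_prompt(0)   # messages for the first sample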
from .image_base import ImageBaseDataset
import random
from collections import Counter
import os
import re
import tempfile
from ..smp import *
def get_multi_choice_prediction(response, all_choices, index2ans):
for char in [',', '.', '!', '?', ';', ':', "'"]:
response = response.strip(char)
response = " " + response + " " # add space to avoid partial match
candidates = []
for choice in all_choices: # (A) (B) (C) (D)
# Add the choice to candidates each time it appears in the response
candidates.extend([choice for _ in range(response.count(f'({choice})'))])
if len(candidates) == 0:
for choice in all_choices: # A B C D
# Similarly, add the choice for each occurrence
candidates.extend([choice for _ in range(response.count(f'{choice}'))])
if len(candidates) == 0 and len(response.split()) >= 1:
for index, ans in index2ans.items():
# Add index for each occurrence of ans in response
candidates.extend([index for _ in range(response.count(ans))])
# If the above still yields no candidates, fall back to substring matching of the option contents
if len(candidates) == 0 and len(response.split()) >= 1:
for index, ans in index2ans.items():
if ans in response:
candidates.append(index)
# index_ans = False # it's content ans.
if len(candidates) == 0: # still not get answer, randomly choose one.
return random.choice(all_choices)
# return ''
else:
# Count the occurrence of each candidate
candidate_counts = Counter(candidates)
# Select the most frequent candidates
max_count = max(candidate_counts.values())
most_frequent_candidates = [c for c in all_choices if candidate_counts.get(c, 0) == max_count]
# Combine the most frequent candidates in ABCD order
return ''.join(most_frequent_candidates)
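# Quick example of the matching behaviour above:
#   get_multi_choice_prediction('正确答案是 (B)。', ['A', 'B', 'C', 'D'],
#                               {'A': '猫', 'B': '狗', 'C': '鸟', 'D': '鱼'})  # -> 'B'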
def extract_numbers(string):
# Pattern for numbers with comma thousands separators
pattern_commas = r'-?\d{1,3}(?:,\d{3})+'
# Pattern for scientific notation
pattern_scientific = r'-?\d+(?:\.\d+)?[eE][+-]?\d+'
# Pattern for plain numbers without comma separators
pattern_simple = r'-?(?:\d+\.\d+|\.\d+|\d+)(?![eE][+-]?\d+)(?!,\d)'
# Extract numbers with comma thousands separators
numbers_with_commas = re.findall(pattern_commas, string)
# Extract numbers in scientific notation
numbers_scientific = re.findall(pattern_scientific, string)
# Extract plain numbers without comma separators
numbers_simple = re.findall(pattern_simple, string)
# Combine all extracted numbers
all_numbers = numbers_with_commas + numbers_scientific + numbers_simple
return all_numbers
def check_is_number(string):
try:
float(string.replace(',', ''))
return True
except ValueError:
# check if there's comma inside
return False
def count_letters(string):
return sum(c.isalpha() and 'a' <= c <= 'z' or 'A' <= c <= 'Z' for c in string)
def normalize_str(string, answer):
# If the string parses as a number, normalize it to a float rounded to 2 decimals; otherwise keep it as a (length-checked) string.
if string is None:
return [string]
string = string.strip()
is_number = check_is_number(string)
if is_number:
string = string.replace(',', '')
string = float(string)
# leave 2 decimal
string = round(string, 2)
return [string]
else: # it's likely to be a string
if len(string) > len(answer) + 20 or count_letters(string) > count_letters(answer) + 2:
return []
return [string]
def get_fill_blank_prediction(response, answer):
"""get the prediction from the generated response,
return a list of predicted strings or numbers"""
def get_key_subresponses(response):
response = response.strip("。").strip()
sub_responses = re.split(r'。|\n', response)
indicators_of_keys = ['是', '为', '所以', '等于', '方案', '选择',
'正确答案', '因此', '最后', '答案', '结果']
key_responses = []
for index, resp in enumerate(sub_responses):
# if last one, accept it's an equation (the entire response can be just one sentence with equation)
if index == len(sub_responses) - 1:
indicators_of_keys.extend(['='])
shortest_key_response = None
# the shortest response that may contain the answer (tail part of the response)
for indicator in indicators_of_keys:
if indicator in resp:
if not shortest_key_response:
shortest_key_response = resp.split(indicator)[-1].strip()
else:
if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
shortest_key_response = resp.split(indicator)[-1].strip()
if shortest_key_response:
# and it's not trivial
if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]:
key_responses.append(shortest_key_response)
if len(key_responses) == 0:  # did not find any
return [response]
return key_responses
key_responses = get_key_subresponses(response)
pred_list = key_responses.copy() # keep the original string response
for resp in key_responses:
pred_list.extend(extract_numbers(resp))
tmp_pred_list = []
for i in range(len(pred_list)):
tmp_pred_list.extend(normalize_str(pred_list[i], answer))
pred_list = tmp_pred_list
# remove duplicates
pred_list = list(set(pred_list))
return pred_list
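# Example: the answer-bearing tail after the indicator '是' is extracted and
# normalized to a rounded float:
#   get_fill_blank_prediction('所以答案是 42。', '42')  # -> [42.0]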
def get_TF_prediction(response):
"""get the prediction from the generated response,
return a list of predicted strings or numbers"""
def get_key_subresponses(response):
response = response.strip("。").strip()
sub_responses = re.split(r'。|\n', response)
indicators_of_keys = ['是', '为', '所以', '判断',
'陈述', '说法', '表达', '答案', '结果']
key_responses = []
for index, resp in enumerate(sub_responses):
shortest_key_response = None
# the shortest response that may contain the answer (tail part of the response)
for indicator in indicators_of_keys:
if indicator in resp:
if not shortest_key_response:
shortest_key_response = resp.split(indicator)[-1].strip()
else:
if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
shortest_key_response = resp.split(indicator)[-1].strip()
if shortest_key_response:
# and it's not trivial
if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]:
key_responses.append(shortest_key_response)
if len(key_responses) == 0:  # did not find any
return [response]
return key_responses
key_responses = get_key_subresponses(response)
pred_list = key_responses.copy() # keep the original string response
# remove duplicates
pred_list = list(set(pred_list))
return pred_list
class CMMMU(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'CMMMU_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/CMMMU_VAL.tsv'
}
DATASET_MD5 = {
'CMMMU_VAL': 'b4727e2fce2415bf646379e60c11a726'
}
def dump_image(self, line):
os.makedirs(self.img_root, exist_ok=True)
tgt_path_z = []
if isinstance(line['image'], list):
for i in range(len(line['image'])):
tgt_path = osp.join(self.img_root, f"{line['index']}--{i + 1}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'][i], tgt_path)
tgt_path_z.append(tgt_path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path_z.append(tgt_path)
return tgt_path_z
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
suffix = eval_file.split('.')[-1]
result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
if not osp.exists(result_file):
data = load(eval_file)
assert 'answer' in data and 'prediction' in data
data['prediction'] = [str(x) for x in data['prediction']]
data['answer'] = [str(x) for x in data['answer']]
correct_count = 0
correct_category = {
'技术与工程': [0, 0],
'科学': [0, 0],
'健康与医学': [0, 0],
'商业': [0, 0],
'艺术与设计': [0, 0],
'人文社会科学': [0, 0],
}
for i in tqdm(data.iterrows()):
line = i[1]
correct_category[line['category']][0] += 1
# Options
if line['type'] == '选择':
index2ans = {
'A': line['option1'],
'B': line['option2'],
'C': line['option3'],
'D': line['option4']
}
fact_option = get_multi_choice_prediction(line['prediction'], ['A', 'B', 'C', 'D'], index2ans)
if fact_option == line['answer']:
correct_count += 1
correct_category[line['category']][1] += 1
# Binary
elif line['type'] == '判断':
positive_keywords = ['正确', '对', '准确', '肯定', '对的']
negative_keywords = ['不对', '错误', '不正确', '不准确', '不合适', '否定', '错的', '错']
ambiguous_keywords = ['对错', '是否正确', '否正确', '或者', '是否', '正确性', '对不']
def judge_similarity(pred_list, positive_keywords, negative_keywords):
positive_count = 0
negative_count = 0
for pred in pred_list:
if any(pos_word in pred for pos_word in positive_keywords):
positive_count += 1
elif any(neg_word in pred for neg_word in negative_keywords):
negative_count += 1
if positive_count > negative_count:
return "对"
elif negative_count > positive_count:
return "错"
else:
return random.choice(['对', '错'])
answer = get_TF_prediction(line['prediction'])
answer = [word for word in answer if not any(ambiguous in word for ambiguous in ambiguous_keywords)]
fact_answer = judge_similarity(answer, positive_keywords, negative_keywords)
if fact_answer == line['answer']:
correct_count += 1
correct_category[line['category']][1] += 1
# Fill the Blank
else:
norm_answers = normalize_str(line['answer'], line['answer'])
predicted_answer = get_fill_blank_prediction(line['prediction'], line['answer'])
for pred in predicted_answer:
# already normalized
if isinstance(pred, str): # if it's a string, then find if ans in the pred_i
for norm_ans in norm_answers:
# only see if the string answer in the string pred
# print(norm_ans, pred)
if isinstance(norm_ans, str) and norm_ans in pred:
correct_count += 1
correct_category[line['category']][1] += 1
else: # it's a number
if pred in norm_answers:
correct_count += 1
correct_category[line['category']][1] += 1
accuracyz = {}
accuracyz['总准确率'] = correct_count / len(data)
for i in correct_category.keys():
accuracyz[i] = correct_category[i][1] / correct_category[i][0]
accuracyz = d2df(accuracyz)
accuracyz = accuracyz.round(10)  # round() returns a new DataFrame rather than modifying in place
dump(accuracyz, result_file)
result = pd.read_csv(result_file)
return result
def build_prompt(self, line):
if line['type'] == '选择':
tgt_path = self.dump_image(line)
question = line['question']
options_prompt = 'Options:\n'
for i in [['A', '1'], ['B', '2'], ['C', '3'], ['D', '4']]:
options_prompt += i[0] + '. ' + line['option' + i[1]] + '\n'
prompt = (f'问题: {question}\n' + options_prompt
+ '请回答上述多项选择题,并选出正确选项。这些题目可能包括单选和多选题型。如果所提供的信息不足以确定一个明确的答案,那么请根据可用的数据和你的判断来选择最可能正确的选项。')
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
elif line['type'] == '判断':
msgs = super().build_prompt(line)
assert msgs[-1]['type'] == 'text'
msgs[-1]['value'] += '\n请回答上述判断题,并根据题目描述和所给的信息来判断问题中陈述的对错。如果信息不完整或不足以作出绝对判断,请运用你的逻辑推理和现有信息来做出最可能的判断。'
return msgs
else:
msgs = super().build_prompt(line)
assert msgs[-1]['type'] == 'text'
msgs[-1]['value'] += '\n请回答上述填空题,并根据题目的要求和所提供的信息来给出最恰当的答案。如果信息不足以确切回答,那么请依据现有的数据和你的推理能力来填写最合理的答案。'
return msgs
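# Offline scoring sketch (the .xlsx name below is hypothetical; it is the
# prediction file produced by a prior inference run):
#   CMMMU('CMMMU_VAL').evaluate('CMMMU_VAL_model.xlsx')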
import math
from typing import List
from .utils.judge_util import build_judge
from .image_base import ImageBaseDataset
from .mmlongbench import concat_images, MMLongBench_auxeval, anls_compute
from ..smp import *
FAIL_MSG = 'Failed to obtain answer via API.'
def DUDE_acc(result_file):
data = load(result_file)
overall_score = 0.0
score_list = list()
for i in range(len(data)):
item = data.iloc[i]
if isinstance(item['answer'], float) and math.isnan(item['answer']):
item['answer'] = 'Not answerable'
item['answer'] = item['answer'].lower()
item['pred'] = item['pred'].lower()
score = anls_compute(item['answer'], item['pred'])
score_list.append(score)
overall_score += score
data['score'] = score_list
dump(data, result_file)
res = dict()
res['category'], res['num'], res['avg_score'] = ['anls'], [len(data)], [overall_score / len(data)]
res = pd.DataFrame(res)
return res
class DUDE(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'DUDE': 'https://opencompass.openxlab.space/utils/VLMEval/DUDE.tsv',
'DUDE_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/DUDE_MINI.tsv',
}
DATASET_MD5 = {
'DUDE': '130d860d08206e1e407cd77150c10d88',
'DUDE_MINI': 'e0c0d998114f0cca7516d12039d2b538',
}
SUPPORTED_MODELS = {
'GPT4': (1, 1),
'GPT4V': (1, 1),
'GPT4V_HIGH': (1, 1),
'GPT4o': (1, 1),
'GPT4o_HIGH': (1, 1),
'GPT4o_MINI': (1, 1),
'XComposer2d5': (1, -1),
'XComposer2_4KHD': (1, -1),
'MiniCPM-Llama3-V-2_5': (1, 5),
'InternVL-Chat-V1-5': (5, 2),
}
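# Each entry maps a model name to (concat_num, column_num): roughly, how many page
# images may be merged into one composite and how many columns to lay them out in;
# column_num == -1 appears to merge all pages into a single image (see dump_image below).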
def __init__(self, dataset, **kwargs):
self.model_list = list(self.SUPPORTED_MODELS.keys())
model_name = kwargs['model']
if not listinstr(self.model_list, model_name):
raise AssertionError("{} doesn't support the evaluation on DUDE.".format(model_name))
super(DUDE, self).__init__(dataset)
self.is_api = True if listinstr(['GPT4'], model_name) else False
self.max_pages = 120
concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
self.concat_num = concat_num
self.column_num = column_num
def prepare_tsv(self, url, file_md5=None):
data_root = LMUDataRoot()
os.makedirs(data_root, exist_ok=True)
file_name = url.split('/')[-1]
data_path = osp.join(data_root, file_name)
if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
pass
else:
warnings.warn('The dataset tsv is not downloaded')
download_file(url, data_path)
return load(data_path)
def dump_image(self, origin_line):
os.makedirs(self.img_root, exist_ok=True)
try:
import fitz
except Exception as e:
logging.critical(f'{type(e)}: {e}')
logging.critical('Please use `pip install pymupdf` to parse PDF files.')
line = origin_line.copy()
if not isinstance(line['image_path'], List):
line['image_path'] = [line['image_path']]
line['image_path'] = line['image_path'][:self.max_pages]
skip_pdf_parse = True
for im_name in line['image_path']:
path = osp.join(self.img_root, im_name)
if not read_ok(path):
skip_pdf_parse = False
break
# Just for compatibility with the zipped loop below: zip(line['image'], line['image_path'])
if skip_pdf_parse:
line['image'] = line['image_path']
else:
pdf_data = base64.b64decode(line['image'])
pdf_file = io.BytesIO(pdf_data)
encoded_images = []
with fitz.open(stream=pdf_file, filetype='pdf') as doc:
doc = doc[:self.max_pages]
for page in doc:
image = page.get_pixmap(dpi=144)
image_file = io.BytesIO(image.tobytes(output='png'))
image = Image.open(image_file)
encoded_image = encode_image_to_base64(image)
encoded_images.append(encoded_image)
line['image'] = encoded_images
print('process {}'.format(line['doc_id']))
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(self.img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
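        # For non-API models, merge the per-page images via concat_images so the page count
        # stays within the model's input limits; the generated file names record concat_num,
        # and column_num == -1 yields a single '_concat_all.jpg'.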
if self.concat_num > 0 and not self.is_api:
concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)
old_tgt_path = tgt_path
assert isinstance(old_tgt_path, list)
if self.column_num != -1:
tgt_path = [
'_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
for i in range(len(concatenated_images))
]
else:
tgt_path = ['_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all.jpg']
for path, concatenated_image in zip(tgt_path, concatenated_images):
if not read_ok(path):
decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
num_images, image_size = len(old_tgt_path), concatenated_image.size
print('concat {} images to a new one with size {}. save at {}'.format(num_images, image_size, path))
return tgt_path
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
logger = get_logger('Evaluation')
model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
if osp.exists(storage):
logger.warning(f'GPT scoring file {storage} already exists, will reuse it in DUDE_eval. ')
else:
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = list()
for model, line in tqdm(tups):
res = MMLongBench_auxeval(model, line)
new_results.append(res)
log_map, res_map, pred_map = {}, {}, {}
all_inds = [line['index'] for line in lines]
for k, v in zip(all_inds, new_results):
log_map[k] = v['log']
res_map[k] = v['res']
pred_map[k] = v['pred']
data['res'] = [res_map[idx] for idx in data['index']]
data['log'] = [log_map[idx] for idx in data['index']]
data['pred'] = [pred_map[idx] for idx in data['index']]
dump(data, storage)
score = DUDE_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
dump(score, score_pth)
logger.info(f'DUDE successfully finished evaluating {eval_file}, results saved in {score_pth}')
logger.info('Score: ')
logger.info(score)
import re
import json
import sympy as sp
import numpy as np
import pandas as pd
from sympy import simplify, Eq, sympify, Pow, pi
from sympy.parsing.latex import parse_latex
import sys
import math
import os
import os.path as osp
import argparse
from .image_base import ImageBaseDataset
from .utils import build_judge
from ..utils import track_progress_rich
from ..smp import load, dump, d2df, toliststr
def preprocess(str1):
if 0 <= str1.find("{") < str1.rfind("}"):
str1 = str1[str1.find("{"): str1.rfind("}") + 1]
    # Convert escaped newlines first, then strip the remaining backslashes
    str2 = str1.replace("\\n", "\n")
    str2 = str2.replace("\\", "")
return str2
def transfer(str1):
if "\u03c0" in str1:
strs = str1.split('\u03c0')
str1 = strs[0]
return float(str1) * np.pi
else:
return float(str1)
def parse_answer(answer, answer_type="multiple choice"):
if answer_type == "float":
if answer.isdigit():
return True, float(answer)
else:
parts = answer.split(' ')
answer = parts[0]
try:
answer = transfer(answer)
return True, answer
except:
return False, None
elif answer_type == "multiple choice":
if len(answer) == 1:
return True, answer.upper()
else:
in_flag = [ch in answer.upper() for ch in 'ABCDE']
if sum(in_flag) == 1:
for ch in 'ABCDE':
if ch in answer.upper():
return True, ch
return False, None
else:
return True, answer
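# DynaMath predictions are expected to be JSON with 'solution' and 'short answer' keys.
# DynaMath_auxeval first tries to read the short answer directly from that JSON; if parsing
# fails, it asks the judge model to reformat the free-form prediction, then compares the
# result with the reference (exact letter for multiple choice, |diff| <= 0.001 for floats,
# case-insensitive substring match otherwise).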
def DynaMath_auxeval(model, line):
pred = line['prediction']
pred = preprocess(pred)
succeed, short_answer = None, None
try:
dj = json.loads(pred, strict=False)
short_answer = dj.get("short answer")
assert short_answer is not None
        succeed, short_answer = parse_answer(short_answer, answer_type=line['answer_type'])
assert succeed
except:
# Failed to parse the JSON, use an auxiliary LLM to get the short answer
if line['answer_type'] == 'multiple choice':
inst = "Output the corresponing choice option, such as 'A', 'B', 'C', 'D', in a single line."
elif line['answer_type'] == 'float':
inst = "Output a three-digit floating-point number in a single line."
else:
inst = (
"Output a short answer in a single line. Any float numbers in the answer "
"should be formatted as three-digit floating-point numbers."
)
prompt = f"Free-form answer: {pred}\nInstruction: {inst}"
response = pred
succeed, short_answer = parse_answer(response, line['answer_type'])
if not succeed:
response = model.generate(prompt)
succeed, short_answer = parse_answer(response, line['answer_type'])
if line['answer_type'] == 'float':
if succeed:
diff = float(short_answer) - float(line['answer'])
if abs(diff) <= 0.001:
return dict(parse=True, extracted=short_answer, correct=True)
else:
return dict(parse=True, extracted=short_answer, correct=False)
else:
return dict(parse=False, extracted=None, correct=False)
elif line['answer_type'] == 'multiple choice':
if succeed:
return dict(parse=True, extracted=short_answer, correct=(short_answer == line['answer']))
else:
if line['answer'] in pred[:3].upper():
return dict(parse=False, extracted=None, correct=True)
else:
return dict(parse=False, extracted=None, correct=False)
else:
if succeed:
return dict(parse=True, extracted=short_answer, correct=(short_answer.lower() in line['answer'].lower()))
else:
            # short_answer is None when parsing failed, so the sample cannot be judged correct
            return dict(parse=False, extracted=None, correct=False)
class Dynamath(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {'DynaMath': 'https://opencompass.openxlab.space/utils/VLMEval/DynaMath.tsv'}
DATASET_MD5 = {'DynaMath': 'b8425ad9a7114571fc9366e013699494'}
GUIDE = """
## Answer Instruction Please provide an answer to the question outlined above. Your response should adhere \
to the following JSON format, which includes two keys: 'solution' and 'short answer'. The 'solution' key can contain \
detailed steps needed to solve the question, and the 'short answer' key should provide a concise response. {INST}
Example of expected JSON response format:
"""
EXAMPLE = {
"solution": "[Detailed step-by-step explanation]",
"short answer": "[Concise Answer]"
}
TEXT_EXAMPLE = json.dumps(EXAMPLE, indent=4)
# Given one data record, return the built prompt (a multi-modal message), can override
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
prompt = f"## Question\n {line['question']}"
if line['answer_type'] == 'multiple choice':
inst = "Provide the corresponing choice option in the 'short answer' key, such as 'A', 'B', 'C', or 'D'."
elif line['answer_type'] == 'float':
inst = "Format the answer as a three-digit floating-point number and provide it in the 'short answer' key."
else:
inst = "Float numbers in the answer should be formatted as three-digit floating-point numbers."
prompt = prompt + self.GUIDE.format(INST=inst) + self.TEXT_EXAMPLE
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def evaluate(self, eval_file, **judge_kwargs):
judge_name = judge_kwargs.pop('model', 'gpt-4o-mini')
model = build_judge(model=judge_name, **judge_kwargs)
suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx')
        score_file = eval_file.replace(f'.{suffix}', f'_{judge_name}_score.csv')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl')
        nproc = judge_kwargs.pop('nproc', 6)
res = load(tmp_file) if os.path.exists(tmp_file) else {}
res = {k: v for k, v in res.items() if v is not None}
model.system_prompt = """\
You are a helpful assistant that helps me to format free-form answers into a short answer according to the instruction.
"""
if not osp.exists(storage):
data = load(eval_file)
lt = len(data)
payloads = [dict(model=model, line=data.iloc[i]) for i in range(lt) if data.iloc[i]['index'] not in res]
keys = [idx for idx in data['index'] if idx not in res]
if len(keys):
results = track_progress_rich(DynaMath_auxeval, payloads, nproc=nproc, save=tmp_file, keys=keys)
for k, r in zip(keys, results):
res[k] = r
data['parse'] = [res[idx]['parse'] for idx in data['index']]
data['extracted'] = [res[idx]['extracted'] for idx in data['index']]
data['correct'] = [res[idx]['correct'] for idx in data['index']]
dump(data, storage)
data = load(storage)
# Calculate Average Accuracy
score_avg = {}
score_avg['Overall'] = np.mean(data['correct'])
subs = set(data['subject'])
for sub in subs:
data_sub = data[data['subject'] == sub]
score_avg[f'Subject-{sub}'] = np.mean(data_sub['correct'])
lvls = set(data['knowledge_level'])
for lvl in lvls:
data_lvl = data[data['knowledge_level'] == lvl]
score_avg[f'Level-{lvl}'] = np.mean(data_lvl['correct'])
# Calculate the Worst Case Accuracy
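        # Every seed question (qid) comes with multiple generated variants; the worst-case
        # setting counts a qid as correct only if all of its variants were answered correctly.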
score_worst = {}
data_worst = data[data['varid'] == 1]
        qid2corr = {qid: True for qid in data_worst['qid']}
lt = len(data)
for i in range(lt):
item = data.iloc[i]
qid2corr[item['qid']] *= item['correct']
data_worst['correct'] = [qid2corr[idx] for idx in data_worst['qid']]
score_worst['Overall'] = np.mean(data_worst['correct'])
subs = set(data_worst['subject'])
for sub in subs:
data_sub = data_worst[data_worst['subject'] == sub]
score_worst[f'Subject-{sub}'] = np.mean(data_sub['correct'])
lvls = set(data_worst['knowledge_level'])
for lvl in lvls:
data_lvl = data_worst[data_worst['knowledge_level'] == lvl]
score_worst[f'Level-{lvl}'] = np.mean(data_lvl['correct'])
d1 = {'Setting': 'Average'}
d1.update(score_avg)
d2 = {'Setting': 'Worst Case'}
d2.update(score_worst)
score = pd.concat([d2df(d1), d2df(d2)], ignore_index=True)
dump(score, score_file)
return score
import pandas as pd
from abc import abstractmethod
from ..smp import *
def img_root_map(dataset):
if 'MM_NIAH' in dataset:
return 'MMNIAH'
if 'CRPE' in dataset:
return 'CRPE'
if 'OCRVQA' in dataset:
return 'OCRVQA'
if 'COCO_VAL' == dataset:
return 'COCO'
if 'MMMU' in dataset:
return 'MMMU'
if "QSpatial" in dataset:
return "QSpatial"
mmbench_root_map = {
'MMBench_DEV_EN': 'MMBench', 'MMBench_TEST_EN': 'MMBench',
'MMBench_DEV_CN': 'MMBench', 'MMBench_TEST_CN': 'MMBench',
'MMBench': 'MMBench', 'MMBench_CN': 'MMBench',
'MMBench_DEV_EN_V11': 'MMBench_V11', 'MMBench_TEST_EN_V11': 'MMBench_V11',
'MMBench_DEV_CN_V11': 'MMBench_V11', 'MMBench_TEST_CN_V11': 'MMBench_V11',
'MMBench_V11': 'MMBench', 'MMBench_CN_V11': 'MMBench',
}
if dataset in mmbench_root_map:
return mmbench_root_map[dataset]
return dataset
class ImageBaseDataset:
MODALITY = 'IMAGE'
DATASET_URL = {}
DATASET_MD5 = {}
def __init__(self, dataset='MMBench', skip_noimg=True):
ROOT = LMUDataRoot()
# You can override this variable to save image files to a different directory
self.dataset_name = dataset
self.img_root = osp.join(ROOT, 'images', img_root_map(dataset))
data = self.load_data(dataset)
self.skip_noimg = skip_noimg
if skip_noimg and 'image' in data:
data = data[~pd.isna(data['image'])]
data['index'] = [str(x) for x in data['index']]
self.meta_only = True
# The image field can store the base64 encoded image or another question index (for saving space)
if 'image' in data:
data['image'] = [str(x) for x in data['image']]
image_map = {x: y for x, y in zip(data['index'], data['image'])}
for k in image_map:
if len(image_map[k]) <= 64:
idx = image_map[k]
assert idx in image_map and len(image_map[idx]) > 64
image_map[k] = image_map[idx]
images = [toliststr(image_map[k]) for k in data['index']]
data['image'] = [x[0] if len(x) == 1 else x for x in images]
self.meta_only = False
if 'image_path' in data:
paths = [toliststr(x) for x in data['image_path']]
data['image_path'] = [x[0] if len(x) == 1 else x for x in paths]
if np.all([istype(x, int) for x in data['index']]):
data['index'] = [int(x) for x in data['index']]
self.data = data
self.post_build(dataset)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
return dict(self.data.iloc[idx])
def prepare_tsv(self, url, file_md5=None):
data_root = LMUDataRoot()
os.makedirs(data_root, exist_ok=True)
update_flag = False
file_name = url.split('/')[-1]
data_path = osp.join(data_root, file_name)
if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
pass
else:
warnings.warn('The dataset tsv is not downloaded')
download_file(url, data_path)
update_flag = True
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
def dump_image(self, line):
os.makedirs(self.img_root, exist_ok=True)
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(self.img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
def display(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
assert isinstance(line, pd.Series) or isinstance(line, dict)
mmqa_display(line)
# Return a list of dataset names that are supported by this class, can override
@classmethod
def supported_datasets(cls):
return list(cls.DATASET_URL)
# Given the dataset name, return the dataset as a pandas dataframe, can override
def load_data(self, dataset):
url = self.DATASET_URL[dataset]
file_md5 = self.DATASET_MD5[dataset] if dataset in self.DATASET_MD5 else None
return self.prepare_tsv(url, file_md5)
# Post built hook, will be called after the dataset is built, can override
def post_build(self, dataset):
pass
# Given one data record, return the built prompt (a multi-modal message), can override
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=question))
return msgs
# Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
@abstractmethod
def evaluate(self, eval_file, **judge_kwargs):
pass
from .image_base import ImageBaseDataset
from ..smp import *
class COCO_Caption_Scorer():
def __init__(self, ref, gt):
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
self.ref = ref
self.gt = gt
print('setting up scorers...')
self.scorers = [
(Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']),
(Rouge(), 'ROUGE_L'),
(Cider(), 'CIDEr'),
]
def compute_scores(self):
total_scores = {}
for scorer, method in self.scorers:
print('computing %s score...' % (scorer.method()))
score, scores = scorer.compute_score(self.gt, self.ref)
if isinstance(method, list):
for sc, scs, m in zip(score, scores, method):
print('%s: %0.3f' % (m, sc * 100))
total_scores['Bleu'] = [x * 100 for x in score]
else:
print('%s: %0.3f' % (method, score * 100))
total_scores[method] = score * 100
print('*****DONE*****')
for key, value in total_scores.items():
print('{}:{}'.format(key, value))
return total_scores
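# Both `ref` and `gt` map a sample id to a list of caption strings, e.g. (illustrative values)
#   ref = {'0': ['a dog runs on the beach']}
#   gt  = {'0': ['a dog running along the shore', 'a puppy on the sand']}
# ImageCaptionDataset.evaluate below builds them from the 'prediction' and 'answer' columns.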
class ImageCaptionDataset(ImageBaseDataset):
TYPE = 'Caption'
DATASET_URL = {
'COCO_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/COCO_VAL.tsv',
}
DATASET_MD5 = {
'COCO_VAL': '72a5079dead060269ac222c5aa5128af',
}
def load_data(self, dataset):
data = super().load_data(dataset)
if 'question' not in data:
data['question'] = [(
'Please describe this image in general. Directly provide the description, '
'do not include prefix like "This image depicts". '
)] * len(data)
return data
# It returns a dictionary of scores
@classmethod
def evaluate(self, eval_file, **kwargs):
data = load(eval_file)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
ref, gt = {}, {}
for i, line in enumerate(lines):
ref[str(i)] = [str(line['prediction'])]
gt[str(i)] = eval(line['answer'])
scorer = COCO_Caption_Scorer(ref, gt)
coco_caption_score_dict = scorer.compute_scores()
score_pth = eval_file.replace('.xlsx', '_score.json')
dump(coco_caption_score_dict, score_pth)
return coco_caption_score_dict
import warnings
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..smp import *
import pandas as pd
MMMB_URLS = {
'MMMB_ar': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_ar.tsv',
'MMMB_cn': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_cn.tsv',
'MMMB_en': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_en.tsv',
'MMMB_pt': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_pt.tsv',
'MMMB_ru': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_ru.tsv',
'MMMB_tr': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_tr.tsv',
}
MTL_MMBench_URLS = {
'MMBench_dev_ar': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_ar.tsv',
'MMBench_dev_cn': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_cn.tsv',
'MMBench_dev_en': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_en.tsv',
'MMBench_dev_pt': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_pt.tsv',
'MMBench_dev_tr': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_tr.tsv',
'MMBench_dev_ru': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_ru.tsv',
}
MMMB_MD5 = {
'MMMB_ar': 'f3a18b6385f1d9701840aa42de27aead', 'MMMB_cn': '13ed82fa89730037292fcaa27f08f430',
'MMMB_en': '1cd781a71ec5a2983c090b84105d6a01', 'MMMB_pt': '548ea2b3bb2da991790386f0015d30d1',
'MMMB_ru': 'ce1cc8a0533425ab0d86b326ebfc2984', 'MMMB_tr': '0733739d43090327975294292bc5cd67'
}
MTL_MMBench_MD5 = {
'MMBench_dev_ar': '4271b4a0d0200e1a86380a878e0d64a4', 'MMBench_dev_cn': '2ed5135326fed02c8e51ea50dda8222f',
'MMBench_dev_en': 'd9ab776fc018b3d45785e9a5c23431c2', 'MMBench_dev_pt': '4ddfbcd27ef12444b908c03831cd0295',
'MMBench_dev_tr': '4fab39d501389d3d6cc90264bb708f11', 'MMBench_dev_ru': '5ba1171ff2e68f80637bf78349e402a5'
}
class ImageMCQDataset(ImageBaseDataset):
TYPE = 'MCQ'
DATASET_URL = {
# MMBench v1.0
'MMBench_DEV_EN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_EN.tsv',
'MMBench_TEST_EN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_EN.tsv',
'MMBench_DEV_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_CN.tsv',
'MMBench_TEST_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_CN.tsv',
'MMBench': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench.tsv', # Internal
'MMBench_CN': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_CN.tsv', # Internal
# MMBench v1.1
'MMBench_DEV_EN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_EN_V11.tsv',
'MMBench_TEST_EN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_EN_V11.tsv',
'MMBench_DEV_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_CN_V11.tsv',
'MMBench_TEST_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_CN_V11.tsv',
'MMBench_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_V11.tsv', # Internal
'MMBench_CN_V11': 'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_CN_V11.tsv', # Internal
# SEEDBench Series
'SEEDBench_IMG': 'https://opencompass.openxlab.space/utils/benchmarks/SEEDBench/SEEDBench_IMG.tsv',
'SEEDBench2': 'https://huggingface.co/datasets/VLMEval/SEEDBench2/resolve/main/SEEDBench2.tsv',
'SEEDBench2_Plus': 'https://opencompass.openxlab.space/utils/benchmarks/SEEDBench/SEEDBench2_Plus.tsv',
# ScienceQA Series
'ScienceQA_VAL': 'https://opencompass.openxlab.space/utils/benchmarks/ScienceQA/ScienceQA_VAL.tsv',
'ScienceQA_TEST': 'https://opencompass.openxlab.space/utils/benchmarks/ScienceQA/ScienceQA_TEST.tsv',
# MMT-Bench
'MMT-Bench_ALL_MI': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_ALL_MI.tsv',
'MMT-Bench_ALL': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_ALL.tsv',
'MMT-Bench_VAL_MI': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_VAL_MI.tsv',
'MMT-Bench_VAL': 'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_VAL.tsv',
# AesBench
'AesBench_VAL': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_VAL.tsv',
'AesBench_TEST': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_TEST.tsv',
# Q-Bench1
'Q-Bench1_VAL': 'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_VAL.tsv',
'Q-Bench1_TEST': 'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_TEST.tsv',
# A-Bench
'A-Bench_VAL': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_VAL.tsv',
'A-Bench_TEST': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_TEST.tsv',
# R-Bench
'R-Bench-Dis': 'https://huggingface.co/datasets/lcysyzxdxc/R-Bench/blob/main/R-bench-dis.tsv',
'R-Bench-Ref': 'https://huggingface.co/datasets/lcysyzxdxc/R-Bench/blob/main/R-bench-ref.tsv',
# Other Benchmarks
'CCBench': 'https://opencompass.openxlab.space/utils/VLMEval/CCBench.tsv',
'AI2D_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST.tsv',
'AI2D_TEST_NO_MASK': 'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST_NO_MASK.tsv',
'MMStar': 'https://opencompass.openxlab.space/utils/VLMEval/MMStar.tsv',
'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv',
'MLLMGuard_DS': 'https://opencompass.openxlab.space/utils/VLMEval/MLLMGuard_DS.tsv',
'BLINK': 'https://opencompass.openxlab.space/utils/VLMEval/BLINK.tsv',
'TaskMeAnything_v1_imageqa_random': (
'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random/'
'resolve/main/TaskMeAnything-v1-imageqa-random.tsv'
),
'A-OKVQA': 'https://huggingface.co/datasets/Allen8/A-OKVQA/resolve/main/a-okvqa.tsv',
'WorldMedQA-V': 'https://opencompass.openxlab.space/utils/VLMEval/WorldMedQA-V.tsv',
'VisOnlyQA-VLMEvalKit': (
'https://huggingface.co/datasets/ryokamoi/VisOnlyQA_Eval_Real/'
'resolve/main/visonlyqa_vlmevalkit.tsv'
),
}
DATASET_MD5 = {
# MMBench v1.0
'MMBench_DEV_EN': 'b6caf1133a01c6bb705cf753bb527ed8',
'MMBench_TEST_EN': '6939fadb0ce626fefc0bdc9c64efc528',
'MMBench_DEV_CN': '08b8fc3324a5ed74155350f57be69fbd',
'MMBench_TEST_CN': '7e1239baf0ee4c8b513e19705a0f317e',
'MMBench': '4115aea3383f3dd0083be6a633e0f820', # Internal Only
'MMBench_CN': '2e053ffc90ea598b1feae13c36dc13ee', # Internal Only
# MMBench v1.1
'MMBench_DEV_EN_V11': '30c05be8f2f347a50be25aa067248184',
'MMBench_TEST_EN_V11': '26f0f15381a21720255091d3e0316ce6',
'MMBench_DEV_CN_V11': '593f9b5f6bea453d870a798b34ae4f37',
'MMBench_TEST_CN_V11': '74bbe4556dac745613c7cbe5ad787050',
'MMBench_V11': 'b9276414f57af1308dcc4d0cd9b42e7c', # Internal Only
'MMBench_CN_V11': '95f6980dd1b4de38e3cbffe0305a3f25', # Internal Only
# SEEDBench
'SEEDBench_IMG': '68017231464752261a2526d6ca3a10c0',
'SEEDBench2': '4ec15cf864c4f16274112284f531813e',
'SEEDBench2_Plus': 'e32d3216dc4f452b0fe497a52015d1fd',
# ScienceQA
'ScienceQA_VAL': '96320d05e142e585e7204e72affd29f3',
'ScienceQA_TEST': 'e42e9e00f9c59a80d8a5db35bc32b71f',
# MMT-Bench
'MMT-Bench_ALL_MI': '5272157097e19cdd7cb41e412ab3b7c7',
'MMT-Bench_ALL': 'b273a2f4c596fe4f2605de0494cd632f',
'MMT-Bench_VAL_MI': 'c7d7b998eb5cd9aa36c7d4f721472462',
'MMT-Bench_VAL': '8dd4b730f53dbf9c3aed90ca31c928e0',
# AesBench
'AesBench_VAL': '3edb0c319e9187aa0b97fe7a11700a8c',
'AesBench_TEST': '58b1f7ba2cc32e1d68896d6ee716bbf8',
# Q-Bench1
'Q-Bench1_VAL': '837bdb6cd2da571713543462815187b7',
'Q-Bench1_TEST': '15e759bfd58c9d5f30b23a317d347153',
# A-Bench
'A-Bench_VAL': '218563ec50d34bb336c814143a5bb9c1',
'A-Bench_TEST': '567013fb033a20cf23f51d8e865bd16c',
# R-Bench
'R-Bench-Dis': 'd6e961dbfc43350688af2560226830b4',
'R-Bench-Ref': '270c1cb555acb523f3fdb178ed57021d',
# Other Benchmarks
'CCBench': 'f5dde47f24dc5a6fb6e595b409b466ac',
'AI2D_TEST': '0f593e0d1c7df9a3d69bf1f947e71975',
'AI2D_TEST_NO_MASK': 'fd8f463634d4fe9fbd23b876e8eea5be',
'MMStar': 'e1ecd2140806c1b1bbf54b43372efb9e',
'RealWorldQA': '92321028d2bc29040284b6674721e48f',
'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f',
'BLINK': '3b6649b6a662184ea046908e5506260e',
'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889',
'WorldMedQA-V': '441e63875e30c87f5750528b57b41285',
"VisOnlyQA-VLMEvalKit": 'cf460a31d2acb8d3a7cecd0e69298bfa',
}
DATASET_URL.update(MMMB_URLS)
DATASET_URL.update(MTL_MMBench_URLS)
DATASET_MD5.update(MMMB_MD5)
DATASET_MD5.update(MTL_MMBench_MD5)
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
prompt += 'Please select the correct answer from the options above. \n'
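        # A typical MCQ prompt built above looks like:
        #   Hint: <hint>              (only when a hint is present)
        #   Question: <question>
        #   Options:
        #   A. <option A>
        #   B. <option B>
        #   Please select the correct answer from the options above.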
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import report_acc, report_acc_MMT, mcq_circular_eval, mcq_vanilla_eval
# assert dataset is not None
dataset_map = {
'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11',
'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11'
}
dataset = self.dataset_name
if dataset in dataset_map:
dataset = dataset_map[dataset]
nproc = judge_kwargs.pop('nproc', 4)
circular = False
if listinstr(['mmbench', 'ccbench'], dataset.lower()):
data = load(eval_file)
data['index'] = [int(x) for x in data['index']]
dump(data, eval_file)
circular = True
suffix = eval_file.split('.')[-1]
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
# If not choice label, then use lower case
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
meta = self.data
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
)
if circular:
data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name)
else:
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
# load split
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
# May have different report acc functions for different datasets
if 'MMT' in dataset:
acc = report_acc_MMT(data)
else:
acc = report_acc(data)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(acc, score_file)
if dataset == 'AesBench_VAL':
warnings.warn('Note that AesBench VAL is just a toy version of AesBench TEST. For full results, \
please evaluate on AesBench TEST. The AesBench TEST dataset is more than 20 times \
larger than the VAL dataset and the leaderboard results are based on AesBench TEST.')
if dataset == 'VisOnlyQA-VLMEvalKit':
warnings.warn('Note that the results on VisOnlyQA-VLMEvalKit are different from the results on \
the original VisOnlyQA. VisOnlyQA-VLMEvalKit does not include the \
chemistry__shape_multi split and uses a different evaluation prompt. Please \
explicitly specify the version of the dataset when you report results.')
return acc
class MMMUDataset(ImageMCQDataset):
DATASET_URL = {
'MMMU_DEV_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv',
'MMMU_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_TEST.tsv',
}
DATASET_MD5 = {
'MMMU_DEV_VAL': '521afc0f3bf341e6654327792781644d',
'MMMU_TEST': 'c19875d11a2d348d07e5eb4bdf33166d',
}
@staticmethod
def split_MMMU(msgs):
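        # MMMU questions reference their images inline with tags such as '<image 1>'. The text
        # is split at those tags and the referenced images are interleaved, e.g.
        # 'Compare <image 1> with <image 2>.' -> [text, image_1, text, image_2, text].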
text, images = None, []
for s in msgs:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
assert text is None
text = s['value']
text_segs = text.split('<image ')
if len(text_segs) == 1:
return msgs
segs = [dict(type='text', value=text_segs[0])]
for i, seg in enumerate(text_segs):
if i == 0:
continue
assert istype(seg[0], int) and seg[1] == '>'
image_idx = int(seg[0]) - 1
segs.append(dict(type='image', value=images[image_idx]))
segs.append(dict(type='text', value=seg[2:]))
return segs
def build_prompt(self, line):
msgs = super().build_prompt(line)
msgs = self.split_MMMU(msgs)
return msgs
class MUIRDataset(ImageMCQDataset):
DATASET_URL = {
'MUIRBench': 'http://opencompass.openxxlab.com/utils/VLMEval/MUIRBench.tsv'
}
DATASET_MD5 = {
'MUIRBench': '2e5e6fd7699761b08a7cb3ab8c0c2ec8'
}
@staticmethod
def split_MUIR(msgs):
text, images = None, []
# Separate images and text from msgs
for s in msgs:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
assert text is None # Ensure only one text entry is expected
text = s['value']
# Split text by <image> tags
text_segs = text.split('<image>')
# Initialize the segments list
segs = []
# Iterate through the text segments and images
for i, seg in enumerate(text_segs):
# Append the image if this is not the first segment and there are still images left
if i > 0 and i - 1 < len(images):
segs.append(dict(type='image', value=images[i - 1]))
# Append the text segment (if it's non-empty)
if len(seg) > 0:
segs.append(dict(type='text', value=seg))
return segs
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
        options_prompt = '\n'.join([f'{key}. {item}' for key, item in options.items()])
prompt = ''
prompt += f'{question}\n'
if len(options):
prompt += options_prompt
prompt += "\nAnswer with the option's letter from the given choices directly."
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
msgs = self.split_MUIR(msgs)
return msgs
class GMAIMMBenchDataset(ImageMCQDataset):
DATASET_URL = {
'GMAI-MMBench_VAL': 'https://huggingface.co/datasets/VLMEval/GMAI-MMBench/resolve/main/GMAI-MMBench_VAL.tsv',
'GMAI_mm_bench_TEST_part_1': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_1.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_2': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_2.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_3': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_3.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_4': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_4.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_5': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_5.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_6': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_6.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_7': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_7.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_8': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_8.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_9': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_9.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_10': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_10.tsv', # noqa: E501
'GMAI_mm_bench_TEST_part_11': 'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_11.tsv', # noqa: E501
}
DATASET_MD5 = {
'GMAI-MMBench_VAL': '254bd581627866f1c499d3d6b4422324',
'GMAI_mm_bench_TEST_part_1': '900d735231230a63f4ed45665c078ef4',
'GMAI_mm_bench_TEST_part_2': '1b27ab621386945d7e4a765ad2d22b0e',
'GMAI_mm_bench_TEST_part_3': '44bdc2b6267dd505d529b8cad06f0fb2',
'GMAI_mm_bench_TEST_part_4': '5a04a04fcac9f1466709f242fdb80acb',
'GMAI_mm_bench_TEST_part_5': 'c70baf8909eda9af0ddeab275c721336',
'GMAI_mm_bench_TEST_part_6': '825abc39596b644dead9350d0cfa3b96',
'GMAI_mm_bench_TEST_part_7': 'defb8aed2fb77365a76b6b9abd6a2701',
'GMAI_mm_bench_TEST_part_8': 'ff490d60b85f2bb0abb67a435b298c65',
'GMAI_mm_bench_TEST_part_9': 'ff67c86f40da93b09139ac1d1ba5dc6b',
'GMAI_mm_bench_TEST_part_10': '3dae94627b9ac0fe00180d4780fbf6dc',
'GMAI_mm_bench_TEST_part_11': 'd08dc813f0eb6bbab63cae2a9d113c4b',
}
@classmethod
def supported_datasets(cls):
return ['GMAI-MMBench_VAL', 'GMAI-MMBench_TEST']
def load_data(self, dataset):
if dataset == 'GMAI-MMBench_VAL':
data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL'):
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
elif dataset == 'GMAI-MMBench_TEST':
dfs = []
for part_num in range(1, 12):
part_name = f'GMAI_mm_bench_TEST_part_{part_num}'
url = self.DATASET_URL[part_name]
file_md5 = self.DATASET_MD5.get(part_name)
tsv_path = osp.join(LMUDataRoot(), f'{part_name}.tsv')
if not osp.exists(tsv_path) or (file_md5 and md5(tsv_path) != file_md5):
download_file(url, filename=tsv_path)
local_path = tsv_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL'):
from ..tools import LOCALIZE
LOCALIZE(tsv_path, local_path)
tsv_path = local_path
                # Load this part of the data
df = load(tsv_path)
dfs.append(df)
            # Merge all parts
data = pd.concat(dfs, ignore_index=True)
return data
else:
raise ValueError(f"未知的数据集:{dataset}")
def report_acc_by_groups(self, df, group_column):
res = defaultdict(list)
# Check for the 'split' column
if 'split' in df:
splits = list(set(df['split']))
res['split'] = splits
else:
df['split'] = ['none'] * len(df)
res['split'] = ['none']
res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']]
if group_column not in df:
raise ValueError(f"Column '{group_column}' not found in dataframe.") # noqa: E713
abilities = list(set(df[group_column]))
abilities = ['None' if isinstance(ab, float) and pd.isna(ab) else ab for ab in abilities]
abilities.sort()
for ab in abilities:
ab_name = ab
sub_df = df[df[group_column] == ab]
res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]
return pd.DataFrame(res)
def evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import report_acc, mcq_vanilla_eval
nproc = judge_kwargs.pop('nproc', 4)
suffix = eval_file.split('.')[-1]
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
# If not choice label, then use lower case
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
meta = self.data
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
)
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
# load split
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
acc = report_acc(data)
for group_col in ['clinical vqa task', 'department', 'perceptual granularity']:
acc_grouped = self.report_acc_by_groups(data, group_col)
score_file_grouped = eval_file.replace(f'.{suffix}', f'_{group_col}_acc.csv')
dump(acc_grouped, score_file_grouped)
return acc
class MMERealWorld(ImageMCQDataset):
TYPE = 'MMERealWorld'
DATASET_MD5 = {
'MME-RealWorld': '271c33ec814c39533c467ec6fb8a6f36',
'MME-RealWorld-Lite': '4c17057d7d3b6c4a0d4397c3dae0881c',
'MME-RealWorld-CN': 'daaa763d52a760a38606d5dedb3fe444',
}
SYS = {
'MME-RealWorld': (
'Select the best answer to the above multiple-choice question based on the image. '
'Respond with only the letter (A, B, C, D, or E) of the correct option. \n'
'The best answer is:'
),
'MME-RealWorld-Lite': (
'Select the best answer to the above multiple-choice question based on the image. '
'Respond with only the letter (A, B, C, D, or E) of the correct option. \n'
'The best answer is:'
),
'MME-RealWorld-CN': (
'根据图像选择上述多项选择题的最佳答案。只需回答正确选项的字母(A, B, C, D 或 E)。\n'
'最佳答案为:'
),
}
@classmethod
def supported_datasets(cls):
return ['MME-RealWorld', 'MME-RealWorld-CN', 'MME-RealWorld-Lite',]
def load_data(
self, dataset="MME-RealWorld", repo_id="yifanzhang114/MME-RealWorld-Base64"
):
def check_integrity(pth):
data_file = osp.join(pth, f"{dataset}.tsv")
if not os.path.exists(data_file):
return False
if md5(data_file) != self.DATASET_MD5[dataset]:
return False
return True
def generate_tsv(pth):
tsv_file = os.path.join(pth, f"{dataset}.tsv")
if os.path.exists(tsv_file):
print(f"{tsv_file} already exists.")
return
json_dir = os.path.join(pth, dataset)
json_files = [f for f in os.listdir(json_dir) if f.endswith(".json")]
data_list = []
for json_file in json_files:
with open(os.path.join(json_dir, json_file), "r") as f:
data = json.load(f)
for item in tqdm(data):
choice_prompt = (
"The choices are listed below:\n"
if dataset in ["MME-RealWorld", "MME-RealWorld-Lite"]
else "选项如下所示:\n"
)
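                    # Each raw option string is assumed to carry a 4-character label prefix
                    # (e.g. '(A) '), which is stripped via the [4:] slices below.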
data_list.append(
{
"index": item["index"],
"image": item["image"],
"question": item["question"],
"multi-choice options": choice_prompt
+ "\n".join(item["multi-choice options"]),
"A": item["multi-choice options"][0][4:],
"B": item["multi-choice options"][1][4:],
"C": item["multi-choice options"][2][4:],
"D": item["multi-choice options"][3][4:],
"E": item["multi-choice options"][4][4:],
"answer": item["answer"],
"category": item["category"],
"l2-category": item["l2-category"],
}
)
df = pd.DataFrame(data_list)
df.to_csv(tsv_file, sep="\t", index=False)
print(f"TSV file saved to {tsv_file}")
# Check if dataset is cached and has integrity
if dataset == "MME-RealWorld-Lite":
url = 'https://huggingface.co/datasets/yifanzhang114/MME-RealWorld-Base64/resolve/main/mme_realworld_lite.tsv' # noqa: E501
file_md5 = (
self.DATASET_MD5[dataset] if dataset in self.DATASET_MD5 else None
)
datas = self.prepare_tsv(url, file_md5)
choice_prompt = "The choices are listed below:\n"
for index, item in datas.iterrows():
options = eval(item["multi-choice options"])
datas.loc[index, "multi-choice options"] = choice_prompt + "\n".join(
options
)
datas.loc[index, "A"] = options[0][4:]
datas.loc[index, "B"] = options[1][4:]
datas.loc[index, "C"] = options[2][4:]
datas.loc[index, "D"] = options[3][4:]
datas.loc[index, "E"] = options[4][4:]
return datas
update_flag = False
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
print(f"Using cached dataset from {cache_path}")
else:
from huggingface_hub import snapshot_download
# Download or find the dataset path
dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
generate_tsv(dataset_path)
update_flag = True
data_path = os.path.join(dataset_path, f"{dataset}.tsv")
if file_size(data_path, "GB") > 1:
local_path = data_path.replace(".tsv", "_local.tsv")
if (
not osp.exists(local_path)
or os.environ.get("FORCE_LOCAL", None)
or update_flag
):
from vlmeval.tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
def post_build(self, dataset):
self.TYPE = 'MMERealWorld'
# Given one data record, return the built prompt (a multi-modal message), can override
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
choice_prompt = line['multi-choice options'] + '\n'
question += ' ' + choice_prompt + self.SYS[self.dataset_name]
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=question))
return msgs
# It returns a dictionary
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import extract_characters_regex, get_dimension_rating
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
FAIL_MSG = 'Failed to obtain answer via API.'
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
tgt_file = eval_file.replace('.xlsx', '_rating.json')
score_file = eval_file.replace('.xlsx', '_score.xlsx')
if not osp.exists(score_file):
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if FAIL_MSG not in v}
data = load(eval_file)
cnt_rejected = 0
data_un = data[~pd.isna(data['prediction'])]
for idx in data['index']:
ans = data.loc[data['index'] == idx, 'answer'].values[0]
pred = data.loc[data['index'] == idx, 'prediction'].values[0]
extract_pred = extract_characters_regex(pred)
if extract_pred == '':
cnt_rejected += 1
data.loc[data['index'] == idx, 'score'] = 0
else:
data.loc[data['index'] == idx, 'score'] = int(extract_pred == ans)
print(
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
f'failed to obtain the score for another {cnt_rejected} questions. '
f'Those questions will be counted as 0 score in ALL rating.'
)
dump(data, score_file)
rating = get_dimension_rating(score_file)
dump(rating, tgt_file)
return rating
class HRBenchDataset(ImageMCQDataset):
DATASET_URL = {
'HRBench4K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_4k.tsv',
'HRBench8K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_8k.tsv',
}
DATASET_MD5 = {
'HRBench4K': 'f6b041b03d49543494b8a56d2e35be65',
'HRBench8K': '274c9c7f89329b804a4723178a00219c',
}
def evaluate(self, eval_file, **judge_kwargs):
assert os.path.exists(eval_file), '{} does not exist!'.format(eval_file)
from .utils.multiple_choice import mcq_vanilla_eval
from .utils.hrbench import report_acc_hrbench
nproc = judge_kwargs.pop('nproc', 4)
suffix = eval_file.split('.')[-1]
        model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
# If not choice label, then use lower case
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
meta = self.data
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
if osp.exists(score_file):
acc = load(score_file)
return acc
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
acc = report_acc_hrbench(data)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(acc, score_file)
return acc
class CustomMCQDataset(ImageMCQDataset):
def load_data(self, dataset):
data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
class NaturalBenchDataset(ImageMCQDataset):
DATASET_URL = {
'NaturalBenchDataset': (
'https://huggingface.co/datasets/BaiqiL/'
'NaturalBench/resolve/main/NaturalBenchDataset.tsv'
),
}
DATASET_MD5 = {
'NaturalBenchDataset':'dbe25b044bc35696426381e9ba4fe930',
}
def build_prompt(self, line):
SUFFIX_FOR_VQA = {
"yes_no": "Please answer Yes or No.",
"multiple_choice": "Please output the letter corresponding to the correct option."
}
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
prompt = f'{question} {SUFFIX_FOR_VQA[line["type"]]}'
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def evaluate(self, eval_file, **judge_kwargs):
from .utils.naturalbench import extract_answer, get_scores
data = load(eval_file)
data = data.sort_values(by='index')
predictions = [str(x) for x in data['prediction']]
answers = [str(x) for x in data['answer']]
indexs = [str(x) for x in data['index']]
meta = self.data
types = [str(x) for x in meta['type']]
results = {}
assert len(predictions) == len(answers) == len(indexs) == len(types) == (1900 * 4)
number_answered_samples = len(predictions) // 4
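        # NaturalBench groups every four consecutive rows into one sample: two questions
        # (q0, q1), each paired with two images (i0, i1), so predictions are consumed in
        # blocks of four below.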
for i in range(number_answered_samples):
results[i] = {
"q0_i0": extract_answer(predictions[i * 4], types[i * 4]),
"q0_i1": extract_answer(predictions[i * 4 + 1], types[i * 4 + 1]),
"q1_i0": extract_answer(predictions[i * 4 + 2], types[i * 4 + 2]),
"q1_i1": extract_answer(predictions[i * 4 + 3], types[i * 4 + 3])
}
scores = get_scores(results)
print(scores)
score_file = 'NaturalBench_acc.csv'
df = pd.DataFrame(list(scores.items()), columns=['Metric', 'Score'])
dump(df, score_file)
return scores
from .image_base import ImageBaseDataset
from .utils.judge_util import build_judge
from ..smp import *
from ..utils import track_progress_rich
class ImageMTDataset(ImageBaseDataset):
TYPE = 'MT'
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
questions = toliststr(line['question'])
if 'answer' in line:
answers = toliststr(line['answer'])
else:
answers = [''] * len(questions)
assert len(questions) == len(answers)
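        # Each question may embed '<ImageHere>' placeholders; images from tgt_path are consumed
        # in order to fill them, and every (question, answer) pair becomes a user / assistant
        # turn in the returned multi-turn dialogue.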
dlgs, pics_number = [], 0
for i in range(len(questions)):
q, a = questions[i], answers[i]
if '<ImageHere>' in q:
content = []
tag_number = q.count('<ImageHere>')
images = tgt_path[pics_number: pics_number + tag_number]
pics_number += tag_number
q_split = q.split('<ImageHere>')
                for j in range(tag_number):
                    qsp, im = q_split[j], images[j]
if qsp != '':
content.append(dict(type='text', value=qsp))
content.append(dict(type='image', value=im))
if q_split[-1] != '':
content.append(dict(type='text', value=q_split[-1]))
else:
content = [dict(type='text', value=q)]
dlgs.append(dict(role='user', content=content))
assert '<ImageHere>' not in a, 'We currently do not support images in the answer. '
content = [dict(type='text', value=a)]
dlgs.append(dict(role='assistant', content=content))
return dlgs
class MMDUDataset(ImageMTDataset):
DATASET_URL = {'MMDU': 'https://opencompass.openxlab.space/utils/VLMEval/MMDU.tsv'}
DATASET_MD5 = {'MMDU': '848b635a88a078f49aebcc6e39792061'}
DIMS = [
'Creativity', 'Richness', 'Visual Perception', 'Logical Coherence',
'Answer Accuracy', 'Image Relationship Understanding', 'Overall Score'
]
    def calculate_metric(self, ans):
all = defaultdict(lambda: 0)
tot = defaultdict(lambda: 0)
valid = defaultdict(lambda: 0)
for k in ans:
res = ans[k]['res']
assert isinstance(res, pd.DataFrame)
lt = len(res)
for i in range(lt):
line = res.iloc[i]
for k in self.DIMS:
tot[k] += 1
if k in line and line[k] is not None:
try:
score = int(line[k])
score = np.clip(score, 0, 10)
all[k] += score
valid[k] += 1
except Exception as e:
print(f'Failed to parse the score: {str(e)}')
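        # Two normalizations are reported: 'all' divides by every judged sample (unparsable
        # scores count as 0), 'valid' divides only by samples whose score could be parsed;
        # both are scaled from the judge's 0-10 range to 0-100.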
sp1 = {'set': 'all'}
sp1.update({k: all[k] / tot[k] * 10 for k in self.DIMS})
sp2 = {'set': 'valid'}
sp2.update({k: all[k] / valid[k] * 10 for k in self.DIMS})
return pd.DataFrame([sp1, sp2])
def evaluate(self, eval_file, **judge_kwargs):
suffix = eval_file.split('.')[-1]
model = judge_kwargs['model']
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
nproc = judge_kwargs.pop('nproc', 4)
data = load(eval_file)
model = judge_kwargs.pop('model', 'gpt-4o')
judge_model = build_judge(model=model, **judge_kwargs)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(judge_model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
from .utils.mmdu import mmdu_score
if len(indices):
new_results = track_progress_rich(
mmdu_score,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,)
ans = load(tmp_file)
for k, v in zip(indices, new_results):
assert k in ans
        metric = self.calculate_metric(ans)
dump(metric, score_file)
return metric
import os
import re
import tempfile
from functools import partial
import pandas as pd
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..smp import *
from ..utils import track_progress_rich
class ImageVQADataset(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'OCRVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TEST.tsv',
'OCRVQA_TESTCORE': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TESTCORE.tsv',
'TextVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/TextVQA_VAL.tsv',
'DocVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv',
'DocVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_TEST.tsv',
'InfoVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_VAL.tsv',
'InfoVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_TEST.tsv',
'ChartQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/ChartQA_TEST.tsv',
'GQA_TestDev_Balanced': 'https://opencompass.openxlab.space/utils/VLMEval/GQA_TestDev_Balanced.tsv',
}
DATASET_MD5 = {
'OCRVQA_TEST': 'ca46a6d74b403e9d6c0b670f6fc00db9',
'OCRVQA_TESTCORE': 'c5239fe77db8bdc1f2ad8e55e0d1fe97',
'TextVQA_VAL': 'b233b31f551bbf4056f2f955da3a92cd',
'DocVQA_VAL': 'd5ee77e1926ff10690d469c56b73eabf',
'DocVQA_TEST': '6a2f28cac26ef2d3447374e8c6f6c8e9',
'InfoVQA_VAL': '2342e9c225222f0ef4dec545ebb126fe',
'InfoVQA_TEST': 'df535bf51b88dc9718252c34131a6227',
'ChartQA_TEST': 'c902e0aa9be5582a7aad6dcf52734b42',
'GQA_TestDev_Balanced': 'fead7df22befc1ed3ca2b62ea26fa17b',
}
def build_prompt(self, line):
msgs = super().build_prompt(line)
assert msgs[-1]['type'] == 'text'
msgs[-1]['value'] += '\nAnswer the question using a single word or phrase.'
return msgs
# It returns a DataFrame
def evaluate(self, eval_file, **judge_kwargs):
from .utils.vqa_eval import hit_calculate, process_line
data = load(eval_file)
dataset = self.dataset_name
assert 'answer' in data and 'prediction' in data
data['prediction'] = [str(x) for x in data['prediction']]
data['answer'] = [str(x) for x in data['answer']]
lt = len(data)
pool = mp.Pool(16)
lines = [data.iloc[i] for i in range(lt)]
if listinstr(['TextVQA'], dataset):
res = pool.map(partial(process_line, method='vqa_score'), lines)
elif listinstr(['ChartQA'], dataset):
res = pool.map(partial(process_line, method='relaxed_accuracy'), lines)
elif listinstr(['OCRVQA', 'GQA'], dataset):
res = pool.map(partial(process_line, method='accuracy'), lines)
elif listinstr(['DocVQA', 'InfoVQA'], dataset):
res = pool.map(partial(process_line, method='anls'), lines)
else: # default using vqa_score to calculate score
res = pool.map(process_line, lines)
hit = hit_calculate(res, dataset)
ret = dict()
if 'split' in data:
splits = set(data['split'])
for sp in splits:
sub = [r for l, r in zip(lines, res) if l['split'] == sp]
# [np.mean(x['match']) >= full_score_weight for x in sub]
hit = hit_calculate(sub, dataset)
ret[sp] = np.mean(hit) * 100
sub = [r for l, r in zip(lines, res)]
hit = hit_calculate(sub, dataset)
ret['Overall'] = np.mean(hit) * 100
else:
ret['Overall'] = np.mean(hit) * 100
if 'category' in data:
cates = list(set(data['category']))
cates.sort()
for c in cates:
sub = [r for l, r in zip(lines, res) if l['category'] == c]
# [np.mean(x['match']) >= full_score_weight for x in sub]
hit = hit_calculate(sub, dataset)
ret[c] = np.mean(hit) * 100
ret = d2df(ret)
ret = ret.round(2)  # round() returns a new DataFrame, so keep the result
suffix = eval_file.split('.')[-1]
result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(ret, result_file)
return ret
class VizWiz(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'VizWiz': 'https://opencompass.openxlab.space/utils/VLMEval/VizWiz.tsv'
}
DATASET_MD5 = {
'VizWiz': 'fa4ac4164467563ed2fac6eac6631bd0'
}
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.vqa_eval import hit_calculate, process_line
suffix = eval_file.split('.')[-1]
result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
if not osp.exists(result_file):
data = load(eval_file)
assert 'answers' in data and 'prediction' in data
data['prediction'] = [str(x) for x in data['prediction']]
data['answer'] = [str(x) for x in data['answers']]
lt = len(data)
pool = mp.Pool(16)
lines = [data.iloc[i] for i in range(lt)]
res = pool.map(process_line, lines)
hit = hit_calculate(res, 'VizWiz')
ret = dict()
ret['Overall'] = np.mean(hit) * 100
ret = d2df(ret)
ret = ret.round(2)  # round() returns a new DataFrame, so keep the result
dump(ret, result_file)
retz = pd.read_csv(result_file)
return retz
class OCRBench(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'OCRBench': 'https://opencompass.openxlab.space/utils/VLMEval/OCRBench.tsv'
}
DATASET_MD5 = {'OCRBench': 'e953d98a987cc6e26ef717b61260b778'}
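# Scoring is substring matching: a sample counts if any ground-truth answer appears in the
# prediction (lower-cased for most categories; for handwritten math expressions whitespace is
# stripped instead). Category scores are grouped below, and 'Final Score Norm' is the total / 10.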
# It returns a dictionary
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
OCRBench_score = {
'Regular Text Recognition': 0,
'Irregular Text Recognition': 0,
'Artistic Text Recognition': 0,
'Handwriting Recognition': 0,
'Digit String Recognition': 0,
'Non-Semantic Text Recognition': 0,
'Scene Text-centric VQA': 0,
'Doc-oriented VQA': 0,
'Key Information Extraction': 0,
'Handwritten Mathematical Expression Recognition': 0,
}
data = load(eval_file)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
for i in tqdm(range(len(lines))):
line = lines[i]
predict = str(line['prediction'])
answers = eval(line['answer'])
category = line['category']
if category == 'Handwritten Mathematical Expression Recognition':
for j in range(len(answers)):
answer = answers[j].strip().replace('\n', ' ').replace(' ', '')
predict = predict.strip().replace('\n', ' ').replace(' ', '')
if answer in predict:
OCRBench_score[category] += 1
break
else:
for j in range(len(answers)):
answer = answers[j].lower().strip().replace('\n', ' ')
predict = predict.lower().strip().replace('\n', ' ')
if answer in predict:
OCRBench_score[category] += 1
break
final_score_dict = {}
final_score_dict['Text Recognition'] = \
(OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition']
+ OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition']
+ OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'])
final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA']
final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA']
final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction']
final_score_dict['Handwritten Mathematical Expression Recognition'] = \
(OCRBench_score['Handwritten Mathematical Expression Recognition'])
final_score_dict['Final Score'] = \
(final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA']
+ final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction']
+ final_score_dict['Handwritten Mathematical Expression Recognition'])
final_score_dict['Final Score Norm'] = (float(final_score_dict['Final Score']) / 10)
score_pth = eval_file.replace('.xlsx', '_score.json')
dump(final_score_dict, score_pth)
return final_score_dict
class MathVista(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MathVista_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVista_MINI.tsv'
}
DATASET_MD5 = {'MathVista_MINI': 'f199b98e178e5a2a20e7048f5dcb0464'}
# It returns a DataFrame
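# Evaluation relies on an LLM judge (build_judge) to extract each prediction's final answer.
# Judge outputs are cached per-index in a .pkl file so interrupted runs can resume, and the
# merged 'res'/'log' columns are stored in a per-model .xlsx before MathVista_acc computes accuracy.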
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mathvista import MathVista_auxeval, MathVista_acc
model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
assert model.working(), ('MathVista evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = track_progress_rich(
MathVista_auxeval,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,
)
ans = load(tmp_file)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']
data['res'] = [ans[idx]['res'] for idx in data['index']]
data['log'] = [ans[idx]['log'] for idx in data['index']]
dump(data, storage)
score = MathVista_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
dump(score, score_pth)
return score
class MathVerse(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MathVerse_MINI': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIV.tsv', # noqa
'MathVerse_MINI_Vision_Only': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVOnly.tsv', # noqa
'MathVerse_MINI_Vision_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVDom.tsv', # noqa
'MathVerse_MINI_Vision_Intensive': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVInt.tsv', # noqa
'MathVerse_MINI_Text_Lite': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITLite.tsv', # noqa
'MathVerse_MINI_Text_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITDom.tsv', # noqa
}
DATASET_MD5 = {
'MathVerse_MINI': '5017caca32b7fa110c350a1bea861b65',
'MathVerse_MINI_Vision_Only': '68a11d4680014ac881fa37adeadea3a4',
'MathVerse_MINI_Vision_Dominant': 'b8fb63852d261ab2aaefba29cc2414d3',
'MathVerse_MINI_Vision_Intensive': '01cbd35be202bb0c4873a4186a63bc19',
'MathVerse_MINI_Text_Lite': '19e4b13bdd30b89a03b2e358bcfefa04',
'MathVerse_MINI_Text_Dominant': '4f5cd2fa6630ea00bb11d6fde1f6fe6a',
}
# It returns a DataFrame
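# Two-stage judge pipeline: stage 1 extracts the candidate answer from each prediction, stage 2
# scores the extraction against the ground truth. Each stage caches its judge outputs in its own
# .pkl file so interrupted runs can resume.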
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mathverse import MathVerse_auxeval_extract, MathVerse_auxeval_score, MathVerse_acc
model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx')
tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl')
storage_score = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
tmp_file_score = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl')
nproc = judge_kwargs.pop('nproc', 4)
# stage1: extract the answer
if not osp.exists(storage_extract):
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file_extract):
ans = load(tmp_file_extract)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = track_progress_rich(
MathVerse_auxeval_extract,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file_extract,
)
ans = load(tmp_file_extract)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['log_extract'] == v['log_extract'] and ans[k]['extract'] == v['extract']
data['extract'] = [ans[idx]['extract'] for idx in data['index']]
data['log_extract'] = [ans[idx]['log_extract'] for idx in data['index']]
dump(data, storage_extract)
# stage2: score the answer
if not osp.exists(storage_score):
data = load(storage_extract)
model = build_judge(max_tokens=128, **judge_kwargs)
assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file_score):
ans = load(tmp_file_score)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = track_progress_rich(
MathVerse_auxeval_score,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file_score,
)
ans = load(tmp_file_score)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['log_score'] == v['log_score'] and ans[k]['score'] == v['score']
data['score'] = [ans[idx]['score'] for idx in data['index']]
data['log_score'] = [ans[idx]['log_score'] for idx in data['index']]
dump(data, storage_score)
score = MathVerse_acc(storage_score)
score_pth = storage_score.replace('.xlsx', '.csv')
dump(score, score_pth)
return score
class MathVision(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MathVision': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision.tsv',
'MathVision_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision_MINI.tsv'
}
DATASET_MD5 = {
'MathVision': '93f6de14f7916e598aa1b7165589831e',
'MathVision_MINI': '060fe4fa5d868987ce179307bd5f8a33'
}
# It returns a DataFrame
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mathv import MATH_V_auxeval, MATH_V_acc
if 'model' in judge_kwargs:
model = judge_kwargs['model']
else:
model = os.path.basename(os.environ.get('LOCAL_LLM'))
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
assert model.working(), ('MATH-Vision evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = track_progress_rich(
MATH_V_auxeval,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,
)
ans = load(tmp_file)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']
data['res'] = [ans[idx]['res'] for idx in data['index']]
data['log'] = [ans[idx]['log'] for idx in data['index']]
dump(data, storage)
score = MATH_V_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
dump(score, score_pth)
return score
class OlympiadBench(ImageBaseDataset):
TYPE = 'VQA_ex_prompt'
DATASET_URL = {
'OlympiadBench': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench.tsv',
'OlympiadBench_EN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_EN.tsv',
'OlympiadBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_CN.tsv'
}
DATASET_MD5 = {
'OlympiadBench': '9735ae0f0299eae1e7d07f5a7feab914',
'OlympiadBench_EN': '5c68e100d394351fc7049f29d4d4efed',
'OlympiadBench_CN': 'ea01b16788955702c79650c701e5b623'
}
def dump_image(self, line):
os.makedirs(self.img_root, exist_ok=True)
tgt_path_z = []
if isinstance(line['image'], list):
for i in range(len(line['image'])):
tgt_path = osp.join(self.img_root, f"{line['index']}--{i + 1}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'][i], tgt_path)
tgt_path_z.append(tgt_path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path_z.append(tgt_path)
return tgt_path_z
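# build_prompt assembles a Chinese or English competition-style instruction based on
# line['source'] (zh/en, maths/physics, theorem proving vs. open-ended) and asks the model to put
# its final answer in \boxed{}, which is what extract_answer / MathJudger parse in evaluate().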
def build_prompt(self, line):
from .utils.olympiadbench import get_answer_type_text, make_input
self.is_chinese = 'zh' in line['source']
self.is_math = 'maths' in line['source']
self.is_theorem_proving = 'TP' in line['source']
if self.is_chinese:
subject_content = '数学' if self.is_math else '物理'
if self.is_theorem_proving:
prompt = (
f"以下是中国{subject_content}竞赛中的证明题。请根据题目的要求,运用逻辑推理及常用定理证明题目中的命题。"
"证明过程中使用的变量和公式请使用LaTeX格式表示。"
)
else:
answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=True,
multiple_answer=line['is_multiple_answer'])
if line['is_multiple_answer']:
multiple_answer_text = '\\boxed{用英文逗号连接的多个答案}'
else:
multiple_answer_text = '\\boxed{答案}'
unit_text = ''
if line['unit']:
multiple_answer_text += '(单位)'
unit_text = ',注意答案的单位不要放在\\boxed{}中'
prompt = (
f'以下是中国{subject_content}竞赛中的解答题{answer_type_text}。请根据题目的要求和所提供的信息计算得出答案。'
f'解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以“所以最终答案是{multiple_answer_text}。”'
f'显式给出结果{unit_text}。'
)
else:
subject_content = 'Math' if self.is_math else 'Physics'
if self.is_theorem_proving:
prompt = (
f'The following is a theorem proving problem from an International {subject_content} competition. '
'Please use logical reasoning and common theorems to prove the proposition in the problem '
'according to the given requirements. '
'Please use LaTeX format to represent the variables and formulas used in the proof.'
)
else:
if line['is_multiple_answer']:
multiple_answer_text = '\\boxed{multiple answers connected with commas}'
else:
multiple_answer_text = '\\boxed{answer}'
unit_text = ''
if line['unit']:
multiple_answer_text += '(unit)'
unit_text = ', note that the unit of the answer should not be included in \\boxed{}'
answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=False,
multiple_answer=line['is_multiple_answer'])
prompt = (
f'The following is an open-ended problem from an International {subject_content} competition. '
f'{answer_type_text}Please calculate the answer according to the given requirements and '
'the information provided. Please use LaTeX format to represent the variables and formulas '
'used in the solution process and results. Please end your solution with "So the final answer '
f'is {multiple_answer_text}." and give the result explicitly{unit_text}.'
)
if self.is_math:
input = make_input(prompt, line['question'])
else:
if 'context' in line.keys() and str(line['context']) != 'nan': # cannot be null
input = make_input(prompt, line['context'] + '\n' + line['question'])
else:
input = make_input(prompt, line['question'])
ret = [dict(type='text', value=input)]
tgt_path = self.dump_image(line)
ret.extend([dict(type='image', value=s) for s in tgt_path])
return ret
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.olympiadbench import MathJudger, extract_answer
judger = MathJudger()
suffix = eval_file.split('.')[-1]
name_str1 = 'judge'
name_str2 = 'score'
result_file = eval_file.replace(f'.{suffix}', f'_{name_str1}_result.xlsx')
score_file = eval_file.replace(f'.{suffix}', f'_{name_str2}_result.csv')
if not osp.exists(result_file):
data = load(eval_file)
scorez = []
for i in tqdm(data.iterrows()):
line = i[1]
model_answer = line['prediction']
is_chinese = 'zh' in line['source']
model_answer = extract_answer(is_chinese, model_answer, is_deepseek=False)
answer_type = line['answer_type']
final_answer = line['final_answer'][2:-2]
if str(answer_type) != 'nan' and 'Tuple' in answer_type:
judge_result = judger.judge(model_answer, final_answer)
else:
if str(line['error']) != 'nan':
if ',' in line['error']:
precisions = line['error'].split(',')
precisions = [float(p) if p else 1e-8 for p in precisions]
judge_result = judger.judge(model_answer, final_answer, precisions)
else:
precision = float(line['error'])
judge_result = judger.judge(model_answer, final_answer, precision)
else:
judge_result = judger.judge(model_answer, final_answer)
scorez.append(judge_result)
data['score'] = scorez
dump(data, result_file)
judge_file = load(result_file)
if not osp.exists(score_file):
name_list = ['OE_MM_maths_en_COMP', 'OE_MM_maths_zh_CEE', 'OE_MM_maths_zh_COMP', 'OE_MM_physics_en_COMP',
'OE_MM_physics_zh_CEE','OE_TO_maths_en_COMP', 'OE_TO_maths_zh_CEE', 'OE_TO_maths_zh_COMP',
'OE_TO_physics_en_COMP', 'OE_TO_physics_zh_CEE']
sample_list = [[] for _ in range(len(name_list))]
for i in judge_file.iterrows():
line = i[1]
for j in range(len(name_list)):
if line['source'] == name_list[j]:
sample_list[j].append(line['score'])
acc_dict = {}
correct_list = []
# fine-grained
for i in range(len(name_list)):
correct_num = 0
for j in sample_list[i]:
if j:
correct_num += 1
correct_list.append(correct_num)
acc = 100 * correct_num / len(sample_list[i])
acc_dict[name_list[i]] = [acc]
# 4 grained
labela = ['zh', 'en']
labelb = ['maths', 'physics']
grain_list = [[x,y] for x in labela for y in labelb]
for j in grain_list:
dict_name = j[0] + "_" + j[1]
correct_num = 0
full_num = 0
for i in range(len(name_list)):
if all(k in name_list[i] for k in j):
correct_num += correct_list[i]
full_num += len(sample_list[i])
acc = 100 * correct_num / full_num
acc_dict[dict_name] = [acc]
# 2 grained
grain_list = ['maths', 'physics']
for j in grain_list:
dict_name = j
correct_num = 0
full_num = 0
for i in range(len(name_list)):
if j in name_list[i]:
correct_num += correct_list[i]
full_num += len(sample_list[i])
acc = 100 * correct_num / full_num
acc_dict[dict_name] = [acc]
# AVG
correct_num = sum(correct_list)
acc = 100 * correct_num / len(judge_file)
acc_dict['AVG'] = [acc]
acc_pd = pd.DataFrame(acc_dict)
acc_pd.to_csv(score_file, index=False, encoding='gbk')
accdz = pd.read_csv(score_file)
return accdz
class LLaVABench(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {'LLaVABench': 'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv'}
DATASET_MD5 = {'LLaVABench': 'd382a093f749a697820d3dadd61c8428'}
# It returns a DataFrame
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.llavabench import (
build_prompt,
LLaVABench_atomeval,
LLaVABench_score,
)
suffix = '.' + eval_file.split('.')[-1]
record_file = eval_file.replace(suffix, '_openai_result' + suffix)
score_file = eval_file.replace(suffix, '_score.csv')
nproc = judge_kwargs.pop('nproc', 4)
system_prompt = 'You are a helpful and precise assistant for checking the quality of the answer.'
if not osp.exists(record_file):
data = load(eval_file)
lines = [data.iloc[i] for i in range(len(data))]
model = build_judge(temperature=0.2, system_prompt=system_prompt, **judge_kwargs)
assert model.working(), ('LLaVABench evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
prompts = [build_prompt(line) for line in lines]
tups = [(model, prompt) for prompt in prompts]
scores = track_progress_rich(LLaVABench_atomeval, tups, nproc=nproc, chunksize=nproc)
data['gpt4_score'] = [x[0] for x in scores]
data['score'] = [x[1] for x in scores]
dump(data, record_file)
data = load(record_file)
ret = LLaVABench_score(data).round(1)
dump(ret, score_file)
return ret
class MMVet(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MMVet': 'https://opencompass.openxlab.space/utils/VLMEval/MMVet.tsv'
}
DATASET_MD5 = {'MMVet': '748aa6d4aa9d4de798306a63718455e3'}
# It returns a DataFrame
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mmvet import MMVet_auxeval, MMVet_acc
suffix = eval_file.split('.')[-1]
model = judge_kwargs['model']
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
data = load(eval_file)
model = build_judge(max_tokens=3, **judge_kwargs)
assert model.working(), ('MMVet evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = load(tmp_file) if osp.exists(tmp_file) else {}
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = track_progress_rich(
MMVet_auxeval,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,
)
ans = load(tmp_file)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score']
data['score'] = [ans[idx]['score'] for idx in data['index']]
data['log'] = [ans[idx]['log'] for idx in data['index']]
dump(data, storage)
score, score_fine = MMVet_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
score_fine_pth = storage.replace('.xlsx', '_score_fine.csv')
dump(score, score_pth)
dump(score_fine, score_fine_pth)
return score
class MTVQADataset(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {'MTVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MTVQA_TEST.tsv'}
DATASET_MD5 = {'MTVQA_TEST': 'd87c17dbab934b7cd89c0a3c1c5657f4'}
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
data = load(eval_file)
assert 'answer' in data and 'prediction' in data and 'category' in data
data['prediction'] = [str(x) for x in data['prediction']]
data['answer'] = [str(x) for x in data['answer']]
if 'split' in data:
assert np.all([x.lower() == 'test' for x in data['split']]), 'We only support MTVQA_TEST for now. '
lt = len(data)
category_scores = defaultdict(list)
for i in range(lt):
line = data.iloc[i]
ans = line['answer'].strip().lower().replace('.', '')
pred = line['prediction'].strip().lower().replace('.', '')
cate = line['category']
score = 1.0 if ans in pred else 0.0
category_scores[cate].append(score)
category_scores['Average'].append(score)
# Calculate the average score for each category, the score is normalized to [0, 100]
category_averages = {category: np.mean(scores) * 100 for category, scores in category_scores.items()}
suffix = eval_file.split('.')[-1]
result_file = eval_file.replace(f'.{suffix}', '_acc.json')
dump(category_averages, result_file)
return category_averages
# MT-VQA adopts a custom prompt
def build_prompt(self, line):
msgs = super().build_prompt(line)
assert sum([x['type'] == 'text' for x in msgs]) == 1
for item in msgs:
if item['type'] == 'text':
item['value'] += '\nAnswer the question using a word or phrase in the language of the question.'
return msgs
class TableVQABench(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'TableVQABench': 'https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/mentor-vil/datasets/tablevqa-bench.tsv'
}
DATASET_MD5 = {'TableVQABench': '2550adc61bdc82d8e62f3b003de7c62d'}
from .utils.tablevqabench import FINTABNETQA_PROMPT, VTABFACT_PROMPT, VWTQ_PROMPT
# It returns a DataFrame
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
import pandas as pd
from .utils.tablevqabench import evaluate_fintabnet, evaluate_tabfact, evaluate_wtq
data = load(eval_file)
assert 'answer' in data and 'prediction' in data
data['prediction'] = data['prediction'].str.replace('^Answer: ', '', regex=True)
data_group = dict(tuple(data.groupby('split')))
eval_result = {'split': [], 'average_scores': []}
for split in ['fintabnetqa', 'vtabfact', 'vwtq', 'vwtq_syn']:
data_split = data_group[split].to_dict(orient='records')
if split == 'fintabnetqa':
split_eval_meta = evaluate_fintabnet(data_split, ['accuracy'])
elif split == 'vtabfact':
split_eval_meta = evaluate_tabfact(data_split, ['accuracy'])
elif split == 'vwtq' or split == 'vwtq_syn':
split_eval_meta = evaluate_wtq(data_split, ['accuracy'])
eval_result['split'].append(split)
eval_result['average_scores'].append(split_eval_meta['average_scores'])
suffix = eval_file.split('.')[-1]
result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
eval_result = pd.DataFrame(eval_result)
dump(eval_result, result_file)
return eval_result
# TableVQABench adopts a custom prompt
def build_prompt(self, line):
msgs = super().build_prompt(line)
assert sum([x['type'] == 'text' for x in msgs]) == 1
for item in msgs:
if item['type'] == 'text':
if line['split'] == 'fintabnetqa':
item['value'] = self.FINTABNETQA_PROMPT.format_map({'question': item['value']})
elif line['split'] == 'vtabfact':
item['value'] = self.VTABFACT_PROMPT.format_map({'question': item['value']})
elif line['split'] == 'vwtq_syn' or line['split'] == 'vwtq':
item['value'] = self.VWTQ_PROMPT.format_map({'question': item['value']})
return msgs
class CustomVQADataset(ImageBaseDataset):
TYPE = 'VQA'
def load_data(self, dataset):
data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
def evaluate(self, eval_file, **judge_kwargs):
raise NotImplementedError
class CRPE(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'CRPE_EXIST': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_EXIST.tsv',
'CRPE_RELATION': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_RELATION.tsv'
}
DATASET_MD5 = {
'CRPE_EXIST': '315584e23ac1ff7f8719ed3b7ad90f08',
'CRPE_RELATION': 'bad7094cde0b572288f4b119c2d0c656'}
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.crpe import is_correct
# find-image, count-text, find-text,
# infer-choose, count-image, visual-reasoning
score = {
'exist': 0,
'subject': 0,
'predicate': 0,
'object': 0,
'total': 0,
}
num = {
'exist': 0,
'subject': 0,
'predicate': 0,
'object': 0,
'total': 0,
}
final_score_dict = {
'exist': 0,
'subject': 0,
'predicate': 0,
'object': 0,
'total': 0,
}
data = load(eval_file)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
for i in tqdm(range(len(lines))):
line = lines[i]
predict = str(line['prediction'])
answers = str(line['answer'])
# print("predict =", predict)
# print("answers =", answers)
category = line['category']
if is_correct(answers, predict):
score[category] += 1
score['total'] += 1
num[category] += 1
num['total'] += 1
for category in ['exist', 'subject', 'predicate', 'object', 'total']:
if num[category] != 0:
final_score_dict[category] = score[category] / num[category]
else:
final_score_dict[category] = None
score_pth = eval_file.replace('.xlsx', '_score.json')
dump(final_score_dict, score_pth)
return final_score_dict
def build_prompt(self, line):
ROOT = LMUDataRoot()
msgs = super().build_prompt(line)
for msg in msgs:
if msg['type'] == 'image':
msg['value'] = osp.join(osp.join(ROOT, 'images', self.dataset_name), msg['value'])
return msgs
class QSpatial(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'QSpatial_plus': '',
'QSpatial_scannet': ''
}
# NOTE: To evaluate Q-Spatial-ScanNet, you need to get the permission from ScanNet website
# Once you get the permission, you can use the helper code here to download and extract necessary images:
# https://github.com/andrewliao11/Q-Spatial-Bench-code?tab=readme-ov-file#for-qspatial_scannet
qspatial_root = "TO_BE_REPLACED_WITH_THE_PATH_TO_QSPATIAL_DATASET"
url = "https://raw.githubusercontent.com/andrewliao11/Q-Spatial-Bench-code/refs/heads/main/prompt_templates/"
def post_build(self, dataset):
# Download the prompt templates from github
links = [
self.url + "system_prompt.txt",
self.url + "spatial_prompt_single.txt",
self.url + "spatial_prompt_steps.txt",
self.url + "standard_prompt.txt",
self.url + "zero_shot_prompt.txt"
]
with tempfile.TemporaryDirectory() as temp_dir:
for link in links:
tgt_path = os.path.join(temp_dir, link.split("/")[-1])
os.system(f"wget {link} -O {tgt_path}")
self.system_prompt = open(os.path.join(temp_dir, "system_prompt.txt")).read()
self._prompt_templates = dict(
spatial_prompt_single=open(os.path.join(temp_dir, "spatial_prompt_single.txt")).read(),
spatial_prompt_steps=open(os.path.join(temp_dir, "spatial_prompt_steps.txt")).read(),
standard_prompt=open(os.path.join(temp_dir, "standard_prompt.txt")).read(),
zero_shot_prompt=open(os.path.join(temp_dir, "zero_shot_prompt.txt")).read(),
)
# Given one data record, return the built prompt (a multi-modal message), can override
def build_prompt(self, line):
from jinja2.sandbox import SandboxedEnvironment
text_prompt_template = self._prompt_templates["spatial_prompt_single"]
env = SandboxedEnvironment()
text_prompt = env.from_string(text_prompt_template).render(question=line["question"])
tgt_path = self.dump_image(line)
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=f"{self.system_prompt}\n{text_prompt}"))
return msgs
# Given the dataset name, return the dataset as a pandas dataframe, can override
def load_data(self, dataset):
import io
import pandas as pd
from datasets import load_dataset
hf_dataset = load_dataset("andrewliao11/Q-Spatial-Bench", split=dataset)
df = hf_dataset.to_pandas()
df.reset_index(drop=True, inplace=True)
df['index'] = df.index
df['answer'] = list(zip(df['answer_value'], df['answer_unit']))
df = df[['index'] + [col for col in df.columns if col != 'index']]
if dataset == "QSpatial_scannet":
df = df.drop(columns=["image"])
df["image"] = [Image.open(os.path.join(self.qspatial_root, image_path)) for image_path in df["image_path"]]
else:
df["image"] = [Image.open(io.BytesIO(image_dict["bytes"])) for image_dict in df["image"]]
df["image"] = [encode_image_to_base64(image) for image in df["image"]]
return df
@classmethod
def get_multiplier(self, unit):
unit = unit.lower()
if unit in ["meters", "meter", "m", "metre", "metres"]:
multiplier = 100
elif unit in ["centimeters", "centimeter", "cm"]:
multiplier = 1
elif unit in ["feet", "foot", "ft"]:
multiplier = 30.48
elif unit in ["inch", "inches", "in"]:
multiplier = 2.54
elif unit in ["mm"]:
multiplier = 0.1
else:
print(f"Unknown unit: {unit}")
multiplier = 0.
return multiplier
@classmethod
def parse_string(self, input_str):
# Regular expression to match the pattern (number or range, text)
match = re.match(r'\(([\d.-]+), (.+)\)', input_str)
if match:
number_part = match.group(1)
text = match.group(2)
if '-' in number_part:
start, end = map(float, number_part.split('-'))
number = (start + end) / 2
else:
number = float(number_part)
return number * self.get_multiplier(text)
else:
print(f"Unable to parse the input string {input_str}")
return 0
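# Example (following the rules above): parse_string("(1.5-2.5, meters)") averages the range to
# 2.0 and converts meters to centimeters, returning 200.0.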
@classmethod
def parse_prediction(self, vlm_response):
# Value
pattern = r'scalar{([^}]*)}'
str_inside_scalar_boxes = re.findall(pattern, vlm_response)[-1]
scalar_list = re.findall(r'\d+\.?\d*', str_inside_scalar_boxes)
parsed_scalar = np.array(scalar_list).astype(float).mean()
# Unit
pattern = r'distance_unit{([^}]*)}'
str_inside_unit_boxes = re.findall(pattern, vlm_response)
parsed_unit = str_inside_unit_boxes[-1]
pred_value_in_cms = parsed_scalar * self.get_multiplier(parsed_unit)
return pred_value_in_cms
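# Illustrative only (the exact response format comes from the downloaded prompt templates): a
# response containing "scalar{3 to 4}" and "distance_unit{feet}" parses to mean(3, 4) * 30.48 = 106.68 cm.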
# It returns a dictionary
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
data = load(eval_file)
if "model" in judge_kwargs:
from .utils.qspatial import QSpatial_auxeval
# extract using model
model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
model = build_judge(max_tokens=128, **judge_kwargs)
assert model.working(), ('Evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = track_progress_rich(
QSpatial_auxeval,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,
)
ans = load(tmp_file)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']
data['res'] = [ans[idx]['res'] for idx in data['index']]
data['log'] = [ans[idx]['log'] for idx in data['index']]
dump(data, storage)
data = load(storage)
pred_value_in_cms = []
for res in data["res"]:
try:
pred_value_in_cms.append(self.parse_string(res))
except ValueError:
pred_value_in_cms.append(0.)
pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8
else:
# regex parsing
pred_value_in_cms = []
n_errors_in_parsing = 0
for pred in data["prediction"]:
try:
parsed_value = self.parse_prediction(pred)
except IndexError:
n_errors_in_parsing += 1
parsed_value = 1e-8
pred_value_in_cms.append(parsed_value)
print(f"Encounter {n_errors_in_parsing} errors in parsing")
pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8
# Ground truth
ground_truth_value_in_cms = []
for answer in data["answer"]:
value, unit = eval(answer)
ground_truth_value_in_cms.append(value * self.get_multiplier(unit))
ground_truth_value_in_cms = np.array(ground_truth_value_in_cms) + 1e-8
# Calculate the score
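# A prediction counts as correct at threshold t when max(pred/gt, gt/pred) < t; delta_2 and
# delta_1_point_5 below correspond to t = 2 and t = 1.5.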
pred_gt = pred_value_in_cms / ground_truth_value_in_cms
gt_pred = ground_truth_value_in_cms / pred_value_in_cms
delta_2 = np.stack([pred_gt, gt_pred]).max(0) < 2.
delta_1_point_5 = np.stack([pred_gt, gt_pred]).max(0) < 1.5
data["eval_score_delta_2"] = delta_2
data["eval_score_delta_1_point_5"] = delta_1_point_5
final_score_dict = {
"delta_2": delta_2.mean(),
"delta_1_point_5": delta_1_point_5.mean()
}
for question_type in set(data["question_type"]):
filtered_data = data[data["question_type"] == question_type]
delta_2_per_question_type = filtered_data["eval_score_delta_2"].mean()
delta_1_point_5_per_question_type = filtered_data["eval_score_delta_1_point_5"].mean()
final_score_dict.update({f"{question_type}_delta_2": delta_2_per_question_type})
final_score_dict.update({f"{question_type}_delta_1_point_5": delta_1_point_5_per_question_type})
score_pth = eval_file.replace('.xlsx', '_score.json')
dump(final_score_dict, score_pth)
return final_score_dict
class MMNIAH(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MM_NIAH_VAL':
'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/MM_NIAH_VAL.tsv',
'MM_NIAH_TEST':
['https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-aa',
'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ab',
'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ac',
'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ad',
'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ae']}
DATASET_MD5 = {'MM_NIAH_VAL': '27e5a8c3cef7746cb38f89cd86c474c5',
'MM_NIAH_TEST': 'f490eb2a43096307465fe9e7ef13497c'}
def prepare_tsv(self, url, file_md5=None):
import os
data_root = LMUDataRoot()
os.makedirs(data_root, exist_ok=True)
update_flag = False
file_name = 'MM_NIAH_VAL.tsv' if 'MM_NIAH_VAL' in url else 'MM_NIAH_TEST.tsv'
data_path = osp.join(data_root, file_name)
if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
pass
elif file_name == 'MM_NIAH_TEST.tsv':
warnings.warn('The dataset tsv is not downloaded')
for i in range(len(url)):
if osp.exists(osp.join(data_root, 'part-a' + chr(ord('a') + i))):
print('part-a' + chr(ord('a') + i) + ' already exists')
continue
download_file(url[i], data_path)
file_prefix = 'part-'
output_file = data_path
split_files = sorted([f for f in os.listdir(data_root) if f.startswith(file_prefix)])
with open(output_file, 'wb') as outfile:
# Read each split part in order and append its bytes to the output file
for filename in split_files:
with open(osp.join(data_root, filename), 'rb') as infile:
outfile.write(infile.read())
update_flag = True
else:
warnings.warn('The dataset tsv is not downloaded')
download_file(url, data_path)
update_flag = True
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mmniah import is_correct
# find-image, count-text, find-text,
# infer-choose, count-image, visual-reasoning
MMNIAH_score = {
'count-text': 0,
'find-image': 0,
'find-text': 0,
'infer-choose': 0,
'count-image': 0,
'visual-reasoning': 0,
'total': 0,
}
MMNIAH_num = {
'count-text': 0,
'find-image': 0,
'find-text': 0,
'infer-choose': 0,
'count-image': 0,
'visual-reasoning': 0,
'total': 0,
}
final_score_dict = {
'count-text': 0,
'find-image': 0,
'find-text': 0,
'infer-choose': 0,
'count-image': 0,
'visual-reasoning': 0,
'total': 0,
}
data = load(eval_file)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
for i in tqdm(range(len(lines))):
line = lines[i]
predict = line['prediction']
answers = line['answer']
category = line['category']
if category in ['visual-reasoning', 'find-image']:
answers = int(answers)
if is_correct(answers, predict):
MMNIAH_score[category] += 1
MMNIAH_score['total'] += 1
MMNIAH_num[category] += 1
MMNIAH_num['total'] += 1
for category in ['find-image', 'count-text', 'find-text',
'infer-choose', 'count-image', 'visual-reasoning', 'total']:
if MMNIAH_num[category] != 0:
final_score_dict[category] = MMNIAH_score[category] / MMNIAH_num[category]
else:
final_score_dict[category] = None
score_pth = eval_file.replace('.xlsx', '_score.json')
dump(final_score_dict, score_pth)
return final_score_dict
def build_prompt(self, line):
msgs = super().build_prompt(line)
if isinstance(line, int):
line = self.data.iloc[line]
totalchoice = line['multi-choice options']
totalchoice = eval(totalchoice)
# find-image, count-text, find-text,
# infer-choose, count-image, visual-reasoning
context = msgs[-1]['value']
context = eval(context)
question = context[0] + '\n' + context[1]
# tgt_path is the list of all image paths
tgt_path = []
for i in range(len(msgs) - 1):
tgt_path.append(msgs[i]['value'])
choices = totalchoice[0]
choices_image = totalchoice[1]
if choices:
for c_idx, c in enumerate(choices):
question = f"{question}\n{chr(c_idx + ord('A'))}. {c}"
question += "\nAnswer with the option's letter from the given choices directly."
elif choices_image:
for c_idx in range(len(choices_image)):
question = f"{question}\n{chr(c_idx + ord('A'))}. <image>"
question += "\nAnswer with the option's letter from the given choices directly."
else:
question += '\nAnswer the question using a single word or phrase.'
question = '<start>' + question + '<end>'
question = question.split('<image>')
if choices_image:
for i in range(len(question) - 5):
question[i] = question[i] + '\n<image>'
for i in range(len(question) - 5, len(question) - 1):
question[i] = question[i] + '<image>'
else:
for i in range(len(question) - 1):
question[i] = question[i] + '\n<image>'
assert len(tgt_path) + 1 == len(question)
context = []
for i in range(len(tgt_path)):
context.append(question[i])
context.append(tgt_path[i])
context.append(question[-1])
context[0] = context[0][7:]
context[-1] = context[-1][:-5]
msgs = []
for i in range(len(context)):
if i % 2 == 0:
msgs.append(dict(type='text', value=context[i]))
else:
ROOT = LMUDataRoot()
msgs.append(dict(type='image', value=osp.join(osp.join(ROOT, 'images', self.dataset_name), context[i])))
msgs = [m for m in msgs if m['value'] != '']  # filter once instead of removing while iterating
return msgs
from ..smp import *
from ..utils import *
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
class ImageYORNDataset(ImageBaseDataset):
TYPE = 'Y/N'
DATASET_URL = {
'MME': 'https://opencompass.openxlab.space/utils/VLMEval/MME.tsv',
'HallusionBench': 'https://opencompass.openxlab.space/utils/VLMEval/HallusionBench.tsv',
'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv',
'AMBER': 'https://huggingface.co/datasets/yifanzhang114/AMBER_base64/resolve/main/AMBER.tsv',
}
DATASET_MD5 = {
'MME': 'b36b43c3f09801f5d368627fb92187c3',
'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c',
'POPE': 'c12f5acb142f2ef1f85a26ba2fbe41d5',
'AMBER': '970d94c0410916166e0a76ba75da7934',
}
# It returns a dataframe
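# Flow: rule-based YOrN_Extraction first; samples still 'Unknown' are optionally re-extracted by
# an LLM judge (when configured and working), then scored by exact match and aggregated with a
# dataset-specific rating function (MME / Hallusion / POPE / AMBER / default).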
def evaluate(self, eval_file, **judge_kwargs):
from .utils.yorn import YOrN_Extraction, YOrN_auxeval
from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating, AMBER_rating
dataset = self.dataset_name
data = load(eval_file)
data['prediction'] = [str(x) for x in data['prediction']]
storage = eval_file.replace('.xlsx', '_auxmatch.xlsx')
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])}
if osp.exists(tmp_file):
tmp = load(tmp_file)
for k in tmp:
if ans_map[k] == 'Unknown' and tmp[k] != 'Unknown':
ans_map[k] = tmp[k]
data['extracted'] = [ans_map[x] for x in data['index']]
unknown = data[data['extracted'] == 'Unknown']
model = judge_kwargs.get('model', 'exact_matching')
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
model = None
warnings.warn('OPENAI_API_KEY is not working properly, will use exact matching for evaluation')
if model is not None:
lt = len(unknown)
lines = [unknown.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = list(unknown['index'])
if len(tups):
res = track_progress_rich(
YOrN_auxeval, tups, nproc=nproc, chunksize=nproc, keys=indices, save=tmp_file)
for k, v in zip(indices, res):
ans_map[k] = v
data['extracted'] = [ans_map[x] for x in data['index']]
dump(data, storage)
data = load(storage)
if listinstr(['AMBER'], dataset):
data['score'] = (data['answer'].str.lower() == data['extracted'].str.lower())
else:
data['score'] = (data['answer'] == data['extracted'])
dump(data, storage)
if dataset is not None and listinstr(['MME'], dataset):
score = MME_rating(storage)
elif dataset is not None and listinstr(['Hallusion'], dataset):
score = Hallusion_rating(storage)
elif dataset is not None and listinstr(['POPE'], dataset):
score = POPE_rating(storage)
elif dataset is not None and listinstr(['AMBER'], dataset):
score = AMBER_rating(storage)
else:
score = default_rating(storage)
score_tgt = eval_file.replace('.xlsx', '_score.csv')
dump(score, score_tgt)
return score
from huggingface_hub import snapshot_download
from ..smp import *
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from glob import glob
FAIL_MSG = 'Failed to obtain answer via API.'
def timestamp_to_seconds(timestamp):
# Split the timestamp into hours, minutes, and seconds
h, m, s = timestamp.split(":")
# Convert hours and minutes to seconds and add the (possibly fractional) seconds part
total_seconds = int(h) * 3600 + int(m) * 60 + float(s)
return total_seconds
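# e.g. timestamp_to_seconds("01:02:03.5") == 3723.5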
def uniformly_subsample(lst, K):
n = len(lst)
if K >= n:
return lst
step = n / K
return [lst[int(i * step)] for i in range(K)]
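# e.g. uniformly_subsample(list(range(10)), 4) == [0, 2, 5, 7] (evenly spaced, keeps the first element)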
def insert_subtitles_into_frames(
frames,
frame_timestamps,
subtitles,
starting_timestamp_for_subtitles,
duration,
):
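# Interleave frames with subtitle text in temporal order: frames whose timestamp is at or before a
# subtitle's midpoint are emitted first, the subtitle line is kept only if at least one frame falls
# inside its (at least 1s wide) time window, and any remaining frames are appended at the end.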
interleaved_list = []
cur_i = 0
for subtitle in subtitles:
if "timestamp" in subtitle:
start, end = subtitle["timestamp"]
if not isinstance(end, float):
end = duration
start -= starting_timestamp_for_subtitles
end -= starting_timestamp_for_subtitles
subtitle_timestamp = (start + end) / 2
subtitle_text = subtitle["text"]
else:
start, end = subtitle["start"], subtitle["end"]
start = timestamp_to_seconds(start)
end = timestamp_to_seconds(end)
start -= starting_timestamp_for_subtitles
end -= starting_timestamp_for_subtitles
subtitle_timestamp = (start + end) / 2
subtitle_text = subtitle["line"]
for i, (frame, frame_timestamp) in enumerate(
zip(frames[cur_i:], frame_timestamps[cur_i:])
):
if frame_timestamp <= subtitle_timestamp:
# print("frame:", frame_timestamp)
interleaved_list.append({"type": "image", "value": frame})
cur_i += 1
else:
break
if end - start < 1:
end = subtitle_timestamp + 0.5
start = subtitle_timestamp - 0.5
covering_frames = False
for frame, frame_timestamp in zip(frames, frame_timestamps):
if frame_timestamp < end and frame_timestamp > start:
covering_frames = True
break
if covering_frames:
interleaved_list.append({"type": "text", "value": subtitle_text + "\n"})
else:
pass
for i, (frame, frame_timestamp) in enumerate(
zip(frames[cur_i:], frame_timestamps[cur_i:])
):
interleaved_list.append({"type": "image", "value": frame})
return interleaved_list
class LongVideoBench(VideoBaseDataset):
MD5 = '82905eae3a5ae7383c5a8ee9655e1ab9'
SYS = ''
TYPE = 'Video-MCQ'
def __init__(self, dataset='LongVideoBench', use_subtitle=False, nframe=0, fps=-1):
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
self.use_subtitle = use_subtitle
self.dataset_name = dataset
@classmethod
def supported_datasets(cls):
return ['LongVideoBench']
def prepare_dataset(self, dataset_name='LongVideoBench', repo_id='longvideobench/LongVideoBench'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if not osp.exists(data_file):
return False
if md5(data_file) != self.MD5:
print("md5 mismatch", md5(data_file), self.MD5)
return False
data = load(data_file)
for video_pth in data['video_path']:
if not osp.exists(osp.join(pth, video_pth)):
print(video_pth, "is not found")
return False
return True
if modelscope_flag_set():
repo_id = "AI-ModelScope/LongVideoBench"
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def generate_tsv(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if osp.exists(data_file) and md5(data_file) == self.MD5:
return
data_file = pd.read_json(osp.join(pth, 'lvb_val.json'))
data_file = data_file.assign(index=range(len(data_file)))
data_file['video'] = data_file['video_id']
data_file['video_path'] = data_file['video_path'].apply(lambda x: f'./videos/{x}')
data_file.to_csv(osp.join(pth, f'{dataset_name}.tsv'), sep='\t', index=False)
if modelscope_flag_set():
from modelscope import dataset_snapshot_download
dataset_snapshot_download(dataset_id=repo_id)
else:
snapshot_download(repo_id=repo_id, repo_type='dataset')
print("All videos are downloaded for LongVideoBench")
if not glob(osp.join(cache_path, "videos")):
tar_files = glob(osp.join(cache_path, "**/*.tar*"), recursive=True)
def untar_video_data(tar_file, cache_dir):
import tarfile
with tarfile.open(tar_file, "r") as tar_ref:
tar_ref.extractall(cache_dir)
print(f"Extracted all files from {tar_file} to {cache_dir}")
def concat_tar_parts(tar_parts, output_tar):
with open(output_tar, "wb") as out_tar:
from tqdm import tqdm
for part in tqdm(sorted(tar_parts)):
with open(part, "rb") as part_file:
out_tar.write(part_file.read())
print(f"Concatenated parts {tar_parts} into {output_tar}")
tar_parts_dict = {}
# Group tar parts together
for tar_file in tar_files:
base_name = tar_file.split(".tar")[0]
if base_name not in tar_parts_dict:
tar_parts_dict[base_name] = []
tar_parts_dict[base_name].append(tar_file)
# Concatenate and untar split parts
for base_name, parts in tar_parts_dict.items():
print(f"Extracting following tar files: {parts}")
output_tar = base_name + ".tar"
if not osp.exists(output_tar):
print('Start concatenating tar files')
concat_tar_parts(parts, output_tar)
print('Finish concatenating tar files')
if not osp.exists(osp.join(cache_path, osp.basename(base_name))):
untar_video_data(output_tar, cache_path)
print('All videos are extracted for LongVideoBench')
dataset_path = cache_path
generate_tsv(dataset_path)
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
return dict(data_file=data_file, root=dataset_path)
def save_video_frames(self, video_path, video_llm=False):
vid_path = osp.join(self.data_root, video_path)
vid = decord.VideoReader(vid_path)
video_info = {
'fps': vid.get_avg_fps(),
'n_frames': len(vid),
}
if self.nframe > 0 and self.fps < 0:
step_size = len(vid) / (self.nframe + 1)
indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
frame_paths = self.frame_paths(video_path[:-4])
elif self.fps > 0:
# not constrained by num_frames, get frames by fps
total_duration = video_info['n_frames'] / video_info['fps']
required_frames = int(total_duration * self.fps)
step_size = video_info['fps'] / self.fps
indices = [int(i * step_size) for i in range(required_frames)]
frame_paths = self.frame_paths_fps(video_path[:-4], len(indices))
flag = np.all([osp.exists(p) for p in frame_paths])
if not flag:
images = [vid[i].asnumpy() for i in indices]
images = [Image.fromarray(arr) for arr in images]
for im, pth in zip(images, frame_paths):
if not osp.exists(pth) and not video_llm:
im.save(pth)
return frame_paths, indices, video_info
# def save_video_into_images(self, line, num_frames=8):
# frame_paths, indices, video_info = self.save_video_frames(line['video_path'], num_frames)
# return frame_paths
def build_prompt(self, line, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
frames, indices, video_info = self.save_video_frames(line['video_path'], video_llm)
fps = video_info["fps"]
message = [dict(type='text', value=self.SYS)]
if video_llm:
message.append(dict(type='video', value=osp.join(self.data_root, line['video_path'])))
else:
if self.use_subtitle:
with open(osp.join(self.data_root, "subtitles", line["subtitle_path"])) as f:
subtitles = json.load(f)
frame_message = insert_subtitles_into_frames(
frames,
[ind_ / fps for ind_ in indices],
subtitles,
line["starting_timestamp_for_subtitles"],
line["duration"]
)
message += frame_message
else:
for im in frames:
message.append(dict(type='image', value=im))
line['question'] += '\n' + '\n'.join(
["{}. {}".format(chr(ord("A") + i), cand) for i, cand in enumerate(eval(line['candidates']))]
)
prompt = line["question"] + "\nAnswer with the option's letter from the given choices directly."
message.append(dict(type='text', value=prompt))
return message
# It returns a dictionary
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.longvideobench import get_dimension_rating, extract_characters_regex, extract_option
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
tgt_file = eval_file.replace('.xlsx', '_rating.json')
score_file = eval_file.replace('.xlsx', '_score.xlsx')
if not osp.exists(score_file):
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if FAIL_MSG not in v}
data = load(eval_file)
data_un = data[~pd.isna(data['prediction'])]
for idx in data['index']:
ans = data.loc[data['index'] == idx, 'correct_choice'].values[0]
ans = chr(ord("A") + ans)
pred = str(data.loc[data['index'] == idx, 'prediction'].values[0])
if extract_characters_regex(pred) == '':
extract_pred = extract_option(
model,
data.loc[data['index'] == idx].to_dict(orient='records')[0],
'LongVideoBench'
)
data.loc[idx, 'score'] = int(extract_pred == ans)
else:
data.loc[idx, 'score'] = int(extract_characters_regex(pred) == ans)
rejected = [x for x in data['score'] if x == -1]
print(
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
f'failed to obtain the score for another {len(rejected)} questions. '
f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
)
dump(data, score_file)
rating = get_dimension_rating(score_file)
dump(rating, tgt_file)
return rating
import json
import os
import pandas as pd
from .image_base import ImageBaseDataset
from ..smp import *
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
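# Build the grading prompt sent to the judge model for one MIA-Bench sample: it enumerates the
# weighted instruction components and asks for per-component scores plus a total out of 10 in the
# fixed "score of component 1: x/2, ..., total score: z/10" format that process_rawscore parses.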
def generate_prompt(d):
question = d['question']
weights = eval(d['component_weight'])
components = eval(d['components'])
num_of_component = int(d['num_of_component'])
response = d['prediction']
if num_of_component == 1:
components = f"The first component is: '{components[0]}'. "
score = f"The first component is worth: {weights[0]} scores. "
elif num_of_component == 2:
components = f"The first component is: '{components[0]}', and the second component is '{components[1]}'. "
score = f"The first and second component is each worth {weights[0]} and {weights[1]} scores. "
elif num_of_component == 3:
components = (
f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
f"and the third component is '{components[2]}'. "
)
score = (
"The first, second, and third component is each worth "
f"{weights[0]}, {weights[1]}, and {weights[2]} scores."
)
elif num_of_component == 4:
components = (
f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
f"and the third component is '{components[2]}', and the fourth component is '{components[3]}'. "
)
score = (
"The first, second, third, and fourth component is each worth "
f"{weights[0]}, {weights[1]}, {weights[2]}, and {weights[3]} scores."
)
elif num_of_component == 5:
components = (
f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
f"and the third component is '{components[2]}', and the fourth component is '{components[3]}', "
f"and the fifth component is '{components[4]}'. "
)
score = (
"The first, second, third, fourth, and fifth component is each worth "
f"{weights[0]}, {weights[1]}, {weights[2]}, {weights[3]}, and {weights[4]} scores."
)
return (
"Here is an instruction for a multimodal LLM: '"
f"{question}"
"'. You need to grade if the response from the model follows each component of the instruction. "
f"{components}"
"The response is: '"
f"{response}"
"'. You need to score the response and be strict. The total score ranges from 0 to 10, "
"depending on if the response follows the instruction. "
f"{score}"
"List scores of each component, and the total score in one sentence in this format: "
"score of component 1: x/2, score of component 2: y/8, total score: z/10. Then explain your reasons."
)
def process_rawscore(component_type, raw_score):
first_sentence = raw_score.split('.')[0].split(',')
score_dict = {}
for i in range(len(first_sentence) - 1):
score_ = first_sentence[i].split(':')[1][1:].split('/')
score = int(score_[0]) / int(score_[1])
score_dict[component_type[i]] = score
total_score_ = first_sentence[i + 1].split(':')[1][1:].split('/')
total_score = int(total_score_[0]) / int(total_score_[1])
score_dict['total_score'] = total_score
return score_dict
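# Example (hypothetical component names): with component_type == ['comp A', 'comp B'] and
# raw_score == 'score of component 1: 2/2, score of component 2: 6/8, total score: 8/10. ...',
# this returns {'comp A': 1.0, 'comp B': 0.75, 'total_score': 0.8}.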
def get_score_dict(data, score_raw):
cat_score_dict = {}
for i in range(len(data)):
try:
cmp = data['component_type'][i][2:-2]
cmp_list = cmp.split('\', \'')
score_dict = process_rawscore(cmp_list, score_raw[i])
for key, val in score_dict.items():
if key not in cat_score_dict.keys():
cat_score_dict[key] = [val]
else:
cat_score_dict[key].append(val)
except Exception:  # skip samples whose raw score string cannot be parsed
pass
cat_score_dict_average = {}
for key, val in cat_score_dict.items():
cat_score_dict_average[key] = sum(val) / len(val)
return cat_score_dict_average
class MIABench(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MIA-Bench': 'https://opencompass.openxlab.space/utils/VLMEval/Mia-Bench.tsv',
}
DATASET_MD5 = {
'MIA-Bench': '0b9de595f4dd40af18a69b94d89aba82',
}
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
judge_name = judge_kwargs.pop('model', 'gpt-4o')
model = build_judge(model=judge_name, **judge_kwargs)
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(storage):
data = load(eval_file)
num_samples = len(data)
lines = [data.loc[i] for i in range(num_samples)]
prompts = [generate_prompt(line) for line in lines]
org_data = MIABench('MIA-Bench').data
img_map = {x: y for x, y in zip(org_data['index'], org_data['image'])}
image_b64 = [img_map[idx] for idx in data['index']]
indices = list(data['index'])
mm_messages = [
dict(message=[
dict(type='text', value=prompt),
dict(type='image', value=f'data:image/jpeg;base64,{b64}')
])
for prompt, b64 in zip(prompts, image_b64)
]
res = {}
if osp.exists(tmp_file):
res = load(tmp_file)
jobs = {k: v for k, v in zip(indices, mm_messages) if k not in res}
job_keys = list(jobs.keys())
job_vals = [jobs[k] for k in job_keys]
resps = track_progress_rich(
model.generate,
job_vals,
nproc=nproc,
chunksize=nproc,
keys=job_keys,
save=tmp_file,
)
for k, resp in zip(job_keys, resps):
res[k] = resp
data['score_raw'] = [res[idx] for idx in indices]
dump(data, storage)
goresult = load(storage)
results = get_score_dict(goresult, goresult['score_raw'])
result_pth = storage.replace('.xlsx', '_score.csv')
results_pd = pd.DataFrame.from_dict(list(results.items()))
dump(results_pd, result_pth)
return results
import huggingface_hub
from huggingface_hub import snapshot_download
from ..smp import *
from .video_concat_dataset import ConcatVideoDataset
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
import torchvision.transforms as T
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from decord import VideoReader, cpu
import pandas as pd
import imageio
import cv2
import zipfile
import os
import glob
from .utils.mlvu import *
FAIL_MSG = 'Failed to obtain answer via API.'
class MLVU(ConcatVideoDataset):
def __init__(self, dataset='MLVU', nframe=0, fps=-1):
self.DATASET_SETS[dataset] = ['MLVU_MCQ', 'MLVU_OpenEnded']
self.type_data_dict = {
'M-Avg':['plotQA', 'needle', 'ego', 'count', 'anomaly_reco', 'topic_reasoning'],
'G-Avg':['sub_scene', 'summary']
}
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
@classmethod
def supported_datasets(cls):
return ['MLVU']
def evaluate(self, eval_file, **judge_kwargs):
result = super().evaluate(eval_file=eval_file, **judge_kwargs)
suffix = eval_file.split('.')[-1]
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
for key in self.type_data_dict:
result.loc[key] = 0.0
for name, item in result.iterrows():
if name in self.type_data_dict[key]:
result.loc[key, 'success'] += item['success']
result.loc[key, 'overall'] += item['overall']
if key == 'G-Avg':
result.loc[key, 'acc'] = round(
result.loc[key, 'success'] / result.loc[key, 'overall'], 2
)
else:
result.loc[key, 'acc'] = round(
result.loc[key, 'success'] / result.loc[key, 'overall'] * 100, 1
)
result = result.reset_index().rename(columns={'index': 'task'})
dump(result, score_file)
return result
class MLVU_MCQ(VideoBaseDataset):
MD5 = 'bb5c37e7cf8d43fc9a25c23d2b4633f5'
BASE_SYS = 'Carefully watch this video and pay attention to every detail. '
SYS = BASE_SYS + 'Based on your observations, select the best option that accurately addresses the question.'
TYPE = 'Video-MCQ'
def __init__(self, dataset='MLVU_MCQ', nframe=0, fps=-1):
self.type_data_list = {
'plotQA': ('1_plotQA.json', './MLVU/video/1_plotQA', 'MCQ'),
'needle': ('2_needle.json', './MLVU/video/2_needle', 'MCQ'),
'ego': ('3_ego.json', './MLVU/video/3_ego', 'MCQ'),
'count': ('4_count.json', './MLVU/video/4_count', 'MCQ'),
'order': ('5_order.json', './MLVU/video/5_order', 'MCQ'),
'anomaly_reco': ('6_anomaly_reco.json', './MLVU/video/6_anomaly_reco', 'MCQ'),
'topic_reasoning': ('7_topic_reasoning.json', './MLVU/video/7_topic_reasoning', 'MCQ'),
}
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
@classmethod
def supported_datasets(cls):
return ['MLVU_MCQ']
def prepare_dataset(self, dataset_name='MLVU_MCQ', repo_id='MLVU/MVLU'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if not os.path.exists(data_file):
return False
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for idx, item in data.iterrows():
if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
return False
return True
if modelscope_flag_set():
repo_id = "AI-ModelScope/MLVU"
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def generate_tsv(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if os.path.exists(data_file) and md5(data_file) == self.MD5:
return
json_data_dir = os.path.join(dataset_path, 'MLVU', 'json')
self.data_list = []
for k, v in self.type_data_list.items():
with open(os.path.join(json_data_dir, v[0]), 'r') as f:
json_data = json.load(f)
for data in json_data:
self.data_list.append({
'task_type': k,
'prefix': v[1],
'duration': data['duration'],
'video': data['video'],
'question': data['question'],
'answer': data['answer'],
'candidates': data['candidates'],
})
data_df = pd.DataFrame(self.data_list)
data_df = data_df.assign(index=range(len(data_df)))
data_df.to_csv(data_file, sep='\t', index=False)
if modelscope_flag_set():
from modelscope import dataset_snapshot_download
dataset_path = dataset_snapshot_download(dataset_id=repo_id)
else:
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
huggingface_hub.login(hf_token)
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
generate_tsv(dataset_path)
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
return dict(root=dataset_path, data_file=data_file)
def qa_template(self, data):
question = f"Question: {data['question']}\n"
question += 'Options:\n'
answer = data['answer']
answer_idx = -1
for idx, c in enumerate(eval(data['candidates'])):
question += f"({chr(ord('A') + idx)}) {c}\n"
if c == answer:
answer_idx = idx
question = question.rstrip()
answer = f"({chr(ord('A') + answer_idx)}) {answer}"
return question, answer
def save_video_frames(self, line):
suffix = line['video'].split('.')[-1]
video = line['video'].replace(f'.{suffix}','')
vid_path = osp.join(self.data_root, line['prefix'], line['video'])
vid = decord.VideoReader(vid_path)
video_info = {
'fps': vid.get_avg_fps(),
'n_frames': len(vid),
}
if self.nframe > 0 and self.fps < 0:
step_size = len(vid) / (self.nframe + 1)
indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
frame_paths = self.frame_paths(video)
elif self.fps > 0:
# not constrained by num_frames, get frames by fps
total_duration = video_info['n_frames'] / video_info['fps']
required_frames = int(total_duration * self.fps)
step_size = video_info['fps'] / self.fps
indices = [int(i * step_size) for i in range(required_frames)]
frame_paths = self.frame_paths_fps(video, len(indices))
flag = np.all([osp.exists(p) for p in frame_paths])
if not flag:
images = [vid[i].asnumpy() for i in indices]
images = [Image.fromarray(arr) for arr in images]
for im, pth in zip(images, frame_paths):
if not osp.exists(pth):
im.save(pth)
return frame_paths
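    # Frames are sampled either uniformly (nframe > 0 and fps < 0: nframe frames at
    # evenly spaced positions, excluding the endpoints) or at a fixed rate (fps > 0);
    # exactly one of the two modes is expected to be active when this method is called.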
def save_video_into_images(self, line):
frame_paths = self.save_video_frames(line)
return frame_paths
def build_prompt(self, line, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
question, answer = self.qa_template(line)
message = [dict(type='text', value=self.SYS, role='system')]
message.append(dict(type='text', value=question))
video_path = os.path.join(self.data_root, line['prefix'], line['video'])
if video_llm:
message.append(dict(type='video', value=video_path))
else:
img_frame_paths = self.save_video_into_images(line)
for im in img_frame_paths:
message.append(dict(type='image', value=im))
message.append(dict(type='text', value='\nOnly give the best option.'))
return message
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
score_file = eval_file.replace('.xlsx', '_score.xlsx')
if not osp.exists(score_file):
model = judge_kwargs.setdefault('model', 'chatgpt-0125')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if FAIL_MSG not in v}
data = load(eval_file)
data_un = data[~pd.isna(data['prediction'])]
for idx in data['index']:
ans = data.loc[data['index'] == idx, 'answer'].values[0]
pred = data.loc[data['index'] == idx, 'prediction'].values[0]
options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
answer_idx = -1
for id, c in enumerate(options):
if c == ans:
answer_idx = id
ans = f"({chr(ord('A') + answer_idx)}) {ans}"
input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
for id, option_content in enumerate(eval(input_item['candidates'])):
input_item[chr(ord('A') + id)] = option_content
if option_content == input_item['answer']:
input_item['answer'] = chr(ord('A') + id)
if FAIL_MSG in pred:
data.loc[idx, 'score'] = -1
else:
data.loc[idx, 'score'] = int(check_ans_with_model(
pred, ans, model,
input_item,
'MLVU_MCQ'
))
rejected = [x for x in data['score'] if x == -1]
print(
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
f'failed to obtain the score for another {len(rejected)} questions. '
f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
)
dump(data, score_file)
rating = get_dimension_rating(score_file)
return rating
class MLVU_OpenEnded(VideoBaseDataset):
MD5 = 'cee573a3627c6ac434ded704c60511ba'
BASE_SYS = 'Carefully watch this video and pay attention to every detail. '
SYS = BASE_SYS + 'Based on your observations, answer the given questions.'
TYPE = 'Video-VQA'
def __init__(self, dataset='MLVU_OpenEnded', nframe=0, fps=-1):
self.type_data_list = {
'sub_scene': ('8_sub_scene.json', './MLVU/video/8_sub_scene', 'VQA'),
'summary': ('9_summary.json', './MLVU/video/9_summary', 'VQA')
}
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
@classmethod
def supported_datasets(cls):
return ['MLVU_OpenEnded']
def prepare_dataset(self, dataset_name='MLVU_OpenEnded', repo_id='MLVU/MVLU'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if not os.path.exists(data_file):
return False
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for idx, item in data.iterrows():
if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
return False
return True
if modelscope_flag_set():
repo_id = "AI-ModelScope/MLVU"
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def generate_tsv(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if os.path.exists(data_file) and md5(data_file) == self.MD5:
return
json_data_dir = os.path.join(dataset_path, 'MLVU', 'json')
self.data_list = []
for k, v in self.type_data_list.items():
with open(os.path.join(json_data_dir, v[0]), 'r') as f:
json_data = json.load(f)
for data in json_data:
self.data_list.append({
'task_type': k,
'prefix': v[1],
'duration': data['duration'],
'video': data['video'],
'question': data['question'],
'answer': data['answer'],
'scoring_points': data['scoring_points'] if 'scoring_points' in data else ''
})
data_df = pd.DataFrame(self.data_list)
data_df = data_df.assign(index=range(len(data_df)))
data_df.to_csv(data_file, sep='\t', index=False)
if modelscope_flag_set():
from modelscope import dataset_snapshot_download
dataset_path = dataset_snapshot_download(dataset_id=repo_id)
else:
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
huggingface_hub.login(hf_token)
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
generate_tsv(dataset_path)
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
return dict(root=dataset_path, data_file=data_file)
def qa_template(self, data):
question = f"{data['question']}"
answer = data['answer']
return question, answer
def save_video_frames(self, line):
suffix = line['video'].split('.')[-1]
video = line['video'].replace(f'.{suffix}','')
vid_path = osp.join(self.data_root, line['prefix'], line['video'])
vid = decord.VideoReader(vid_path)
video_info = {
'fps': vid.get_avg_fps(),
'n_frames': len(vid),
}
if self.nframe > 0 and self.fps < 0:
step_size = len(vid) / (self.nframe + 1)
indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
frame_paths = self.frame_paths(video)
elif self.fps > 0:
# not constrained by num_frames, get frames by fps
total_duration = video_info['n_frames'] / video_info['fps']
required_frames = int(total_duration * self.fps)
step_size = video_info['fps'] / self.fps
indices = [int(i * step_size) for i in range(required_frames)]
frame_paths = self.frame_paths_fps(video, len(indices))
flag = np.all([osp.exists(p) for p in frame_paths])
if not flag:
images = [vid[i].asnumpy() for i in indices]
images = [Image.fromarray(arr) for arr in images]
for im, pth in zip(images, frame_paths):
if not osp.exists(pth):
im.save(pth)
return frame_paths
def save_video_into_images(self, line):
frame_paths = self.save_video_frames(line)
return frame_paths
def build_prompt(self, line, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
question, answer = self.qa_template(line)
message = [dict(type='text', value=self.SYS, role='system')]
message.append(dict(type='text', value=question))
video_path = os.path.join(self.data_root, line['prefix'], line['video'])
if video_llm:
message.append(dict(type='video', value=video_path))
else:
img_frame_paths = self.save_video_into_images(line)
for im in img_frame_paths:
message.append(dict(type='image', value=im))
return message
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
        model = judge_kwargs.setdefault('model', 'gpt-4-0125')
        if model != 'gpt-4-0125':
            print('MLVU OpenEnded evaluation defaults to gpt-4-0125, so the judge model is switched to gpt-4-0125.')
            judge_kwargs['model'] = 'gpt-4-0125'
suffix = eval_file.split('.')[-1]
score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(score_file):
data = load(eval_file)
model_dict = {
'sub_scene': build_judge(system_prompt=system_prompt_sub_scene, **judge_kwargs),
'summary': build_judge(system_prompt=system_prompt_summary, **judge_kwargs)
}
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model_dict[line['task_type']], line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
_ = track_progress_rich(
MLVU_OpenEnded_generate,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,
)
ans = load(tmp_file)
data = MLVU_OpenEnded_extract(ans, data)
dump(data, score_file)
rating = get_dimension_rating(score_file)
return rating
from huggingface_hub import snapshot_download
from ..smp import *
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
FAIL_MSG = 'Failed to obtain answer via API.'
def unwrap_hf_pkl(pth, suffix='.mp4'):
base_dir = os.path.join(pth, 'video_pkl/')
target_dir = os.path.join(pth, 'video/')
pickle_files = [os.path.join(base_dir, file) for file in os.listdir(base_dir)]
pickle_files.sort()
if not os.path.exists(target_dir):
os.makedirs(target_dir, exist_ok=True)
for pickle_file in pickle_files:
with open(pickle_file, 'rb') as file:
video_data = pickle.load(file)
# For each video file in the pickle file, write its contents to a new mp4 file
for video_name, video_content in video_data.items():
output_path = os.path.join(target_dir, f'{video_name}{suffix}')
with open(output_path, 'wb') as output_file:
output_file.write(video_content)
print('The video file has been restored and stored from the pickle file.')
else:
print('The video file already exists.')
class MMBenchVideo(VideoBaseDataset):
MD5 = '98f7df3eb1007fc375ea6fe88a98e2ff'
SYS = 'You are an AI assistant responsible for answering questions about videos.'
FRAMES_TMPL_PACK = """
You will be provided with {} separate frames uniformly sampled from a video, \
the frames are provided in chronological order of the video.
Please analyze these images and provide the answer / answers to the \
following question / questions about the video content.
If multiple questions are provided (with indices I1, I2, I3, ...), \
you should organize your answers in the following json format:
{{
'I1': 'Answer to Question I1',
'I2': 'Answer to Question I2',
...
}}
Otherwise, please directly reply with your response to the only question.
Even if the information in these separate frames is not enough to give an answer,
PLEASE GIVE A RESPONSE TO EACH OF THE QUESTIONS IN THE FORMAT DESCRIBED ABOVE.
"""
FRAMES_TMPL_NOPACK = """
You will be provided with {} separate frames uniformly sampled from a video, \
the frames are provided in chronological order of the video.
Please analyze these images and provide the answer to the question about the video content.
Please directly reply with your response to the only question.
"""
TYPE = 'Video-VQA'
def __init__(self, dataset='MMBench-Video', pack=False, nframe=0, fps=-1):
super().__init__(dataset=dataset, pack=pack, nframe=nframe, fps=fps)
@classmethod
def supported_datasets(cls):
return ['MMBench-Video']
def prepare_dataset(self, dataset_name='MMBench-Video', repo_id='opencompass/MMBench-Video'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for video_pth in data['video_path']:
if not osp.exists(osp.join(pth, video_pth)):
return False
return True
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
if modelscope_flag_set():
from modelscope import dataset_snapshot_download
dataset_path = dataset_snapshot_download(dataset_id=repo_id)
else:
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
unwrap_hf_pkl(dataset_path)
self.video_path = osp.join(dataset_path, 'video/')
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
return dict(data_file=data_file, root=osp.join(dataset_path, 'video'))
def build_prompt_pack(self, line):
if isinstance(line, int):
assert line < len(self)
video = self.videos[line]
elif isinstance(line, pd.Series):
video = line['video']
elif isinstance(line, str):
video = line
frames = self.save_video_frames(video)
sub = self.data[self.data['video'] == video]
sys_prompt = self.SYS + self.FRAMES_TMPL_PACK.format(len(frames))
message = [dict(type='text', value=sys_prompt)]
for im in frames:
message.append(dict(type='image', value=im))
nq = len(sub)
prompt = 'Questions: \n{}\nAnswers: \n'
qs = {int(sub.iloc[i]['index']): sub.iloc[i]['question'] for i in range(nq)}
prompt = prompt.format(json.dumps(qs))
message.append(dict(type='text', value=prompt))
return message
def build_prompt_nopack(self, line, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
if video_llm:
question = line['question']
prefix, video_idx_path = os.path.split(line['video_path'])
message = [dict(type='text', value=question)]
message.append(dict(type='video', value=os.path.join(self.video_path, video_idx_path)))
return message
else:
frames = self.save_video_frames(line['video'])
sys_prompt = self.FRAMES_TMPL_NOPACK.format(len(frames))
message = [dict(type='text', value=sys_prompt)]
for im in frames:
message.append(dict(type='image', value=im))
prompt = 'Question: {}\nAnswer: '.format(line['question'])
message.append(dict(type='text', value=prompt))
return message
def build_prompt(self, line, video_llm):
if self.pack and not video_llm:
return self.build_prompt_pack(line)
else:
return self.build_prompt_nopack(line, video_llm)
@staticmethod
def remove_side_quote(s, syms=[',', '"', "'"]):
if np.all([x in syms for x in s]):
return ''
while s[0] in syms:
s = s[1:]
while s[-1] in syms:
s = s[:-1]
return s
@staticmethod
def robust_json_load(s):
try:
jsons = list(extract_json_objects(s))
assert len(jsons) == 1
return jsons[0]
except:
if '{' in s and s.find('{') == s.rfind('{'):
sub_str = s[s.find('{') + 1:].strip()
lines = sub_str.split('\n')
res = {}
for l in lines:
l = l.strip()
if ': ' in l:
key = l.split(': ')[0].strip()
val = l.split(': ')[1].strip()
key = MMBenchVideo.remove_side_quote(key)
val = MMBenchVideo.remove_side_quote(val)
if len(key) and len(val):
res[key] = val
return res
return None
def load_pack_answers(self, data_raw):
vstats = defaultdict(lambda: 0)
data = defaultdict(lambda: {})
for k in data_raw:
ans = data_raw[k].strip()
if FAIL_MSG in ans:
vstats['GEN_FAIL'] += 1
continue
res = self.robust_json_load(ans)
if res is not None:
data[k] = res
vstats['PARSE_OK'] += 1
else:
vstats['PARSE_FAIL'] += 1
# return data
meta = cp.deepcopy(self.data)
lt = len(meta)
prediction = []
for i in range(lt):
line = meta.iloc[i]
vid = line['video']
idx = str(line['index'])
prediction.append(data[vid][idx] if idx in data[vid] else None)
meta['prediction'] = prediction
vstats['VALIDQ'] = len([x for x in prediction if x is not None])
vstats['INVALIDQ'] = len([x for x in prediction if x is None])
return meta, vstats
# It returns a dictionary
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mmbench_video import get_dimension_rating, system_prompt, build_prompt
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
judge = judge_kwargs['model']
nproc = judge_kwargs.pop('nproc', 4)
tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')
model = build_judge(system_prompt=system_prompt, **judge_kwargs)
assert model.working(), 'MMBench-Video evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE
if not osp.exists(score_file):
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if model.fail_msg not in v}
data = load(eval_file)
data_un = data[~data['index'].isin(res)]
data_un = data_un[~pd.isna(data_un['prediction'])]
lt = len(data_un)
prompts = [build_prompt(data_un.iloc[i]) for i in range(lt)]
indices = [data_un.iloc[i]['index'] for i in range(lt)]
if len(prompts):
_ = track_progress_rich(
model.generate,
prompts,
keys=indices,
save=tmp_file,
nproc=nproc,
chunksize=nproc
)
score_map = load(tmp_file)
data['score'] = [score_map[idx] if idx in score_map else -1 for idx in data['index']]
rejected = [x for x in score_map.values() if FAIL_MSG in x]
data['score'] = [int(x) if istype(x, int) else -1 for x in data['score']]
print(
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(score_map)} questions, '
f'failed to obtain the score for another {len(rejected)} questions. '
f'Those questions will be counted as 0 score in ALL rating, and will not be counted in VALID rating.'
)
dump(data, score_file)
rating = get_dimension_rating(score_file)
dump(rating, tgt_file)
return rating
import warnings
import pandas as pd
from abc import abstractmethod
from ..smp import *
from .image_base import ImageBaseDataset
class MMGenBench(ImageBaseDataset):
prompt_list = [
"""
# Role
You are an expert in the field of image understanding, focusing on the \
understanding of images and generating the image caption-prompt.
# Definition Explanation
image caption-prompt: Refers to the caption or description of an image, \
used to provide to a Text-to-Image model to generate a new image.
Text-to-Image model: Can generate a new image based on the provided image \
caption-prompt, such as stable diffusion 3, flux, and other image generation models.
# Task Description
Generate an image caption-prompt based on the input image.
# Key Points and Requirements
1. Accurately understand the input image and precisely generate an image caption-prompt.
2. The generated image caption-prompt, when provided to the Text-to-Image model, requires the \
Text-to-Image model to generate a new image that is as consistent as possible with the input image.
3. The generated image caption-prompt must conform to the preferences of the Text-to-Image model.
4. The generated image caption-prompt should describe the input image in as much \
detail as possible, and it should be between 20 to 60 words.
# Output Format
A string, that is the image caption-prompt. No extra output needed.
"""
]
TYPE = 'GenerateImgPrompt'
DATASET_URL = {
'MMGenBench-Test': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Test.tsv',
'MMGenBench-Domain': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Domain.tsv',
}
PROMPT_MAP = {
'MMGenBench-Test': prompt_list[0],
'MMGenBench-Domain': prompt_list[0],
}
DATASET_MD5 = {
'MMGenBench-Test': "94f8dac6bbf7c20be403f99adeaa73da",
'MMGenBench-Domain': "5c10daf6e2c5f08bdfb0701aa6db86bb",
}
def __init__(self, dataset='MMGenBench', **kwargs):
super().__init__(dataset, **kwargs)
warnings.warn('This dataset is for inference only and does not support direct output of evaluation results.\n')
warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n')
def load_data(self, dataset):
data = super().load_data(dataset)
if 'question' not in data:
data['question'] = [(
self.PROMPT_MAP[dataset]
)] * len(data)
return data
# Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
@abstractmethod
def evaluate(self, eval_file, **judge_kwargs):
warnings.warn('This evaluation method is not supported.\n')
warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n')
return None
import re
import math
from urllib.request import urlopen
from PIL import Image, ImageDraw, ImageFont
import torchvision.transforms as transforms
from vlmeval.dataset.utils import build_judge, levenshtein_distance
from vlmeval.smp import *
from .image_base import ImageBaseDataset
FAIL_MSG = 'Failed to obtain answer via API.'
def get_gpt4_ICE():
example_1 = """
---
Question: List the primary questions asked about the services in this report.
Analysis: The primary questions asked about the services in the report for The Limes Residential Home are:\n\n
1. Is the service safe?\n
2. Is the service effective?\n
3. Is the service caring?\n
4. Is the service responsive?\n
5. Is the service well-led?
Extracted answer: [
        'Is the service safe?',
        'Is the service effective?',
        'Is the service caring?',
'Is the service responsive?',
'Is the service well-led?'
]
Answer format: List\n
"""
example_2 = """
---
Question: How many regulations of the HSCA 2008 are breached in all according to this report?
Analysis: According to the report, the provider breached 10 Health and Social Care Act 2008 (Regulated Activities)
Regulations in total. Here are the specifics:\n\n1. Regulation 13: Safeguarding service users from abuse and
improper treatment\n2. Regulation 12: Safe care and treatment\n3. Regulation 18: Staffing\n4. Regulation 11:
Need for consent\n5. Regulation 10: Dignity and respect\n6. Regulation 9: Person-centred care\n7. Regulation 17:
Good governance\n8. Regulation 18 (CQC Registration Regulations 2009): Notification of other incidents\n9.
Regulation 18: Failure to maintain an accurate and up-to-date care plan\n10. Regulation 11: Failure to implement
the Mental Capacity Act 2005 code of practice effectively\n\nThese breaches involve issues concerning staffing,
safeguarding, medicines management, dignity and respect, consent, care planning, governance, and failure to
notify the CQC of incidents.
Extracted answer: 10
Answer format: Integer\n
"""
example_3 = """
---
Question: According to the survey that is the percentage of Chinese who are paying more or
about the same attention to politics after Trump's election?
Analysis: The survey provided does not specify the percentage of Chinese individuals specifically who are paying
more or about the same attention to politics after Trump's election. The report focuses primarily on American
demographics and does not include specific details about the Chinese population in relation to this question. If
you need information about a different demographic or a summary of the findings from the American demographic,
I can certainly help with that!
Extracted answer: Not answerable
Answer format: String\n
"""
example_4 = """
---
Question: How many quotations from male respondent over 50 years old are included in this report?
Analysis: The image you've provided appears to be a screenshot of a document with multiple charts. However, the
text is too small and blurry to read accurately. If you can provide a clearer image or more context, I might be
able to help you with your question.
Extracted answer: Fail to answer
Answer format: String\n
"""
return [example_1, example_2, example_3, example_4]
def build_mmlongbench_gpt4_prompt(line):
task_description = """
Given the question and analysis, you are tasked to extract answers with required formats from the free-form analysis.
- Your extracted answers should be one of the following formats: (1) Integer, (2) Float, (3) String and (4) List.
If you find the analysis the question can not be answered from the given documents, type "Not answerable".
Exception: If the analysis only tells you that it can not read/understand the images or documents,
type "Fail to answer".
- Please make your response as concise as possible. Also note that your response should be formatted as below:
```
Extracted answer: [answer]
Answer format: [answer format]
```
Please read the following example, then extract the answer from the model response
and type it at the end of the prompt.\n
"""
question = line['question']
prediction = str(line['prediction'])
prompt = task_description
examples = get_gpt4_ICE()
for example in examples:
prompt += example
prompt += '---\nQuestion:' + question + '\n'
prompt += 'Analysis: ' + prediction
return prompt
def anls_compute(groundtruth, prediction, threshold=0.5):
dist = levenshtein_distance(groundtruth, prediction)
length = max(len(groundtruth.upper()), len(prediction.upper()))
value = 0.0 if length == 0 else float(dist) / float(length)
anls = 1.0 - value
if anls <= threshold:
anls = 0.0
return anls
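# Illustrative sketch (assumes the imported levenshtein_distance is the plain edit
# distance): ANLS is 1 - dist / max(len(gt), len(pred)), and any value at or below
# the threshold (default 0.5) is clamped to 0.0.
def _example_anls():
    # A one-character slip keeps most of the credit; unrelated strings are clamped to 0.0.
    return anls_compute('2019', '2018'), anls_compute('table', 'chart')
    # -> (0.75, 0.0)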
def is_float_equal(reference, prediction, include_percentage: bool = False, is_close: bool = False) -> bool:
def get_precision(gt_ans: float) -> int:
precision = 3
if '.' in str(gt_ans):
precision = len(str(gt_ans).split('.')[-1])
return precision
reference = float(str(reference).strip().rstrip('%').strip())
try:
prediction = float(str(prediction).strip().rstrip('%').strip())
except:
return False
if include_percentage:
gt_result = [reference / 100, reference, reference * 100]
else:
gt_result = [reference]
for item in gt_result:
try:
if is_close:
if math.isclose(item, prediction, rel_tol=0.01):
return True
precision = max(min(get_precision(prediction), get_precision(item)), 2)
if round(prediction, precision) == round(item, precision):
return True
except Exception:
continue
return False
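# Illustrative sketch (hypothetical values): with include_percentage=True the reference
# also matches its /100 and *100 renderings, and is_close=True tolerates ~1% relative error.
def _example_is_float_equal():
    return (
        is_float_equal('12.5%', '0.125', include_percentage=True, is_close=True),  # True: 12.5 / 100 == 0.125
        is_float_equal('3.14', '2.71', include_percentage=True, is_close=True),    # False: outside tolerance
    )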
def get_clean_string(s):
    s = str(s).lower().strip()
    # strip unit suffixes before comparison
    if s.endswith('mile'):
        s = s.rstrip('mile').strip()
    if s.endswith('miles'):
        s = s.rstrip('miles').strip()
    if s.endswith('million'):
        s = s.rstrip('million').strip()
# remove parenthesis
s = re.sub(r'\s*\([^)]*\)', '', s).strip()
# remove quotes
s = re.sub(r"^['\"]|['\"]$", '', s).strip()
s = s.strip().lstrip('$').strip()
s = s.strip().rstrip('%').strip()
return s
def is_exact_match(s):
flag = False
# Website
if 'https://' in s:
flag = True
# code file
if s.endswith('.py') or s.endswith('ipynb'):
flag = True
if s.startswith('page'):
flag = True
# telephone number
if re.fullmatch(r'\b\d+(-\d+|\s\d+)?\b', s):
flag = True
# time
if 'a.m.' in s or 'p.m.' in s:
flag = True
# YYYY-MM-DD
if re.fullmatch(r'\b\d{4}[-\s]\d{2}[-\s]\d{2}\b', s):
flag = True
# YYYY-MM
if re.fullmatch(r'\b\d{4}[-\s]\d{2}\b', s):
flag = True
# Email address
if re.fullmatch(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', s):
flag = True
return flag
def isfloat(num):
try:
float(num)
return True
except ValueError:
return False
def get_font():
try:
truetype_url = "http://opencompass.openxlab.space/utils/Fonts/SimHei.ttf"
ff = urlopen(truetype_url)
font = ImageFont.truetype(ff, size=40)
except Exception as e:
logging.warning(f'{type(e)}: {e}')
logging.warning("Fail to download the font. Use the default one.")
font = ImageFont.load_default(size=40)
return font
def frame2img(img_path_list, font, save_path=None, idx_start=0):
imgs = [Image.open(img_path) for img_path in img_path_list]
new_imgs = []
for img in imgs:
w, h = img.size
scale = w / h
if w > h:
new_w = 560 * 2
new_h = int(560 * 2 / scale)
else:
new_w = int(560 * 2 * scale)
new_h = 560 * 2
img = transforms.functional.resize(img, [new_h, new_w],)
new_imgs.append(img)
imgs = new_imgs
new_w = 0
new_h = 0
pad = 40
if w > h:
for im in imgs:
w, h = im.size
new_w = max(new_w, w)
new_h += h + 10 + pad
new_img = Image.new("RGB", (new_w, new_h), "white")
draw = ImageDraw.Draw(new_img)
curr_h = 0
for idx, im in enumerate(imgs):
w, h = im.size
new_img.paste(im, (0, pad + curr_h))
draw.text((0, curr_h), f"<IMAGE {idx+idx_start}>", font=font, fill="black")
if idx + 1 < len(imgs):
draw.line([(0, pad + curr_h + h + 5), (new_w, pad + curr_h + h + 5)], fill='black', width=2)
curr_h += h + 10 + pad
else:
for im in imgs:
w, h = im.size
new_w += w + 10
new_h = max(new_h, h)
new_h += pad
new_img = Image.new('RGB', (new_w, new_h), 'white')
draw = ImageDraw.Draw(new_img)
curr_w = 0
for idx, im in enumerate(imgs):
w, h = im.size
new_img.paste(im, (curr_w, pad))
draw.text((curr_w, 0), f"<IMAGE {idx+idx_start}>", font=font, fill='black')
if idx + 1 < len(imgs):
draw.line([(curr_w + w + 5, 0), (curr_w + w + 5, new_h)], fill='black', width=2)
curr_w += w + 10
if save_path is not None:
new_img.save(save_path)
return new_img
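# concat_images (below) splits a long list of page images into batches and merges each
# batch into one composite: with column_num == -1 the batch is rendered by frame2img with
# "<IMAGE i>" labels (at most 20 pages per composite), otherwise pages are tiled into a
# grid with column_num columns.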
def concat_images(image_list, max_concat=1, column_num=1):
concatenated_images = []
if column_num == -1:
MAX_COLUMN_NUM = 20
max_concat = 1
while len(image_list) / max_concat > MAX_COLUMN_NUM:
max_concat += 1
interval = max(math.ceil(len(image_list) / max_concat), 1)
for i in range(0, len(image_list), interval):
batch_images = image_list[i:i + interval]
concatenated_image = frame2img(batch_images, font=get_font(), idx_start=i)
concatenated_images.append(concatenated_image)
else:
interval = max(math.ceil(len(image_list) / max_concat), 1)
for i in range(0, len(image_list), interval):
batch_images = [Image.open(filename) for filename in image_list[i:i + interval]]
if column_num == 1:
total_height = batch_images[0].height * len(batch_images)
else:
total_height = batch_images[0].height * ((len(batch_images) - 1) // column_num + 1)
concatenated_image = Image.new('RGB', (batch_images[0].width * column_num, total_height), 'white')
x_offset, y_offset = 0, 0
for count, image in enumerate(batch_images):
concatenated_image.paste(image, (x_offset, y_offset))
x_offset += image.width
if (count + 1) % column_num == 0:
y_offset += image.height
x_offset = 0
concatenated_images.append(concatenated_image)
return concatenated_images
def eval_score(gt, pred, answer_type):
if answer_type == 'Int':
try:
gt, pred = int(gt), int(float(pred))
except:
pred = ''
score = (gt == pred)
elif answer_type == 'Float':
try:
gt = float(get_clean_string(str(gt)))
pred = float(get_clean_string(str(pred)))
except:
pred = ''
score = is_float_equal(gt, pred, include_percentage=True, is_close=True)
elif answer_type == 'Str':
gt = get_clean_string(gt)
pred = get_clean_string(pred)
if is_exact_match(gt):
score = (gt == pred)
else:
score = anls_compute(gt, pred)
else:
if isinstance(gt, str) and gt.startswith('['):
gt = eval(gt)
if not isinstance(gt, list):
gt = [gt]
if isinstance(pred, str) and pred.startswith('['):
pred = eval(pred)
if not isinstance(pred, list):
pred = [pred]
print(len(gt), len(pred))
if len(gt) != len(pred):
score = 0.0
else:
gt = sorted([get_clean_string(a) for a in gt])
pred = sorted([get_clean_string(a) for a in pred])
print(gt, pred)
if isfloat(gt[0]) or is_exact_match(gt[0]):
score = ('-'.join(gt) == '-'.join(pred))
else:
score = min([anls_compute(gt_v, pred_v) for gt_v, pred_v in zip(gt, pred)])
return float(score)
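# Illustrative sketch (hypothetical values): eval_score dispatches on the answer format
# recorded in the dataset ('Int', 'Float', 'Str', or a list-valued answer).
def _example_eval_score():
    return (
        eval_score('10', '10 regulations', 'Int'),      # 0.0: the prediction is not a clean integer
        eval_score('12.5%', '0.125', 'Float'),          # 1.0: matched via the percentage-aware float check
        eval_score("['Paris', 'Lyon']", "['lyon', 'paris']", 'List'),  # 1.0: order-insensitive match
    )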
def MMLongBench_auxeval(model, line):
prompt = build_mmlongbench_gpt4_prompt(line)
log = ''
retry = 5
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
if FAIL_MSG in res:
log += f'Try {i}: output is {prediction}, failed to parse.\n'
else:
log += 'Succeed'
try:
pred = res.split('Answer format:')[0].split('Extracted answer:')[1].strip()
except:
pred = ''
return dict(log=log, res=res, pred=pred)
log += 'All 5 retries failed.\n'
return dict(log=log, res='', pred='')
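# get_f1 treats "Not answerable" as the negative class: recall averages the scores over
# questions whose ground-truth answer is answerable, precision averages the scores over
# questions the model claimed are answerable, and the result is their harmonic mean.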
def get_f1(data):
gt_pos_data = data[data.apply(lambda k: k['answer'] != 'Not answerable', axis=1)]
pred_pos_data = data[data.apply(lambda k: k['pred'] != 'Not answerable', axis=1)]
recall = sum(gt_pos_data['score'].tolist()) / len(gt_pos_data)
precision = sum(pred_pos_data['score'].tolist()) / len(pred_pos_data)
return 2 * recall * precision / (recall + precision)
def MMLongBench_acc(result_file):
data = load(result_file)
overall_score = 0.0
score_list = list()
for i in range(len(data)):
item = data.iloc[i]
try:
score = eval_score(item['answer'], item['pred'], item['answer_format'])
except:
score = 0.0
score_list.append(score)
overall_score += score
data['score'] = score_list
dump(data, result_file)
data_chart = data[data.apply(lambda k: 'Chart' in eval(k['evidence_sources']), axis=1)]
data_table = data[data.apply(lambda k: 'Table' in eval(k['evidence_sources']), axis=1)]
data_image = data[data.apply(lambda k: 'Figure' in eval(k['evidence_sources']), axis=1)]
data_text = data[data.apply(lambda k: 'Pure-text (Plain-text)' in eval(k['evidence_sources']), axis=1)]
data_layout = data[data.apply(lambda k: 'Generalized-text (Layout)' in eval(k['evidence_sources']), axis=1)]
data_single = data[data.apply(lambda k: len(eval(k['evidence_pages'])) == 1, axis=1)]
data_multi = data[data.apply(lambda k: len(eval(k['evidence_pages'])) > 1, axis=1)]
data_unans = data[data.apply(lambda k: len(eval(k['evidence_pages'])) == 0, axis=1)]
res = dict()
res['category'] = [
'overall_f1', 'overall_acc', 'text', 'layout', 'table', 'chart',
'image', 'single-page', 'multi-page', 'unanswerable'
]
res['num'] = [
len(data), len(data), len(data_text), len(data_layout), len(data_table),
len(data_chart), len(data_image), len(data_single), len(data_multi), len(data_unans)
]
res['avg_score'] = [
get_f1(data),
overall_score / len(data),
sum(data_text['score'].tolist()) / len(data_text) if len(data_text) > 0 else 0.0,
sum(data_layout['score'].tolist()) / len(data_layout) if len(data_layout) > 0 else 0.0,
sum(data_table['score'].tolist()) / len(data_table) if len(data_table) > 0 else 0.0,
sum(data_chart['score'].tolist()) / len(data_chart) if len(data_chart) > 0 else 0.0,
sum(data_image['score'].tolist()) / len(data_image) if len(data_image) > 0 else 0.0,
sum(data_single['score'].tolist()) / len(data_single) if len(data_single) > 0 else 0.0,
sum(data_multi['score'].tolist()) / len(data_multi) if len(data_multi) > 0 else 0.0,
sum(data_unans['score'].tolist()) / len(data_unans) if len(data_unans) > 0 else 0.0,
]
res = pd.DataFrame(res)
return res
class MMLongBench(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MMLongBench_DOC': 'https://opencompass.openxlab.space/utils/VLMEval/MMLongBench_DOC.tsv',
}
DATASET_MD5 = {
'MMLongBench_DOC': '9b393e1f4c52718380d50586197eac9b',
}
SUPPORTED_MODELS = {
'GPT4': (1, 1),
'GPT4V': (1, 1),
'GPT4V_HIGH': (1, 1),
'GPT4o': (1, 1),
'GPT4o_HIGH': (1, 1),
'GPT4o_MINI': (1, 1),
'MiniCPM-Llama3-V-2_5': (1, 5),
'InternVL-Chat-V1-5': (5, 2),
'XComposer2_4KHD': (1, 5),
'XComposer2d5': (1, -1),
}
def __init__(self, dataset, **kwargs):
self.model_list = list(self.SUPPORTED_MODELS.keys())
model_name = kwargs['model']
if not listinstr(self.model_list, model_name):
raise AssertionError("{} doesn't support the evaluation on MMLongBench_DOC.".format(model_name))
super(MMLongBench, self).__init__(dataset)
self.is_api = True if listinstr(['GPT4'], model_name) else False
self.max_pages = 120
concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
self.concat_num = concat_num
self.column_num = column_num
def dump_image(self, origin_line):
os.makedirs(self.img_root, exist_ok=True)
try:
import fitz
except Exception as e:
logging.critical(f'{type(e)}: {e}')
logging.critical('Please use `pip install pymupdf` to parse PDF files.')
line = origin_line.copy()
line['image_path'] = line['image_path'][:self.max_pages]
skip_pdf_parse = True
for im_name in line['image_path']:
path = osp.join(self.img_root, im_name)
if not read_ok(path):
skip_pdf_parse = False
break
        # Just for being compatible with the zipped loop below: zip(line['image'], line['image_path'])
if skip_pdf_parse:
line['image'] = line['image_path']
else:
pdf_data = base64.b64decode(line['image'])
pdf_file = io.BytesIO(pdf_data)
encoded_images = []
with fitz.open(stream=pdf_file, filetype='pdf') as doc:
doc = doc[:self.max_pages]
for page in doc:
image = page.get_pixmap(dpi=144)
image_file = io.BytesIO(image.tobytes(output='png'))
image = Image.open(image_file)
encoded_image = encode_image_to_base64(image)
encoded_images.append(encoded_image)
line['image'] = encoded_images
print('process {}'.format(line['doc_id']))
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(self.img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
if self.concat_num > 0 and not self.is_api:
concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)
old_tgt_path = tgt_path
assert isinstance(old_tgt_path, list)
if self.column_num != -1:
tgt_path = [
'_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
for i in range(len(concatenated_images))
]
else:
tgt_path = [
'_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all_{}.jpg'.format(i)
for i in range(len(concatenated_images))
]
for path, concatenated_image in zip(tgt_path, concatenated_images):
if not read_ok(path):
decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
num_images, image_size = len(old_tgt_path), concatenated_image.size
print('concat {} images to a new one with size {}. save at {}'.format(num_images, image_size, path))
return tgt_path
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
logger = get_logger('Evaluation')
model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
if osp.exists(storage):
logger.warning(f'GPT scoring file {storage} already exists, will reuse it in MMLongBench_eval. ')
else:
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = list()
for model, line in tqdm(tups):
res = MMLongBench_auxeval(model, line)
new_results.append(res)
log_map, res_map, pred_map = {}, {}, {}
all_inds = [line['index'] for line in lines]
for k, v in zip(all_inds, new_results):
log_map[k] = v['log']
res_map[k] = v['res']
pred_map[k] = v['pred']
data['res'] = [res_map[idx] for idx in data['index']]
data['log'] = [log_map[idx] for idx in data['index']]
data['pred'] = [pred_map[idx] for idx in data['index']]
dump(data, storage)
score = MMLongBench_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
dump(score, score_pth)
logger.info(f'MMLongBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
logger.info('Score: ')
logger.info(score)
import re
import json
import sympy as sp
import numpy as np
from sympy import simplify, Eq, sympify, Pow, pi
from sympy.parsing.latex import parse_latex
import sys
import math
import os
import argparse
from .image_base import ImageBaseDataset
from ..utils import track_progress_rich
from ..smp import load, dump
class AutoScoringJudge:
def __init__(self):
# Map of special symbols to their replacements
self.special_signal_map = {
"\\left": "",
"\\right": "",
"厘米":"",
# "∶": ":",
",": ",",
"$": "",
"(":"(",
")":")",
"\\infty":"oo",
"\\colon ":":",
# "\\approx": "=",
# "\\simeq": "=",
# "\\sim": "=",
# "^\\prime": "'",
# "^{\\prime}": "'",
"+":"+",
"\\, ": "",
"\\,":"",
"^\\circ": "",
"^{\\circ}": "",
# "%": "",
}
self.pi = parse_latex("\\pi")
# MM-Math default precision
self.precision = 1e-2
def trans_greater_sign_to_interval(self, expr:str):
expr_tmp = expr.split("<")
return "(" + expr_tmp[0] + ", " + expr_tmp[-1] + ")"
def split_by_comma(self, expr: str):
# Splits expressions by commas outside of brackets
in_bracket_num = 0
splitted_expr = []
start_idx = 0
for i, char in enumerate(expr):
if char in ["(", "["]:
in_bracket_num += 1
elif char in [")", "]"]:
in_bracket_num -= 1
elif char == "," and in_bracket_num == 0:
splitted_expr.append(expr[start_idx:i].strip())
start_idx = i + 1
if start_idx < len(expr):
splitted_expr.append(expr[start_idx:].strip())
return splitted_expr
def trans_plus_minus_sign(self, expr_list: list):
# Translates plus-minus signs into separate expressions
new_expr_list = []
for expr in expr_list:
if "\\pm" in expr:
new_expr_list.append(expr.replace("\\pm", "+"))
new_expr_list.append(expr.replace("\\pm", "-"))
else:
new_expr_list.append(expr)
return new_expr_list
def judge(self, expression1, expression2, precision=1e-2):
# Judge if two expressions are equal (expression1 is considered as the Ground Truth)
# Default precision is a list for supporting multiple expressions
precision = precision if isinstance(precision, list) else [precision]
try:
expression1, expression2 = self.preprocess(expression1, expression2)
except:
return False
if expression1 == expression2:
# print("Exactly equal")
return True
# Remove Chinese characters from the string, as answers like "yes" or "no" in Chinese have been considered
expression1 = expression1 if re.fullmatch(r"[\u4e00-\u9fff]+", expression1) else re.sub(r'[\u4e00-\u9fff]+', '', expression1) # noqa: E501
expression2 = expression2 if re.fullmatch(r'[\u4e00-\u9fff]+', expression2) else re.sub(r'[\u4e00-\u9fff]+', '', expression2) # noqa: E501
# Check if two < or > in expression
if self.is_two_greater_sign(expression1):
expression1 = self.trans_greater_sign_to_interval(expression1)
if self.is_two_greater_sign(expression2):
expression2 = self.trans_greater_sign_to_interval(expression2)
expression1 = self.split_by_comma(expression1)
expression2 = self.split_by_comma(expression2)
temp_list1 = self.trans_plus_minus_sign(expression1)
temp_list2 = self.trans_plus_minus_sign(expression2)
# Set up a list for allowed errors
if len(precision) <= 1:
precision = precision * len(temp_list1)
if len(temp_list1) != len(temp_list2):
return False
# Check if elements in both lists can be paired and are equal
idx = -1
while len(temp_list1) != 0:
idx = (idx + 1) % len(temp_list1)
item1 = temp_list1[idx]
self.precision = precision[idx]
for item2 in temp_list2:
if self.is_equal(item1, item2):
temp_list1.remove(item1)
temp_list2.remove(item2)
precision.remove(self.precision)
break
else:
# If no match was found, return False
return False
# If all elements are matched, return True
return True
def is_interval(self, expr):
# Checks if an expression is an interval
return expr.startswith(("(", "[")) and expr.endswith((")", "]"))
def is_two_greater_sign(self, expr):
match = re.findall(r'<', expr)
return len(match) == 2
def sympy_sub_pi(self, expression_sympy):
# Replaces the symbol for pi in sympy expressions with its numerical value
return expression_sympy.subs(self.pi, math.pi)
def is_equal(self, expression1, expression2):
# Default first expression is ground truth. Check if expressions are equal in different aspects
if expression1 == expression2 and expression1 != "" and expression2 != "":
# print("Equivalent natively")
return True
# First check if both are intervals
if self.is_interval(expression1) and self.is_interval(expression2):
try:
if self.interval_equal(expression1, expression2):
# print("Interval equivalent")
return True
except:
return False
# Then check for numerical equality
try:
if self.numerical_equal(expression1, expression2):
# print("Numerically equivalent")
return True
except:
pass
# Then check if expressions are mathematically equal
try:
if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2):
# print("Expression equivalent")
return True
except:
pass
# Lastly, check for equation equality
try:
if self.equation_equal(expression1, expression2):
# print("Equation equivalent")
return True
except:
pass
return False
def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True):
# Check if two numerical values are equal within an allowed error range
# Includes possible percentage cases
reference = float(expression1)
prediction = float(expression2)
if include_percentage:
gt_result = [reference / 100, reference, reference * 100]
else:
gt_result = [reference]
for item in gt_result:
if abs(item - prediction) <= self.precision * 1.01:
return True
return False
def expression_equal(self, exp1, exp2):
# Check if two expressions are mathematically equivalent
# Extract expression and use sympy for equivalence checking
def extract_expression(expression):
if "=" in expression:
expression = expression.split("=")[1]
return expression.strip()
exp1 = extract_expression(exp1)
exp2 = extract_expression(exp2)
exp_too_long = len(exp1) > 300 or len(exp2) > 300
expr1_sym = sympify(parse_latex(exp1))
expr2_sym = sympify(parse_latex(exp2))
if expr1_sym == expr2_sym:
return True
else:
expr1_sym = self.sympy_sub_pi(expr1_sym)
expr2_sym = self.sympy_sub_pi(expr2_sym)
if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or \
(not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)):
return False
elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol):
try:
if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)):
print("These two numbers cannot be calculated by the current computer for: "
f"\"{str(expr1_sym)}\" and \"{str(expr2_sym)}\"")
return False
if exp_too_long:
print(f'Expression {exp1} or {exp2} is too long to compute. ')
return False
if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01:
return True
else:
return False
except:
return False
elif exp_too_long:
print(f'Expression {exp1} or {exp2} is too long to compute. ')
return False
else:
try:
simplified_expr = simplify(expr1_sym - expr2_sym)
num_value = simplified_expr.evalf()
return abs(num_value) < 1e-3
except:
return False
def equation_equal(self, expression1, expression2):
# Check if two equations are mathematically equivalent
# Simplify equations and use sympy for equivalence checking
def simplify_equation(latex_eq):
lhs, rhs = latex_eq.split('=')
lhs_expr = parse_latex(lhs)
rhs_expr = parse_latex(rhs)
equation = Eq(lhs_expr, rhs_expr)
simplified_eq = simplify(equation.lhs - equation.rhs)
return simplified_eq
expr1_sym = simplify_equation(expression1)
expr2_sym = simplify_equation(expression2)
division_result_1 = simplify(expr1_sym / expr2_sym)
division_result_2 = simplify(expr2_sym / expr1_sym)
if ((division_result_1.is_Integer and division_result_1 != 0) or # noqa: W504
(division_result_2.is_Integer and division_result_2 != 0)):
return True
else:
return False
def interval_equal(self, expression1, expression2):
# Check if two intervals are mathematically equivalent
def compare_two_interval(inter1, inter2):
if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]:
return False
inter1 = inter1.strip('[]()')
inter2 = inter2.strip('[]()')
items_1 = inter1.split(',')
items_2 = inter2.split(',')
for item_1, item_2 in zip(items_1, items_2):
if not self.expression_equal(item_1, item_2):
return False
return True
interval1 = expression1
interval2 = expression2
if interval1 == interval2:
return True
else:
inter_list1 = interval1.split("\\cup")
inter_list2 = interval2.split("\\cup")
if len(inter_list1) != len(inter_list2):
return False
else:
for inter1, inter2 in zip(inter_list1, inter_list2):
if not compare_two_interval(inter1, inter2):
return False
return True
def preprocess(self, expression1, expression2):
# Preprocess expressions to extract and replace special symbols
def extract_boxed_content(latex_str):
boxed_matches = re.finditer(r'\\boxed{', latex_str)
results = ""
for match in boxed_matches:
start_index = match.end()
end_index = start_index
stack = 1
while stack > 0 and end_index < len(latex_str):
if latex_str[end_index] == '{':
stack += 1
elif latex_str[end_index] == '}':
stack -= 1
end_index += 1
if stack == 0:
content = latex_str[start_index:end_index - 1]
results += content + ","
else:
raise ValueError("Mismatched braces in LaTeX string.")
if results == "":
last_line_ans = latex_str.strip().split("\n")[-1]
dollar_pattern = r"\$(.*?)\$"
answers = re.findall(dollar_pattern, last_line_ans)
if answers:
for ans in answers:
results += ans + ","
else:
results = latex_str
return results
        def special_symbol_replace(expression):
expression = expression.replace("\\text{cm}^2", '').replace("\\text{cm}", "").replace("\\,cm", '').replace("\\text{ cm}", '').replace("cm", '').replace("\\text{分米}^2", '').replace("cm^{2}", '').replace("60 \\text{ cm}^2",'').replace("\\ \\text{m}", "").replace("\\text{米}","").strip() # noqa: E501
expression = re.sub(r"(.+)m$", r"\1", expression)
if "\\in " in expression:
expression = expression.split("\\in ")[1]
for signal in self.special_signal_map:
expression = expression.replace(signal, self.special_signal_map[signal])
expression = re.sub(r'(\\sin|\\cos|\\tan)(\d+)', r'\1((\2/180)\\pi)', expression)
expression = expression.strip("\n,.:;^_=+`!@#%^&*~,。")
pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}'
expression = re.sub(pattern, r'\1', expression)
return expression
exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2)
        exp1, exp2 = special_symbol_replace(exp1), special_symbol_replace(exp2)
return exp1, exp2
def can_compute_power(self, expr):
# Checks if a power expression can be computed
if isinstance(expr, Pow):
base, exp = expr.as_base_exp()
if base.is_number and exp.is_number:
MAX_EXP = 1000 # Adjust based on computing environment
if abs(exp.evalf()) > MAX_EXP:
return False
else:
return True
else:
return False
else:
return True # Not a power expression, can compute
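# Illustrative sketch (hypothetical answers; requires sympy's LaTeX parser and the antlr4
# runtime): AutoScoringJudge.judge extracts \boxed{...} content when present and accepts
# numerically or symbolically equivalent expressions within the default 1e-2 precision.
def _example_auto_scoring_judge():
    judge = AutoScoringJudge()
    return (
        judge.judge('\\boxed{\\frac{1}{2}}', '0.5'),  # True: 1/2 and 0.5 are equivalent
        judge.judge('\\boxed{\\frac{1}{2}}', '0.6'),  # False: differs by more than 1e-2
    )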
class MMMath(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MM-Math': 'https://opencompass.openxlab.space/utils/VLMEval/MM-Math.tsv',
}
DATASET_MD5 = {
'MM-Math': '1f064ed7c4e0e8926a3fa65849419ca5',
}
@classmethod
def evaluate(self, eval_file, **kwargs):
data = load(eval_file)
judger = AutoScoringJudge()
func = judger.judge
tups = [dict(expression1=x, expression2=y) for x, y in zip(data['answer'], data['prediction'])]
res = track_progress_rich(func, tups, nproc=16)
data['hit'] = res
dump(data, eval_file)
score_file = eval_file.replace('.xlsx', '_score.json')
score = {}
score['overall'] = np.mean(data['hit'])
# Results by Difficulty
difficulties = set(data['difficulty'])
for d in difficulties:
score[f'Difficulty-{d}'] = np.mean(data[data['difficulty'] == d]['hit'])
# Results by Year
years = set(data['year'])
for y in years:
score[f'Year-{y}'] = np.mean(data[data['year'] == y]['hit'])
# Results by Knowledge-L1
points = set(data['knowledge_l1'])
for p in points:
score[f'Knowledge-L1-{p}'] = np.mean(data[data['knowledge_l1'] == p]['hit'])
# Results by Knowledge-L2
points = set(data['knowledge_l2'])
for p in points:
score[f'Knowledge-L2-{p}'] = np.mean(data[data['knowledge_l2'] == p]['hit'])
dump(score, score_file)
return score