Commit bc5ebf0f authored by luopl

Initial commit
import huggingface_hub
from huggingface_hub import snapshot_download
from ..smp import *
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
import torchvision.transforms as T
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from decord import VideoReader, cpu
import imageio
import cv2
import zipfile
import os
import glob
from .utils.mvbench import *
FAIL_MSG = 'Failed to obtain answer via API.'
class MVBench(VideoBaseDataset):
MD5 = 'fd21d36522cdedd46d84dc46715ad832'
SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \
the detail and movement of objects, and the action and pose of persons. \
Based on your observations, select the best option that accurately addresses the question.
"""
TYPE = 'Video-MCQ'
def __init__(self, dataset='MVBench', nframe=0, fps=-1):
self.type_data_list = {
'Action Sequence': ('action_sequence.json',
'your_data_path/star/Charades_v1_480/', 'video', True), # has start & end
'Action Prediction': ('action_prediction.json',
'your_data_path/star/Charades_v1_480/', 'video', True), # has start & end
'Action Antonym': ('action_antonym.json',
'your_data_path/ssv2_video/', 'video', False),
'Fine-grained Action': ('fine_grained_action.json',
'your_data_path/Moments_in_Time_Raw/videos/', 'video', False),
'Unexpected Action': ('unexpected_action.json',
'your_data_path/FunQA_test/test/', 'video', False),
'Object Existence': ('object_existence.json',
'your_data_path/clevrer/video_validation/', 'video', False),
'Object Interaction': ('object_interaction.json',
'your_data_path/star/Charades_v1_480/', 'video', True), # has start & end
'Object Shuffle': ('object_shuffle.json',
'your_data_path/perception/videos/', 'video', False),
'Moving Direction': ('moving_direction.json',
'your_data_path/clevrer/video_validation/', 'video', False),
'Action Localization': ('action_localization.json',
'your_data_path/sta/sta_video/', 'video', True), # has start & end
'Scene Transition': ('scene_transition.json',
'your_data_path/scene_qa/video/', 'video', False),
'Action Count': ('action_count.json',
'your_data_path/perception/videos/', 'video', False),
'Moving Count': ('moving_count.json',
'your_data_path/clevrer/video_validation/', 'video', False),
'Moving Attribute': ('moving_attribute.json',
'your_data_path/clevrer/video_validation/', 'video', False),
'State Change': ('state_change.json',
'your_data_path/perception/videos/', 'video', False),
'Fine-grained Pose': ('fine_grained_pose.json',
'your_data_path/nturgbd/', 'video', False),
'Character Order': ('character_order.json',
'your_data_path/perception/videos/', 'video', False),
'Egocentric Navigation': ('egocentric_navigation.json',
'your_data_path/vlnqa/', 'video', False),
'Episodic Reasoning': ('episodic_reasoning.json',
'your_data_path/tvqa/frames_fps3_hq/', 'frame', True), # has start & end, read frame
'Counterfactual Inference': ('counterfactual_inference.json',
'your_data_path/clevrer/video_validation/', 'video', False),
}
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
@classmethod
def supported_datasets(cls):
return ['MVBench']
def prepare_dataset(self, dataset_name='MVBench', repo_id='OpenGVLab/MVBench'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if not os.path.exists(data_file):
return False
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for idx, item in data.iterrows():
if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
return False
return True
if modelscope_flag_set():
repo_id = 'modelscope/MVBench'
cache_path = get_cache_path(repo_id, branch='main')
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def unzip_hf_zip(pth):
pth = os.path.join(pth, 'video/')
for filename in os.listdir(pth):
if filename.endswith('.zip'):
# Build the full path to the zip file
zip_path = os.path.join(pth, filename)
# Extract the zip archive
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(pth)
def generate_tsv(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if os.path.exists(data_file) and md5(data_file) == self.MD5:
return
json_data_dir = os.path.join(pth, 'json')
self.data_list = []
for k, v in self.type_data_list.items():
with open(os.path.join(json_data_dir, v[0]), 'r') as f:
json_data = json.load(f)
for data in json_data:
if os.path.exists(os.path.join(pth, v[1].replace('your_data_path', 'video'), data['video'])):
self.data_list.append({
'task_type': k,
'prefix': v[1].replace('your_data_path', 'video'),
'data_type': v[2],
'bound': v[3],
'start': data['start'] if 'start' in data.keys() else None,
'end': data['end'] if 'end' in data.keys() else None,
'video': data['video'],
'question': data['question'],
'answer': data['answer'],
'candidates': data['candidates']
})
else:
print(
'The NTURGB-D zip file has been removed from the MVBench release; see '
'https://huggingface.co/datasets/OpenGVLab/MVBench for details.'
)
raise Exception(
f"{os.path.join(v[1].replace('your_data_path', 'video'), data['video'])} does not exist"
)
data_df = pd.DataFrame(self.data_list)
data_df = data_df.assign(index=range(len(data_df)))
data_df.to_csv(data_file, sep='\t', index=False)
def move_files(pth):
src_folder = os.path.join(pth, 'video/data0613')
if not os.path.exists(src_folder):
return
for subdir in os.listdir(src_folder):
subdir_path = os.path.join(src_folder, subdir)
if os.path.isdir(subdir_path):
for subsubdir in os.listdir(subdir_path):
subsubdir_path = os.path.join(subdir_path, subsubdir)
if os.path.isdir(subsubdir_path):
for item in os.listdir(subsubdir_path):
item_path = os.path.join(subsubdir_path, item)
target_folder = os.path.join(pth, 'video', subdir, subsubdir)
if not os.path.exists(target_folder):
os.makedirs(target_folder)
target_path = os.path.join(target_folder, item)
try:
shutil.move(item_path, target_path)
except Exception as e:
print(f"Error moving {item_path} to {target_path}: {e}")
if modelscope_flag_set():
from modelscope import dataset_snapshot_download
dataset_path = dataset_snapshot_download(dataset_id=repo_id, revision='master')
else:
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
huggingface_hub.login(hf_token)
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
unzip_hf_zip(dataset_path)
move_files(dataset_path)
generate_tsv(dataset_path)
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
self.decord_method = {
'video': self.read_video,
'gif': self.read_gif,
'frame': self.read_frame,
}
self.nframe = 8
self.frame_fps = 3
# transform
self.transform = T.Compose([
Stack(),
ToTorchFormatTensor()
])
return dict(root=dataset_path, data_file=data_file)
def get_index(self, bound, fps, max_frame, first_idx=0):
if bound:
start, end = bound[0], bound[1]
else:
start, end = -100000, 100000
start_idx = max(first_idx, round(start * fps))
end_idx = min(round(end * fps), max_frame)
seg_size = float(end_idx - start_idx) / self.num_segments
frame_indices = np.array([
int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
for idx in range(self.num_segments)
])
return frame_indices
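# Descriptive note (added comment, not in the original source): get_index samples the
# midpoint of each of `self.num_segments` equal-length spans between start_idx and end_idx,
# i.e. index_k = start_idx + seg_size / 2 + round(seg_size * k), which gives evenly spaced
# frame indices inside the requested bound (or over the whole clip when bound is None).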
def read_video(self, video_path, bound=None):
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
max_frame = len(vr) - 1
fps = float(vr.get_avg_fps())
images_group = list()
frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
for frame_index in frame_indices:
img = Image.fromarray(vr[frame_index].asnumpy())
images_group.append(img)
torch_imgs = self.transform(images_group)
return torch_imgs
def read_gif(self, video_path, bound=None, fps=25):
gif = imageio.get_reader(video_path)
max_frame = len(gif) - 1
images_group = list()
frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
for index, frame in enumerate(gif):
if index in frame_indices:
img = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)
img = Image.fromarray(img)
images_group.append(img)
torch_imgs = self.transform(images_group)
return torch_imgs
def read_frame(self, video_path, bound=None, fps=3):
max_frame = len(os.listdir(video_path))
images_group = list()
frame_indices = self.get_index(bound, fps, max_frame, first_idx=1) # frame_idx starts from 1
for frame_index in frame_indices:
img = Image.open(os.path.join(video_path, f'{frame_index:05d}.jpg'))
images_group.append(img)
torch_imgs = self.transform(images_group)
return torch_imgs
def save_video_frames(self, imgs, video_name, frames):
frame_paths = self.frame_paths(video_name)
flag = np.all([osp.exists(p) for p in frame_paths])
if not flag:
block_size = imgs.size(0) // frames
split_tensors = torch.split(imgs, block_size)
to_pil = transforms.ToPILImage()
images = [to_pil(arr) for arr in split_tensors]
for im, pth in zip(images, frame_paths):
if not osp.exists(pth):
im.save(pth)
return frame_paths
def qa_template(self, data):
question = f"Question: {data['question']}\n"
question += 'Options:\n'
answer = data['answer']
answer_idx = -1
for idx, c in enumerate(eval(data['candidates'])):
question += f"({chr(ord('A') + idx)}) {c}\n"
if c == answer:
answer_idx = idx
question = question.rstrip()
answer = f"({chr(ord('A') + answer_idx)}) {answer}"
return question, answer
def load_into_video_and_process(self, line):
try:
from moviepy.editor import VideoFileClip, ImageSequenceClip
except ImportError:
raise ImportError(
'MoviePy is not installed, please install it by running "pip install moviepy==1.0.3"'
)
video_path = os.path.join(self.data_root, line['prefix'], line['video'])
if line['data_type'] in ['gif'] or os.path.splitext(video_path)[1] in ['.webm']:
processed_video_path = video_path.replace(os.path.splitext(video_path)[1], '.mp4')
if not os.path.exists(processed_video_path):
# using MoviePy to transform GIF, webm into mp4 format
gif_clip = VideoFileClip(video_path)
gif_clip.write_videofile(processed_video_path, codec='libx264')
gif_clip.close()
elif line['data_type'] in ['frame']:
input_images = os.path.join(video_path, '*.jpg')
processed_video_path = f'{video_path}.mp4'
if not os.path.exists(processed_video_path):
# using MoviePy to transform images into mp4
image_files = sorted(glob.glob(input_images))
image_clip = ImageSequenceClip(image_files, fps=self.frame_fps)
image_clip.write_videofile(processed_video_path, codec='libx264')
image_clip.close()
else:
processed_video_path = video_path
if line['bound']:
base_name, suffix = os.path.splitext(processed_video_path)
output_video_path = f'{base_name}_processed{suffix}'
if not os.path.exists(output_video_path):
video_clip = VideoFileClip(processed_video_path)
clip = video_clip.subclip(line['start'], min(line['end'], video_clip.duration))
clip.write_videofile(output_video_path)
clip.close()
else:
output_video_path = processed_video_path
return output_video_path
def save_video_into_images(self, line):
bound = None
if line['bound']:
bound = (
line['start'],
line['end'],
)
video_path = os.path.join(self.data_root, line['prefix'], line['video'])
decord_method = self.decord_method[line['data_type']]
self.num_segments = self.nframe
torch_imgs = decord_method(video_path, bound)
img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments)
return img_frame_paths
def build_prompt(self, line, video_llm):
if self.fps > 0:
raise ValueError('MVBench does not support the fps setting, please switch to MVBench_MP4!')
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
question, answer = self.qa_template(line)
message = [dict(type='text', value=self.SYS, role='system')]
message.append(dict(type='text', value=question))
if video_llm:
new_video_path = self.load_into_video_and_process(line)
message.append(dict(type='video', value=new_video_path))
else:
img_frame_paths = self.save_video_into_images(line)
for im in img_frame_paths:
message.append(dict(type='image', value=im))
message.append(dict(type='text', value='\nOnly give the best option.'))
message.append(dict(type='text', value='Best option:(', role='assistant'))
return message
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
tgt_file = eval_file.replace('.xlsx', '_rating.json')
score_file = eval_file.replace('.xlsx', '_score.xlsx')
if not osp.exists(score_file):
model = judge_kwargs.setdefault('model', 'chatgpt-0125')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if FAIL_MSG not in v}
data = load(eval_file)
data_un = data[~pd.isna(data['prediction'])]
for idx in data_un['index']:
ans = data.loc[data['index'] == idx, 'answer'].values[0]
pred = data.loc[data['index'] == idx, 'prediction'].values[0]
options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
answer_idx = -1
for id, c in enumerate(options):
if c == ans:
answer_idx = id
ans = f"({chr(ord('A') + answer_idx)}) {ans}"
input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
for id, option_content in enumerate(eval(input_item['candidates'])):
input_item[chr(ord('A') + id)] = option_content
if option_content == input_item['answer']:
input_item['answer'] = chr(ord('A') + id)
if FAIL_MSG in pred:
data.loc[idx, 'score'] = -1
else:
data.loc[idx, 'score'] = int(check_ans_with_model(
pred, ans, model,
input_item,
'MVBench'
))
rejected = [x for x in data['score'] if x == -1]
print(
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
f'failed to obtain the score for another {len(rejected)} questions. '
f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
)
dump(data, score_file)
rating = get_dimension_rating(score_file)
dump(rating, tgt_file)
return rating
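# Illustrative usage (a minimal sketch, not part of the original module; assumes the
# VLMEvalKit package exposes MVBench from vlmeval.dataset and that the benchmark data
# can be fetched from HuggingFace/ModelScope):
#   from vlmeval.dataset import MVBench
#   bench = MVBench('MVBench', nframe=8)
#   msg = bench.build_prompt(0, video_llm=False)  # system text + question + 8 frame images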
class MVBench_MP4(VideoBaseDataset):
MP4_MD5 = '5c8c6f8b7972c2de65a629590f7c42f5'
SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \
the detail and movement of objects, and the action and pose of persons. \
Based on your observations, select the best option that accurately addresses the question.
"""
TYPE = 'Video-MCQ'
def __init__(self, dataset='MVBench_MP4', nframe=0, fps=-1):
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
@classmethod
def supported_datasets(cls):
return ['MVBench_MP4']
def prepare_dataset(self, dataset_name='MVBench_MP4', repo_id='OpenGVLab/MVBench'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if not os.path.exists(data_file):
return False
if md5(data_file) != self.MP4_MD5:
return False
data = load(data_file)
for idx, item in data.iterrows():
if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
return False
return True
if modelscope_flag_set():
repo_id = 'modelscope/MVBench'
cache_path = get_cache_path(repo_id, branch='video')
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def generate_tsv(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if os.path.exists(data_file) and md5(data_file) == self.MP4_MD5:
return
json_data_path = os.path.join(pth, 'test.json')
json_data = load(json_data_path)
root_data_dict = json_data['root']
self.data_list = []
for k, v in json_data['meta'].items():
for item in v:
self.data_list.append({
'task_type': k,
'prefix': root_data_dict[k],
'video': item['video'],
'question': item['question'],
'answer': item['answer'],
'candidates': item['candidates']
})
data_df = pd.DataFrame(self.data_list)
data_df = data_df.assign(index=range(len(data_df)))
data_df.to_csv(data_file, sep='\t', index=False)
if modelscope_flag_set():
from modelscope import dataset_snapshot_download
dataset_path = dataset_snapshot_download(dataset_id=repo_id, revision='video')
else:
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
huggingface_hub.login(hf_token)
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset', revision='video')
generate_tsv(dataset_path)
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
# transform
self.transform = T.Compose([
Stack(),
ToTorchFormatTensor()
])
return dict(root=dataset_path, data_file=data_file)
def qa_template(self, data):
question = f"Question: {data['question']}\n"
question += 'Options:\n'
answer = data['answer']
answer_idx = -1
for idx, c in enumerate(eval(data['candidates'])):
question += f"({chr(ord('A') + idx)}) {c}\n"
if c == answer:
answer_idx = idx
question = question.rstrip()
answer = f"({chr(ord('A') + answer_idx)}) {answer}"
return question, answer
def get_index_by_frame(self, max_frame):
seg_size = float(max_frame) / self.num_segments
frame_indices = np.array([
int((seg_size / 2) + np.round(seg_size * idx))
for idx in range(self.num_segments)
])
return frame_indices
def get_index_by_fps(self, vid, fps):
total_frames = len(vid)
video_fps = vid.get_avg_fps()
total_duration = total_frames / video_fps
required_frames = int(total_duration * fps)
step_size = video_fps / fps
frame_indices = np.array([int(i * step_size) for i in range(required_frames)])
self.num_segments = len(frame_indices)
return frame_indices
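# Descriptive note (added comment, not in the original source): with fps > 0 the sampler
# steps through the clip with a stride of video_fps / fps frames, so a 30 fps video read
# at fps=1 keeps roughly one frame per second; num_segments is then derived from the clip
# duration instead of being fixed by nframe.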
def read_video(self, video_path):
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
max_frame = len(vr) - 1
images_group = list()
if self.fps < 0:
frame_indices = self.get_index_by_frame(max_frame)
else:
frame_indices = self.get_index_by_fps(vr, self.fps)
for frame_index in frame_indices:
img = Image.fromarray(vr[frame_index].asnumpy())
images_group.append(img)
torch_imgs = self.transform(images_group)
return torch_imgs
def save_video_frames(self, imgs, video_name, frames):
if self.fps > 0:
frame_paths = self.frame_paths_fps(video_name, frames)
else:
frame_paths = self.frame_paths(video_name)
flag = np.all([osp.exists(p) for p in frame_paths])
if not flag:
block_size = imgs.size(0) // frames
split_tensors = torch.split(imgs, block_size)
to_pil = transforms.ToPILImage()
images = [to_pil(arr) for arr in split_tensors]
for im, pth in zip(images, frame_paths):
if not osp.exists(pth):
im.save(pth)
return frame_paths
def save_video_into_images(self, line):
video_path = os.path.join(self.data_root, line['prefix'], line['video'])
if self.fps <= 0:
self.num_segments = self.nframe
else:
self.num_segments = 0
torch_imgs = self.read_video(video_path)
img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments)
return img_frame_paths
def build_prompt(self, line, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
question, answer = self.qa_template(line)
message = [dict(type='text', value=self.SYS, role='system')]
message.append(dict(type='text', value=question))
video_path = os.path.join(self.data_root, line['prefix'], line['video'])
if video_llm:
message.append(dict(type='video', value=video_path))
else:
img_frame_paths = self.save_video_into_images(line)
for im in img_frame_paths:
message.append(dict(type='image', value=im))
message.append(dict(type='text', value='\nOnly give the best option.'))
message.append(dict(type='text', value='Best option:(', role='assistant'))
return message
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
tgt_file = eval_file.replace('.xlsx', '_rating.json')
score_file = eval_file.replace('.xlsx', '_score.xlsx')
if not osp.exists(score_file):
model = judge_kwargs.setdefault('model', 'chatgpt-0125')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if FAIL_MSG not in v}
data = load(eval_file)
data_un = data[~pd.isna(data['prediction'])]
for idx in data_un['index']:
ans = data.loc[data['index'] == idx, 'answer'].values[0]
pred = data.loc[data['index'] == idx, 'prediction'].values[0]
options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
answer_idx = -1
for id, c in enumerate(options):
if c == ans:
answer_idx = id
ans = f"({chr(ord('A') + answer_idx)}) {ans}"
input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
for id, option_content in enumerate(eval(input_item['candidates'])):
input_item[chr(ord('A') + id)] = option_content
if option_content == input_item['answer']:
input_item['answer'] = chr(ord('A') + id)
if FAIL_MSG in pred:
data.loc[idx, 'score'] = -1
else:
data.loc[idx, 'score'] = int(check_ans_with_model(
pred, ans, model,
input_item,
'MVBench_MP4'
))
rejected = [x for x in data['score'] if x == -1]
print(
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
f'failed to obtain the score for another {len(rejected)} questions. '
f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
)
dump(data, score_file)
rating = get_dimension_rating(score_file)
dump(rating, tgt_file)
return rating
import re
import math
from typing import List
from vlmeval.dataset.utils.judge_util import build_judge
from vlmeval.smp import *
from .image_base import ImageBaseDataset
from .mmlongbench import concat_images, MMLongBench_auxeval, anls_compute
FAIL_MSG = 'Failed to obtain answer via API.'
def get_f1(gt, pred):
gt_bow, pred_bow = gt.strip().split(), pred.strip().split()
if not gt_bow or not pred_bow:
return 0.0
recall = len([pred_e for pred_e in pred_bow if pred_e in gt_bow]) / len(gt_bow)
precision = len([pred_e for pred_e in pred_bow if pred_e in gt_bow]) / len(pred_bow)
f1 = 2 * recall * precision / (recall + precision) if (recall + precision) > 1e-4 else 0.0
return f1
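# Worked example (illustrative, not in the original source):
#   get_f1('the cat sat', 'the cat')
#   -> recall = 2/3 (two of three reference tokens matched), precision = 2/2 = 1.0,
#      F1 = 2 * (2/3) * 1.0 / (2/3 + 1.0) = 0.8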
def SlideVQA_acc(result_file):
data = load(result_file)
anls_list, em_list, f1_list = list(), list(), list()
for i in range(len(data)):
item = data.iloc[i]
if isinstance(item['answer'], float) and math.isnan(item['answer']):
item['answer'] = 'Not answerable'
item['answer'] = re.sub('\n', '', item['answer']).lower()
item['pred'] = str(item['pred']).lower()
anls_score = anls_compute(item['answer'], item['pred'])
em_score = (item['answer'].strip() == item['pred'].strip())
f1_score = get_f1(item['answer'], item['pred'])
anls_list.append(anls_score)
em_list.append(em_score)
f1_list.append(f1_score)
print('---------------------')
print(item['answer'], item['pred'], anls_score, em_score, f1_score)
data['anls'] = anls_list
data['em'] = em_list
data['f1'] = f1_list
dump(data, result_file)
res = dict()
res['category'], res['num'] = ['anls', 'EM', 'F1'], [len(data), len(data), len(data)]
res['avg'] = [sum(anls_list) / len(data), sum(em_list) / len(data), sum(f1_list) / len(data)]
res = pd.DataFrame(res)
return res
class SlideVQA(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'SLIDEVQA_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/SLIDEVQA_MINI.tsv',
'SLIDEVQA': 'https://opencompass.openxlab.space/utils/VLMEval/SLIDEVQA.tsv',
}
DATASET_MD5 = {
'SLIDEVQA_MINI': '6d9a8d8814fa5b7669deb2af3a3208eb',
'SLIDEVQA': '5e822c2f800e94c1e23badfd478326b6',
}
SUPPORTED_MODELS = {
'GPT4': (1, 1),
'GPT4V': (1, 1),
'GPT4V_HIGH': (1, 1),
'GPT4o': (1, 1),
'GPT4o_HIGH': (1, 1),
'GPT4o_MINI': (1, 1),
'XComposer2d5': (1, -1),
'XComposer2_4KHD': (1, -1),
'MiniCPM-Llama3-V-2_5': (1, 5),
'InternVL-Chat-V1-5': (5, 2),
}
def __init__(self, dataset, **kwargs):
self.model_list = list(self.SUPPORTED_MODELS.keys())
model_name = kwargs['model']
if not listinstr(self.model_list, model_name):
raise AssertionError("{} doesn't support the evaluation on SlideVQA.".format(model_name))
super(SlideVQA, self).__init__(dataset)
self.is_api = True if listinstr(['GPT4'], model_name) else False
self.max_pages = 120
concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
self.concat_num = concat_num
self.column_num = column_num
def dump_image(self, origin_line):
os.makedirs(self.img_root, exist_ok=True)
line = origin_line.copy()
if not isinstance(line['image_path'], List):
line['image_path'] = [line['image_path']]
line['image_path'] = line['image_path'][:self.max_pages]
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(self.img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
if self.concat_num > 0 and not self.is_api:
concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)
old_tgt_path = tgt_path
assert isinstance(old_tgt_path, list)
if self.column_num != -1:
tgt_path = [
'_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
for i in range(len(concatenated_images))
]
else:
tgt_path = ['_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all.jpg']
for path, concatenated_image in zip(tgt_path, concatenated_images):
if not read_ok(path):
decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
num_images, image_size = len(old_tgt_path), concatenated_image.size
print('Concatenated {} images into a new image of size {}, saved at {}'.format(num_images, image_size, path))
return tgt_path
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
logger = get_logger('Evaluation')
model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
if osp.exists(storage):
logger.warning(f'GPT scoring file {storage} already exists, will reuse it in SlideVQA_eval. ')
else:
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = list()
for model, line in tqdm(tups):
res = MMLongBench_auxeval(model, line)
new_results.append(res)
log_map, res_map, pred_map = {}, {}, {}
all_inds = [line['index'] for line in lines]
for k, v in zip(all_inds, new_results):
log_map[k] = v['log']
res_map[k] = v['res']
pred_map[k] = v['pred']
data['res'] = [res_map[idx] for idx in data['index']]
data['log'] = [log_map[idx] for idx in data['index']]
data['pred'] = [pred_map[idx] for idx in data['index']]
dump(data, storage)
score = SlideVQA_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
dump(score, score_pth)
logger.info(f'SlideVQA successfully finished evaluating {eval_file}, results saved in {score_pth}')
logger.info('Score: ')
logger.info(score)
import huggingface_hub
from huggingface_hub import snapshot_download
from ..smp import *
from .video_concat_dataset import ConcatVideoDataset
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
import torchvision.transforms as T
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from decord import VideoReader, cpu
from .utils.tempcompass import *
FAIL_MSG = 'Failed to obtain answer via API.'
class TempCompass(ConcatVideoDataset):
def __init__(self, dataset='TempCompass', nframe=0, fps=-1):
self.DATASET_SETS[dataset] = ['TempCompass_MCQ', 'TempCompass_Captioning', 'TempCompass_YorN']
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
@classmethod
def supported_datasets(cls):
return ['TempCompass']
def evaluate(self, eval_file, **judge_kwargs):
result = super().evaluate(eval_file=eval_file, **judge_kwargs)
suffix = eval_file.split('.')[-1]
result = result.reset_index().rename(columns={'index': 'dim.task_type'})
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
avg_dict = {}
for idx, item in result.iterrows():
dim, task_type = item['dim.task_type'].split('. ')
if dim not in avg_dict:
avg_dict[dim] = {'success': 0.0, 'overall': 0.0}
if task_type not in avg_dict:
avg_dict[task_type] = {'success': 0.0, 'overall': 0.0}
if 'overall' not in avg_dict:
avg_dict['overall'] = {'success': 0.0, 'overall': 0.0}
avg_dict[dim]['success'] += item['success']
avg_dict[dim]['overall'] += item['overall']
avg_dict[task_type]['success'] += item['success']
avg_dict[task_type]['overall'] += item['overall']
avg_dict['overall']['success'] += item['success']
avg_dict['overall']['overall'] += item['overall']
result.loc[idx, 'acc'] = round(item['success'] / item['overall'] * 100, 2)
for key, value in avg_dict.items():
# Append the aggregated rows using .loc
result.loc[len(result)] = {
'dim.task_type': key,
'success': value['success'],
'overall': value['overall'],
'acc': round(value['success'] / value['overall'] * 100, 2)
}
dump(result, score_file)
return result
class TempCompass_MCQ(VideoBaseDataset):
MD5 = '7efbb9e6d9dabacd22daf274852691dd'
TYPE = 'Video-MCQ'
def __init__(self, dataset='TempCompass_MCQ', nframe=0, fps=-1):
self.type_data_list = {
'multi-choice': ('multi-choice.json', './videos', '.mp4'),
'caption_matching': ('caption_matching.json', './videos', '.mp4'),
}
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
@classmethod
def supported_datasets(cls):
return ['TempCompass_MCQ']
def prepare_dataset(self, dataset_name='TempCompass_MCQ', repo_id='lmms-lab/TempCompass'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if not osp.exists(data_file):
return False
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for idx, item in data.iterrows():
if not osp.exists(osp.join(pth, item['prefix'], item['video'] + item['suffix'])):
return False
return True
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def read_parquet(pth):
import pandas as pd
for task_name in self.type_data_list.keys():
if not osp.exists(osp.join(pth, f'{task_name}.json')):
data = pd.read_parquet(osp.join(pth, task_name, 'test-00000-of-00001.parquet'))
data.to_json(osp.join(pth, f'{task_name}.json'), orient='records', lines=False)
def unzip_videos(pth):
import zipfile
if not osp.exists(osp.join(pth, 'videos')):
zip_file = osp.join(pth, 'tempcompass_videos.zip')
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
zip_ref.extractall(pth)
def generate_tsv(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if osp.exists(data_file) and md5(data_file) == self.MD5:
return
self.data_list = []
for k, v in self.type_data_list.items():
with open(osp.join(pth, v[0]), 'r') as f:
json_data = json.load(f)
for data in json_data:
self.data_list.append({
'task_type': k,
'prefix': v[1],
'suffix': v[2],
'video': data['video_id'],
'question': data['question'].split('\n')[0],
'answer': data['answer'],
'dim': data['dim'],
'candidates': data['question'].split('\n')[1:],
})
data_df = pd.DataFrame(self.data_list)
data_df = data_df.assign(index=range(len(data_df)))
data_df.to_csv(data_file, sep='\t', index=False)
if modelscope_flag_set():
from modelscope import dataset_snapshot_download
dataset_path = dataset_snapshot_download(dataset_id=repo_id)
else:
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
read_parquet(dataset_path)
unzip_videos(dataset_path)
generate_tsv(dataset_path)
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
return dict(root=dataset_path, data_file=data_file)
def qa_template(self, data):
question = data['question'] + '\n' + '\n'.join(eval(data['candidates']))
answer = data['answer']
return question, answer
def save_video_frames(self, line):
vid_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
vid = decord.VideoReader(vid_path)
video_info = {
'fps': vid.get_avg_fps(),
'n_frames': len(vid),
}
if self.nframe > 0 and self.fps < 0:
step_size = len(vid) / (self.nframe + 1)
indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
frame_paths = self.frame_paths(line['video'])
elif self.fps > 0:
# not constrained by num_frames, get frames by fps
total_duration = video_info['n_frames'] / video_info['fps']
required_frames = int(total_duration * self.fps)
step_size = video_info['fps'] / self.fps
indices = [int(i * step_size) for i in range(required_frames)]
frame_paths = self.frame_paths_fps(line['video'], len(indices))
flag = np.all([osp.exists(p) for p in frame_paths])
if not flag:
images = [vid[i].asnumpy() for i in indices]
images = [Image.fromarray(arr) for arr in images]
for im, pth in zip(images, frame_paths):
if not osp.exists(pth):
im.save(pth)
return frame_paths
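# Descriptive note (added comment, not in the original source): save_video_frames expects
# exactly one of nframe (> 0, with fps < 0) or fps (> 0) to be set; if neither holds,
# `indices` and `frame_paths` are never assigned and the function fails with an
# UnboundLocalError.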
def save_video_into_images(self, line):
frame_paths = self.save_video_frames(line)
return frame_paths
def build_prompt(self, line, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
question, answer = self.qa_template(line)
message = []
message.append(dict(type='text', value=question))
video_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
if video_llm:
message.append(dict(type='video', value=video_path))
else:
img_frame_paths = self.save_video_into_images(line)
for im in img_frame_paths:
message.append(dict(type='image', value=im))
message.append(dict(type='text', value='\nPlease directly give the best option:'))
return message
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-1106', 'exact_matching']
judge_kwargs.update({
"max_tokens": 128,
"temperature": 1.0,
"top_p": 1,
"presence_penalty": 1,
})
suffix = eval_file.split('.')[-1]
score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(score_file):
data = load(eval_file)
if model != 'exact_matching':
model = build_judge(system_prompt=sys_prompt, **judge_kwargs)
else:
model = None
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
_ = track_progress_rich(
evaluate_tempcompass_mcq,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,
)
ans = load(tmp_file)
for idx, item in data.iterrows():
data.loc[idx, 'score'] = ans[idx]['rating']
dump(data, score_file)
rating = get_dimension_rating(score_file)
return rating
class TempCompass_Captioning(VideoBaseDataset):
MD5 = '35be9bf2581ea7767f02e9a8f37ae1ab'
TYPE = 'Video-VQA'
def __init__(self, dataset='TempCompass_Captioning', nframe=0, fps=-1):
self.type_data_list = {
'captioning': ('captioning.json', './videos', '.mp4'),
}
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
@classmethod
def supported_datasets(cls):
return ['TempCompass_Captioning']
def prepare_dataset(self, dataset_name='TempCompass_Captioning', repo_id='lmms-lab/TempCompass'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if not osp.exists(data_file):
return False
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for idx, item in data.iterrows():
if not osp.exists(osp.join(pth, item['prefix'], item['video'] + item['suffix'])):
return False
return True
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def read_parquet(pth):
import pandas as pd
for task_name in self.type_data_list.keys():
if not osp.exists(osp.join(pth, f'{task_name}.json')):
data = pd.read_parquet(osp.join(pth, task_name, 'test-00000-of-00001.parquet'))
data.to_json(osp.join(pth, f'{task_name}.json'), orient='records', lines=False)
def unzip_videos(pth):
import zipfile
if not osp.exists(osp.join(pth, 'videos')):
zip_file = osp.join(pth, 'tempcompass_videos.zip')
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
zip_ref.extractall(pth)
def generate_tsv(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if osp.exists(data_file) and md5(data_file) == self.MD5:
return
self.data_list = []
for k, v in self.type_data_list.items():
with open(osp.join(pth, v[0]), 'r') as f:
json_data = json.load(f)
for data in json_data:
self.data_list.append({
'task_type': k,
'prefix': v[1],
'suffix': v[2],
'video': data['video_id'],
'question': data['question'],
'answer': data['answer'],
'dim': data['dim'],
'mc_question': data['mc_question'],
'mc_answer': data['mc_answer'],
})
data_df = pd.DataFrame(self.data_list)
data_df = data_df.assign(index=range(len(data_df)))
data_df.to_csv(data_file, sep='\t', index=False)
if modelscope_flag_set():
from modelscope import dataset_snapshot_download
dataset_path = dataset_snapshot_download(dataset_id=repo_id)
else:
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
read_parquet(dataset_path)
unzip_videos(dataset_path)
generate_tsv(dataset_path)
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
return dict(root=dataset_path, data_file=data_file)
def qa_template(self, data):
question = data['question']
answer = data['answer']
return question, answer
def save_video_frames(self, line):
vid_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
vid = decord.VideoReader(vid_path)
video_info = {
'fps': vid.get_avg_fps(),
'n_frames': len(vid),
}
if self.nframe > 0 and self.fps < 0:
step_size = len(vid) / (self.nframe + 1)
indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
frame_paths = self.frame_paths(line['video'])
elif self.fps > 0:
# not constrained by num_frames, get frames by fps
total_duration = video_info['n_frames'] / video_info['fps']
required_frames = int(total_duration * self.fps)
step_size = video_info['fps'] / self.fps
indices = [int(i * step_size) for i in range(required_frames)]
frame_paths = self.frame_paths_fps(line['video'], len(indices))
flag = np.all([osp.exists(p) for p in frame_paths])
if not flag:
images = [vid[i].asnumpy() for i in indices]
images = [Image.fromarray(arr) for arr in images]
for im, pth in zip(images, frame_paths):
if not osp.exists(pth):
im.save(pth)
return frame_paths
def save_video_into_images(self, line):
frame_paths = self.save_video_frames(line)
return frame_paths
def build_prompt(self, line, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
question, answer = self.qa_template(line)
message = []
message.append(dict(type='text', value=question))
video_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
if video_llm:
message.append(dict(type='video', value=video_path))
else:
img_frame_paths = self.save_video_into_images(line)
for im in img_frame_paths:
message.append(dict(type='image', value=im))
return message
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-1106', 'exact_matching']
judge_kwargs.update({
"max_tokens": 128,
"temperature": 1.0,
"top_p": 1,
"presence_penalty": 1,
})
suffix = eval_file.split('.')[-1]
score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(score_file):
data = load(eval_file)
if model != 'exact_matching':
model = build_judge(system_prompt=sys_prompt, **judge_kwargs)
else:
model = None
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
_ = track_progress_rich(
evaluate_tempcompass_captioning,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,
)
ans = load(tmp_file)
for idx, item in data.iterrows():
data.loc[idx, 'score'] = ans[idx]['rating']
dump(data, score_file)
rating = get_dimension_rating(score_file)
return rating
class TempCompass_YorN(VideoBaseDataset):
MD5 = 'c72c046d7fa0e82c8cd7462f2e844ea8'
TYPE = 'Video-Y/N'
def __init__(self, dataset='TempCompass_YorN', nframe=0, fps=-1):
self.type_data_list = {
'yes_no': ('yes_no.json', './videos', '.mp4'),
}
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
@classmethod
def supported_datasets(cls):
return ['TempCompass_YorN']
def prepare_dataset(self, dataset_name='TempCompass_YorN', repo_id='lmms-lab/TempCompass'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if not osp.exists(data_file):
return False
if md5(data_file) != self.MD5:
return False
data = load(data_file)
for idx, item in data.iterrows():
if not osp.exists(osp.join(pth, item['prefix'], item['video'] + item['suffix'])):
return False
return True
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
else:
def read_parquet(pth):
import pandas as pd
for task_name in self.type_data_list.keys():
if not osp.exists(osp.join(pth, f'{task_name}.json')):
data = pd.read_parquet(osp.join(pth, task_name, 'test-00000-of-00001.parquet'))
data.to_json(osp.join(pth, f'{task_name}.json'), orient='records', lines=False)
def unzip_videos(pth):
import zipfile
if not osp.exists(osp.join(pth, 'videos')):
zip_file = osp.join(pth, 'tempcompass_videos.zip')
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
zip_ref.extractall(pth)
def generate_tsv(pth):
data_file = osp.join(pth, f'{dataset_name}.tsv')
if osp.exists(data_file) and md5(data_file) == self.MD5:
return
self.data_list = []
for k, v in self.type_data_list.items():
with open(osp.join(pth, v[0]), 'r') as f:
json_data = json.load(f)
for data in json_data:
self.data_list.append({
'task_type': k,
'prefix': v[1],
'suffix': v[2],
'video': data['video_id'],
'question': data['question'].split('\n')[0],
'answer': data['answer'],
'dim': data['dim']
})
data_df = pd.DataFrame(self.data_list)
data_df = data_df.assign(index=range(len(data_df)))
data_df.to_csv(data_file, sep='\t', index=False)
if modelscope_flag_set():
from modelscope import dataset_snapshot_download
dataset_path = dataset_snapshot_download(dataset_id=repo_id)
else:
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
read_parquet(dataset_path)
unzip_videos(dataset_path)
generate_tsv(dataset_path)
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
return dict(root=dataset_path, data_file=data_file)
def qa_template(self, data):
question = data['question']
answer = data['answer']
return question, answer
def save_video_frames(self, line):
vid_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
vid = decord.VideoReader(vid_path)
video_info = {
'fps': vid.get_avg_fps(),
'n_frames': len(vid),
}
if self.nframe > 0 and self.fps < 0:
step_size = len(vid) / (self.nframe + 1)
indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
frame_paths = self.frame_paths(line['video'])
elif self.fps > 0:
# not constrained by num_frames, get frames by fps
total_duration = video_info['n_frames'] / video_info['fps']
required_frames = int(total_duration * self.fps)
step_size = video_info['fps'] / self.fps
indices = [int(i * step_size) for i in range(required_frames)]
frame_paths = self.frame_paths_fps(line['video'], len(indices))
flag = np.all([osp.exists(p) for p in frame_paths])
if not flag:
images = [vid[i].asnumpy() for i in indices]
images = [Image.fromarray(arr) for arr in images]
for im, pth in zip(images, frame_paths):
if not osp.exists(pth):
im.save(pth)
return frame_paths
def save_video_into_images(self, line):
frame_paths = self.save_video_frames(line)
return frame_paths
def build_prompt(self, line, video_llm):
if isinstance(line, int):
assert line < len(self)
line = self.data.iloc[line]
question, answer = self.qa_template(line)
message = []
message.append(dict(type='text', value=question))
video_path = osp.join(self.data_root, line['prefix'], line['video'] + line['suffix'])
if video_llm:
message.append(dict(type='video', value=video_path))
else:
img_frame_paths = self.save_video_into_images(line)
for im in img_frame_paths:
message.append(dict(type='image', value=im))
message.append(dict(type='text', value='\nPlease answer yes or no:'))
return message
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-1106', 'exact_matching']
judge_kwargs.update({
"max_tokens": 128,
"temperature": 1.0,
"top_p": 1,
"presence_penalty": 1,
})
suffix = eval_file.split('.')[-1]
score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
nproc = judge_kwargs.pop('nproc', 4)
if not osp.exists(score_file):
data = load(eval_file)
if model != 'exact_matching':
model = build_judge(system_prompt=sys_prompt, **judge_kwargs)
else:
model = None
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
_ = track_progress_rich(
evaluate_tempcompass_YorN,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,
)
ans = load(tmp_file)
for idx, item in data.iterrows():
data.loc[idx, 'score'] = ans[idx]['rating']
dump(data, score_file)
rating = get_dimension_rating(score_file)
return rating
from abc import abstractmethod
from ..smp import *
class TextBaseDataset:
MODALITY = 'TEXT'
DATASET_URL = {}
DATASET_MD5 = {}
def __init__(self, dataset='MMBench', **kwargs):
self.dataset_name = dataset
data = self.load_data(dataset)
data['index'] = [str(x) for x in data['index']]
if np.all([istype(x, int) for x in data['index']]):
data['index'] = [int(x) for x in data['index']]
self.data = data
self.post_build(dataset)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
return dict(self.data.iloc[idx])
def prepare_tsv(self, url, file_md5=None):
data_root = LMUDataRoot()
os.makedirs(data_root, exist_ok=True)
update_flag = False
file_name = url.split('/')[-1]
data_path = osp.join(data_root, file_name)
if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
pass
else:
warnings.warn('The dataset tsv is missing or outdated, re-downloading it now')
download_file(url, data_path)
update_flag = True
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
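# Descriptive note (added comment, not in the original source): prepare_tsv caches the
# benchmark tsv under LMUDataRoot(), re-downloads it whenever the file is missing or its
# md5 does not match, and for files larger than 1 GB additionally builds a *_local.tsv
# copy via LOCALIZE before loading.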
def dump_image(self, line):
return []
def display(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
assert isinstance(line, pd.Series) or isinstance(line, dict)
mmqa_display(line)
# Return a list of dataset names that are supported by this class, can override
@classmethod
def supported_datasets(cls):
return list(cls.DATASET_URL)
# Given the dataset name, return the dataset as a pandas dataframe, can override
def load_data(self, dataset):
url = self.DATASET_URL[dataset]
file_md5 = self.DATASET_MD5[dataset]
return self.prepare_tsv(url, file_md5)
# Post-build hook, called after the dataset is built; can be overridden
def post_build(self, dataset):
pass
# Given one data record, return the built prompt (a multi-modal message), can override
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
question = line['question']
msgs = []
msgs.append(dict(type='text', value=question))
return msgs
# Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
@abstractmethod
def evaluate(self, eval_file, **judge_kwargs):
pass
from .text_base import TextBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..smp import *
class TextMCQDataset(TextBaseDataset):
TYPE = 'MCQ'
DATASET_URL = {}
DATASET_MD5 = {}
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
prompt += 'Please select the correct answer from the options above. \n'
msgs = []
msgs.append(dict(type='text', value=prompt))
return msgs
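# Illustrative rendering of the prompt built above (hypothetical record, not from a real tsv):
#   Hint: <hint, only when present>
#   Question: Which planet is the largest?
#   Options:
#   A. Earth
#   B. Jupiter
#   Please select the correct answer from the options above.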
def evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import report_acc, report_acc_MMT, mcq_circular_eval, mcq_vanilla_eval
# assert dataset is not None
dataset_map = {
'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11',
'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11'
}
dataset = self.dataset_name
if dataset in dataset_map:
dataset = dataset_map[dataset]
nproc = judge_kwargs.pop('nproc', 4)
circular = False
suffix = eval_file.split('.')[-1]
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
# Keep choice-label columns (A, B, ...) as-is; lower-case all other column names
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
meta = self.data
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
)
if circular:
data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name)
else:
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
# load split
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
# May have different report acc functions for different datasets
if 'MMT' in dataset:
acc = report_acc_MMT(data)
else:
acc = report_acc(data)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(acc, score_file)
return acc
class CustomTextMCQDataset(TextMCQDataset):
def load_data(self, dataset):
data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
from .judge_util import build_judge, DEBUG_MESSAGE
from .multiple_choice import extract_answer_from_item, prefetch_answer
from .vqa_eval import levenshtein_distance
__all__ = [
'build_judge', 'extract_answer_from_item', 'prefetch_answer',
'levenshtein_distance', 'DEBUG_MESSAGE'
]
import json
import argparse
from collections import defaultdict
def is_correct(predict, answer):
# predict is the ground-truth answer; answer is the model prediction
if len(answer) == 1:
return answer[0] == predict[0]
elif len(answer) != 1 and answer[0] in ['A', 'B', 'C', 'D']:
return answer[0] == predict[0]
elif len(answer) != 1 and answer[0] not in ['A', 'B', 'C', 'D']:
return predict[4:].lower() in answer.lower()
from ...smp import *
import os
def report_acc_hrbench(df):
cycle_group = df.groupby('cycle_category')
result_dic = defaultdict(list)
avg_dic = defaultdict(int)
count = 0
for key, data_value in cycle_group:
count += 1
_, resp_dic = hrbench_score(data_value)
for task_type, accuracy in resp_dic.items():
result_dic['cycle'].append(key)
result_dic['type'].append(task_type)
result_dic['accuracy'].append(accuracy)
avg_dic[task_type] += accuracy
for task_type, accuracy in avg_dic.items():
result_dic['cycle'].append('Average')
result_dic['type'].append(task_type)
result_dic['accuracy'].append(accuracy / count)
result_pd = pd.DataFrame(result_dic)
return result_pd
def hrbench_score(data):
ret = defaultdict(list)
resp_dic = {}
category_list = set(data['category'])
score_dict = defaultdict(list)
for i in range(len(data)):
d = data.iloc[i]
category = d['category']
gpt_score = d['hit']
score_dict[category].append(gpt_score)
score_dict['all'].append(gpt_score)
all_acc = np.mean(score_dict['all'])
ret['type'].append('all')
ret['acc'].append(all_acc)
resp_dic['all'] = all_acc
for cate in category_list:
acc = np.mean(score_dict[cate])
ret['type'].append(cate)
ret['acc'].append(acc)
resp_dic[cate] = acc
return pd.DataFrame(ret), resp_dic
import os
from ...smp import load_env
INTERNAL = os.environ.get('INTERNAL', 0)
def build_judge(**kwargs):
from ...api import OpenAIWrapper, SiliconFlowAPI
model = kwargs.pop('model', None)
kwargs.pop('nproc', None)
load_env()
LOCAL_LLM = os.environ.get('LOCAL_LLM', None)
if LOCAL_LLM is None:
model_map = {
'gpt-4-turbo': 'gpt-4-1106-preview',
'gpt-4-0613': 'gpt-4-0613',
'gpt-4-0125': 'gpt-4-0125-preview',
'gpt-4-0409': 'gpt-4-turbo-2024-04-09',
'chatgpt-1106': 'gpt-3.5-turbo-1106',
'chatgpt-0125': 'gpt-3.5-turbo-0125',
'gpt-4o': 'gpt-4o-2024-05-13',
'gpt-4o-0806': 'gpt-4o-2024-08-06',
'gpt-4o-mini': 'gpt-4o-mini-2024-07-18',
'qwen-7b': 'Qwen/Qwen2.5-7B-Instruct',
'qwen-72b': 'Qwen/Qwen2.5-72B-Instruct',
'deepseek': 'deepseek-ai/DeepSeek-V2.5',
}
model_version = model_map[model]
else:
model_version = LOCAL_LLM
if model in ['qwen-7b', 'qwen-72b', 'deepseek']:
model = SiliconFlowAPI(model_version, **kwargs)
else:
model = OpenAIWrapper(model_version, **kwargs)
return model
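# Illustrative usage (a minimal sketch, not part of the original source; assumes an
# OPENAI_API_KEY is available in the environment loaded by load_env()):
#   judge = build_judge(model='gpt-4o-mini', temperature=0, max_tokens=128)
#   review = judge.generate('Rate the answer from 1 to 10.')
# judge.generate(prompt) is the same call pattern used by get_eval in the evaluation utilities.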
DEBUG_MESSAGE = """
To debug the OpenAI API, you can try the following script in Python:
```python
from vlmeval.api import OpenAIWrapper
model = OpenAIWrapper('gpt-4o', verbose=True)
msgs = [dict(type='text', value='Hello!')]
code, answer, resp = model.generate_inner(msgs)
print(code, answer, resp)
```
You can see the specific error if the API call fails.
"""
import numpy as np
import pandas as pd
from ...smp import *
rule_dict = {
'llava_bench_conv': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'}, # noqa: E501
'llava_bench_detail': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'}, # noqa: E501
'llava_bench_complex': {'role': 'Assistant', 'prompt': 'We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.'} # noqa: E501
}
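# parse_score expects the judge reply to begin with a line holding exactly two
# numbers (the scores for Assistant 1 and Assistant 2); anything else maps to
# [-1, -1] so the sample can be filtered out downstream. Illustrative sketch:
#   parse_score('8 7\nBoth answers are helpful ...')  # -> [8.0, 7.0]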
def get_eval(judge, content):
return judge.generate(content)
def parse_score(review):
logger = get_logger('Evaluation')
try:
score_pair = review.split('\n')[0]
score_pair = score_pair.replace(',', ' ')
sp = score_pair.split(' ')
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
            logger.error(f'Failed to parse two scores from review: {review}')
return [-1, -1]
except Exception as e:
        logger.error(f'{e}: failed to parse scores from review: {review}')
return [-1, -1]
def build_prompt(line):
cap_str = line['caption']
question = line['question']
ans1 = line['gpt4_ans']
ans2 = line['prediction']
category = 'llava_bench_' + line['category']
rule = rule_dict[category]
role, prompt = rule['role'], rule['prompt']
content = (f'[Context]\n{cap_str}\n\n'
f'[Question]\n{question}\n\n'
f'[{role} 1]\n{ans1}\n\n[End of {role} 1]\n\n'
f'[{role} 2]\n{ans2}\n\n[End of {role} 2]\n\n'
f'[System]\n{prompt}\n\n')
return content
def LLaVABench_atomeval(model, prompt):
review = get_eval(model, prompt)
scores = parse_score(review)
return scores
def LLaVABench_score(data):
cates = ['overall'] + list(set(data['category']))
ret = defaultdict(list)
for c in cates:
ret['split'].append(c)
sub = data[data['category'] == c] if c != 'overall' else data
ret['Relative Score (main)'].append(np.mean(sub['score']) / np.mean(sub['gpt4_score']) * 100)
ret['VLM Score'].append(np.mean(sub['score']) * 10)
ret['GPT4 Score'].append(np.mean(sub['gpt4_score']) * 10)
return pd.DataFrame(ret)
from ...smp import *
from .multiple_choice import extract_answer_from_item
import numpy as np
import re
FAIL_MSG = 'Failed to obtain answer via API.'
DURATIONS = [15, 60, 600, 3600]
TASK_CATEGORIES = [
"S2E", "S2O", "S2A",
"E2O", "O2E", "T2E",
"T2O", "T2A", "E3E",
"O3O", "SSS", "SOS",
"SAA", "T3E", "T3O",
"TOS", "TAA"
]
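# get_dimension_rating aggregates per-sample scores into mean accuracy per
# duration group and per task category; scores below 0 (failed judgements) are
# excluded from the averages. Sketch of the returned structure (values
# illustrative):
#   {15: {'overall': '0.612', 'question_category': {'S2E': '0.700', ...}},
#    ..., 'overall': {...}}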
def get_dimension_rating(data_path):
data = load(data_path)
print(data.iloc[0])
duration_rating = {k: {} for k in DURATIONS}
for duration in DURATIONS + ['overall']:
duration_rating[duration] = {
'overall': '',
'question_category': {k: [] for k in TASK_CATEGORIES}
}
for i in range(len(data)):
task_ctg = data.iloc[i]['question_category']
duration = data.iloc[i]['duration_group']
duration_rating[duration]['question_category'][task_ctg].append(data.iloc[i]['score'])
duration_rating['overall']['question_category'][task_ctg].append(data.iloc[i]['score'])
for duration in DURATIONS + ['overall']:
overall_res_dur = f'{np.mean([x for x in sum(duration_rating[duration]["question_category"].values(), []) if x >= 0]):.3f}' # noqa: E501
duration_rating[duration]['overall'] = overall_res_dur
for task_ctg in TASK_CATEGORIES:
task_res_dur = f'{np.mean([x for x in duration_rating[duration]["question_category"][task_ctg] if x >= 0]):.3f}' # noqa: E501
duration_rating[duration]['question_category'][task_ctg] = task_res_dur
return duration_rating
def extract_option(model, input_item, dataset_name):
options = input_item['question'].split('\n')[1:]
for id, option in enumerate(options):
option_id = chr(ord('A') + id) + '.'
if option.find(option_id) >= 0:
input_item[chr(ord('A') + id)] = option[option.find(option_id) + len(option_id):].strip('. \n')
return extract_answer_from_item(model, input_item, dataset_name)['opt']
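# extract_characters_regex strips common answer prefixes and returns the first
# A-E letter found; long free-form answers containing no option letter yield
# an empty string. Illustrative sketch:
#   extract_characters_regex('The best answer is (C) because ...')  # -> 'C'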
def extract_characters_regex(s):
s = s.strip()
answer_prefixes = [
'The best answer is',
'The correct answer is',
'The answer is',
'The answer',
        'The best option is',
        'The correct option is',
        'Best answer:',
        'Best option:',
'Answer:',
'Option:',
]
for answer_prefix in answer_prefixes:
s = s.replace(answer_prefix, '')
if len(s.split()) > 10 and not re.search('[ABCDE]', s):
return ''
matches = re.search(r'[ABCDE]', s)
if matches is None:
return ''
return matches[0]
from ...smp import *
from ...utils import can_infer
try:
from latex2sympy2 import latex2sympy
except Exception as e:
logging.critical(f'{type(e)}: {e}')
logging.critical('Please install latex2sympy2 by running "pip install latex2sympy2"')
FAIL_MSG = 'Failed to obtain answer via API.'
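# is_equal is the tolerant answer comparison used for MATH-V: exact
# (case-insensitive) string match first, then numeric comparison via eval,
# then symbolic comparison via latex2sympy; any parsing failure falls through
# to False. For example, '0.50' vs. '1/2' and '\\frac{1}{2}' vs. '0.5' should
# both compare equal under this scheme.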
def is_equal(asw: str, gt_asw: str) -> bool:
    if not isinstance(asw, str) or not isinstance(gt_asw, str):
print('Warning: input is not string')
print(asw, gt_asw)
asw = str(asw).lower().strip()
gt_asw = str(gt_asw).lower().strip()
if gt_asw == asw:
return True
try:
a = eval(gt_asw)
b = eval(asw)
if abs(a - b) < 1e-6:
return True
except:
pass
try:
a = latex2sympy(gt_asw)
b = latex2sympy(asw)
if abs(eval(str(a)) - eval(str(b))) < 1e-6:
return True
if abs(a - b) < 1e-6:
return True
except:
pass
return False
def get_gpt4_ICE():
example_1 = """
Hint: Please answer the question and provide the final answer at the end.\n
Question: Which number is missing?\n
Model response: The number missing in the sequence is 14.\n
Extracted answer: 14
"""
example_2 = """
Hint: Please answer the question and provide the final answer at the end.\n
Question: What is the fraction of females facing the camera?\n
Model response: The fraction of females facing the camera is 0.6,
which means that six out of ten females in the group are facing the camera.\n
Extracted answer: 0.6
"""
example_3 = """
Hint: Please answer the question and provide the final answer at the end.\n
Question: How much money does Luca need to buy a sour apple candy and a butter-scotch candy? (Unit: $)\n
Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.\n
Extracted answer: 1.45
"""
example_4 = """
Hint: Please answer the question and provide the final answer at the end.\n
Question: Between which two years does the line graph saw its maximum peak?\n
Model response: The line graph saw its maximum peak between 2007 and 2008.\n
Extracted answer: [2007, 2008]
"""
example_5 = """
Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\n
Question: What fraction of the shape is blue?\n
Choices: (A) 3/11 (B) 8/11 (C) 6/11 (D) 3/5\n
Model response: The correct answer is (B) 8/11.\n
Extracted answer: B
"""
return [example_1, example_2, example_3, example_4, example_5]
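# build_mathv_gpt4_prompt assembles a few-shot answer-extraction prompt: the
# task description, the five in-context examples above, then the current
# question and model response, ending with 'Extracted answer:' so the judge
# only has to complete the final answer.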
def build_mathv_gpt4_prompt(line):
task_description = """
Please read the following example.
Then extract the answer from the model response and type it at the end of the prompt.\n
"""
question = line['question']
prediction = str(line['prediction'])
prompt = task_description
examples = get_gpt4_ICE()
for example in examples:
prompt += example + '\n'
prompt += question + '\n'
    prompt += 'Model response: ' + prediction + '\n'
    prompt += 'Extracted answer:'
return prompt
def list_to_dict(lst):
return {chr(65 + i): val for i, val in enumerate(lst)}
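# post_check is used in two modes: with prefetch=True it tries to read the
# answer directly off the raw prediction via can_infer (so the GPT extraction
# step can be skipped when rule-based matching already succeeds); with
# prefetch=False it grades the extracted 'res' field against the ground truth
# using is_equal.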
def post_check(line, prefetch=False):
res = None
ans = line['answer']
response = line['prediction'] if prefetch else line['res']
try:
if len(eval(line['choices'])) > 0:
ans = line['answer']
choices = list_to_dict(eval(line['choices']))
res = can_infer(response, choices)
if prefetch:
return res
else:
res = str(response)
ans = str(ans)
except ValueError:
pass
if is_equal(res, ans):
return res if prefetch else True
else:
return False
def MATH_V_auxeval(model, line):
prompt = build_mathv_gpt4_prompt(line)
log = ''
retry = 5
if post_check(line, prefetch=True):
res = post_check(line, prefetch=True)
return dict(log='Prefetch succeed', res=res)
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
if FAIL_MSG in res:
log += f'Try {i}: output is {prediction}, failed to parse.\n'
else:
log += 'Succeed'
return dict(log=log, res=res)
log += 'All 5 retries failed.\n'
return dict(log=log, res='')
def MATH_V_acc(result_file):
data = load(result_file)
tot = defaultdict(lambda: 0)
fetch = defaultdict(lambda: 0)
hit = defaultdict(lambda: 0)
lt = len(data)
for i in range(lt):
item = data.iloc[i]
cate = item['category']
tot['Overall'] += 1
tot[cate] += 1
if item['log'] == 'Prefetch succeed':
fetch['Overall'] += 1
fetch[cate] += 1
if post_check(item, prefetch=False):
hit['Overall'] += 1
hit[cate] += 1
res = defaultdict(list)
for k in tot.keys():
res['Subject'].append(k)
res['tot'].append(tot[k])
res['prefetch'].append(fetch[k])
res['hit'].append(hit[k])
res['prefetch_rate'].append(fetch[k] / tot[k] * 100)
res['acc'].append(hit[k] / tot[k] * 100)
res = pd.DataFrame(res).sort_values('Subject', ignore_index=True)
return res
from ...smp import *
from ...utils import can_infer
FAIL_MSG = 'Failed to obtain answer via API.'
def get_gpt4_extract_ICE():
example_1 = """
1.
Model response: 'Rounded to two decimal places, the perimeter of the sector is approximately:\n\n(-2, 1)'
Extracted Answer: (-2, 1)
""" # noqa
example_2 = """
2.
Model response: 'at those points.\n\nTherefore, the correct option that represents the meaning of the intersection points of the graphs is:\n\nD. They give the solutions to the equation $f(t)=g(t)$.",'
Extracted Answer: D
""" # noqa
example_3 = """
3.
Model response: ' at 1 (there's a closed circle at y = 1), the range in interval notation is \\((-4, 1]\\).\n\nFinal values:\nDomain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)'
Extracted Answer: Domain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)
""" # noqa
example_4 = """
4.
Model response: 'As it stands, I cannot provide the correct option letter because there isn't enough information to solve for 'y'.'
Extracted Answer: null
""" # noqa
example_5 = """
5.
Model response: 'Given that AB = 17.6 meters, we can now substitute into the equation:\n\nd = 17.6 / cos(38\u00b0)\n\nTherefore, to one decimal place, the distance d between Ned and Bart is approximately 22.3 meters.'
Extracted answer: 22.3
""" # noqa
example_6 = """
6.
Model response: have all the coefficients for the quadratic function:\n\\( f(x) = ax^2 + bx + c \\)\n\\( f(x) = -1x^2 - 2x + 1 \\)\n\nTherefore, the equation for the graphed function \\( f \\) is:\n\\( f(x) = -x^2 - 2x + 1 \\)"'
Extracted answer: f(x) = -x^2 - 2x + 1
""" # noqa
return [example_1, example_2, example_3, example_4, example_5, example_6]
def get_gpt4_score_ICE():
example_1 = """
[Question]: Write the set of numbers represented on the number line in interval notation.
[Standard Answer]: (-2,1]
[Model_answer] : Extracted Answer: \\((-2, 1)\\)
Judgement: 0
""" # noqa
example_2 = """
[Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
[Standard Answer]: C
[Model_answer] : B:2\u221a{{3}}
Judgement: 0
""" # noqa
example_3 = """
[Question]: Find the domain and range of the function f using interval notation.
[Standard Answer]: domain: [-4, 0) and range: (-3, 1]
[Model_answer] : Range: \\((-4, 1]\\)
Judgement: 0
""" # noqa
example_4 = """
[Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
[Standard Answer]: C
[Model_answer] : null
Judgement: 0
""" # noqa
return [example_1, example_2, example_3, example_4]
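# MathVerse evaluation runs in two judge passes: MathVerse_auxeval_extract
# pulls a clean 'Extracted Answer' out of the raw prediction using the
# extraction ICEs above, and MathVerse_auxeval_score then asks the judge for a
# binary consistency judgement (0/1) between that extraction and the standard
# answer, using the scoring ICEs.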
def build_mathverse_gpt4_extract_prompt(line):
task_description = """
I am providing you a response from a model to a math problem, termed 'Model Response'. You should extract the answer from the response as 'Extracted Answer'. Directly output the extracted answer with no explanation.\n\n
""" # noqa
prediction = str(line['prediction'])
demo_prompt = task_description
examples = get_gpt4_extract_ICE()
for example in examples:
demo_prompt += example + '\n\n'
test_prompt = f"Model response: '{prediction}'\nExtracted Answer: "
full_prompt = f'{demo_prompt}7.\n{test_prompt}'
return full_prompt
def build_mathverse_gpt4_score_prompt(line):
task_description = """
Below are two answers to a math question. Question is [Question], [Standard Answer] is the standard answer to the question, and [Model_answer] is the answer extracted from a model's output to this question. Determine whether these two answers are consistent.
Please note that the two answers are considered consistent only when the [Model_answer] completely matches the [Standard Answer]. For non-multiple-choice questions, if the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
If they are consistent, Judgement is 1; if they are different, Judgement is 0.\n\n
""" # noqa
question_for_eval = line['question_for_eval']
extract = line['extract']
answer = line['answer']
demo_prompt = task_description
examples = get_gpt4_score_ICE()
for example in examples:
demo_prompt += example + '\n\n'
test_prompt = f"""
[Question]: {question_for_eval}
[Standard Answer]: {answer}
[Model_answer] : {extract}
Judgement:"""
full_prompt = f'{demo_prompt}{test_prompt}'
return full_prompt
def post_check_score(line, prefetch=False):
ans = str(line['answer']).strip()
response = str(line['extract']).strip()
if response == ans:
return response if prefetch else True
else:
return False
def MathVerse_auxeval_extract(model, line):
prompt = build_mathverse_gpt4_extract_prompt(line)
log = ''
retry = 5
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
if FAIL_MSG in res:
log += f'Try {i}: output is {prediction}, failed to parse.\n'
else:
log += 'Succeed'
return dict(log_extract=log, extract=res)
log += 'All 5 retries failed.\n'
return dict(log_extract=log, extract='')
def MathVerse_auxeval_score(model, line):
prompt = build_mathverse_gpt4_score_prompt(line)
log = ''
retry = 5
    if post_check_score(line, prefetch=True):
        return dict(log_score='Prefetch succeed', score=True)
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
if FAIL_MSG in res or res.strip() not in ['0', '1']:
log += f'Try {i}: output is {prediction}, res is {res}, failed to parse.\n'
else:
log += 'Succeed'
return dict(log_score=log, score=int(res) == 1)
log += 'All 5 retries failed.\n'
return dict(log_score=log, score=False)
def MathVerse_acc(result_file):
df = load(result_file)
df['metadata'] = df['metadata'].apply(lambda x: x.replace("'", '"'))
df['metadata'] = df['metadata'].apply(json.loads)
df_metadata = pd.json_normalize(df['metadata'])
df = pd.concat([df.drop('metadata', axis=1), df_metadata], axis=1)
subset = list(set(df['problem_version']))
res = defaultdict(list)
for p in subset:
if p != 'Overall':
sub = df[df['problem_version'] == p]
else:
sub = cp.deepcopy(df)
res['split'].append(p)
# Overall Acc
res['Overall'].append(np.mean(sub['score']) * 100)
# Subject
subjects = set(df['subject'])
for k in subjects:
res[k].append(np.mean(sub[sub['subject'] == k]['score']) * 100)
# Subfield
subfields = set(df['subfield'])
for k in subfields:
res[k].append(np.mean(sub[sub['subfield'] == k]['score']) * 100)
return pd.DataFrame(res)
from ...smp import *
from ...utils import can_infer
FAIL_MSG = 'Failed to obtain answer via API.'
def get_gpt4_ICE():
example_1 = """
Hint: Please answer the question requiring an integer answer and provide the final value,
e.g., 1, 2, 3, at the end.\n
Question: Which number is missing?\n
Model response: The number missing in the sequence is 14.\n
Extracted answer: 14
"""
example_2 = """
Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value,
e.g., 1.2, 1.3, 1.4, at the end.\n
Question: What is the fraction of females facing the camera?\n
Model response: The fraction of females facing the camera is 0.6,
which means that six out of ten females in the group are facing the camera.\n
Extracted answer: 0.6
"""
example_3 = """
Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value,
e.g., 1.23, 1.34, 1.45, at the end.\n
Question: How much money does Luca need to buy a sour apple candy and a butter-scotch candy? (Unit: $)\n
Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.\n
Extracted answer: 1.45
"""
example_4 = """
Hint: Please answer the question requiring a Python list as an answer and provide the final list,
e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end.\n
Question: Between which two years does the line graph saw its maximum peak?\n
Model response: The line graph saw its maximum peak between 2007 and 2008.\n
Extracted answer: [2007, 2008]
"""
example_5 = """
Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\n
Question: What fraction of the shape is blue?\n
Choices: (A) 3/11 (B) 8/11 (C) 6/11 (D) 3/5\n
Model response: The correct answer is (B) 8/11.\n
Extracted answer: B
"""
return [example_1, example_2, example_3, example_4, example_5]
def build_mathvista_gpt4_prompt(line):
task_description = """
Please read the following example.
Then extract the answer from the model response and type it at the end of the prompt.\n
"""
question = line['question']
prediction = str(line['prediction'])
prompt = task_description
examples = get_gpt4_ICE()
for example in examples:
prompt += example + '\n'
prompt += question + '\n'
    prompt += 'Model response: ' + prediction + '\n'
    prompt += 'Extracted answer:'
return prompt
def list_to_dict(lst):
return {chr(65 + i): val for i, val in enumerate(lst)}
def post_check(line, prefetch=False):
res = None
ans = line['answer']
response = line['prediction'] if prefetch else line['res']
try:
if line['question_type'] == 'multi_choice':
ans = line['answer_option']
choices = list_to_dict(eval(line['choices']))
res = can_infer(response, choices)
if prefetch:
return res
else:
if line['answer_type'] == 'integer':
res = int(response)
ans = int(line['answer'])
elif line['answer_type'] == 'float':
res = float(response)
ans = float(line['answer'])
else:
                res = str(response)
ans = str(ans)
except ValueError:
pass
if res == ans:
return res if prefetch else True
else:
return False
def MathVista_auxeval(model, line):
prompt = build_mathvista_gpt4_prompt(line)
log = ''
retry = 5
if post_check(line, prefetch=True):
res = post_check(line, prefetch=True)
return dict(log='Prefetch succeed', res=res)
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
if FAIL_MSG in res:
log += f'Try {i}: output is {prediction}, failed to parse.\n'
else:
log += 'Succeed'
return dict(log=log, res=res)
log += 'All 5 retries failed.\n'
return dict(log=log, res='')
def MathVista_acc(result_file):
data = load(result_file)
tot = defaultdict(lambda: 0)
fetch = defaultdict(lambda: 0)
hit = defaultdict(lambda: 0)
lt = len(data)
skill_list = []
for i in range(lt):
item = data.iloc[i]
cate = item['task']
tot['Overall'] += 1
try:
skills = eval(item['skills'])
except SyntaxError:
skills = [item['skills']]
for skill in skills:
if skill not in skill_list:
skill_list.append(skill)
tot[skill] += 1
tot[cate] += 1
if item['log'] == 'Prefetch succeed':
fetch['Overall'] += 1
fetch[cate] += 1
for skill in skills:
fetch[skill] += 1
if post_check(item, prefetch=False):
hit['Overall'] += 1
hit[cate] += 1
for skill in skills:
hit[skill] += 1
res = defaultdict(list)
for k in tot.keys():
res['Task&Skill'].append(k)
res['tot'].append(tot[k])
res['prefetch'].append(fetch[k])
res['hit'].append(hit[k])
res['prefetch_rate'].append(fetch[k] / tot[k] * 100)
res['acc'].append(hit[k] / tot[k] * 100)
res = pd.DataFrame(res)
return res
from ...smp import *
from .multiple_choice import extract_answer_from_item
from PIL import Image, ImageOps
import numpy as np
FAIL_MSG = 'Failed to obtain answer via API.'
system_prompt_sub_scene = """
##TASK DESCRIPTION:
You are required to evaluate a respondent's answer based on a provided question, some scoring points, and the respondent's answer. You should provide two scores. The first is the accuracy score, which should range from 1 to 5. The second is the relevance score, which should also range from 1 to 5. Below are the criteria for each scoring category.
##ACCURACY Scoring Criteria:
Evaluate the respondent's answer against specific scoring points as follows:
Score 1: The response completely misses the scoring point.
Score 3: The response mentions content related to the scoring point but is not entirely correct.
Score 5: The response accurately addresses the scoring point.
Calculate the average score across all scoring points to determine the final accuracy score.
##RELEVANCE Scoring Criteria:
Assess how the respondent's answer relates to the original question:
Score 1: The response is completely off-topic from the question.
Score 2: The response is partially related to the question but contains a significant amount of irrelevant content.
Score 3: The response primarily addresses the question, but the respondent seems uncertain about their own answer.
Score 4: The response mostly addresses the question and the respondent appears confident in their answer.
Score 5: The response is fully focused on addressing the question with no irrelevant content and demonstrates complete certainty.
----
##INSTRUCTION:
1. Evaluate Accuracy: First, assess and score each scoring point based on the respondent's answer. Calculate the average of these scores to establish the final accuracy score. Provide a detailed rationale before assigning your score.
2. Evaluate RELEVANCE: Assess the relevance of the respondent’s answer to the question. Note that when evaluating relevance, the correctness of the answer is not considered; focus solely on how relevant the answer is to the question. Provide a comprehensive rationale before assigning your score.
3. Output Scores in JSON Format: Present the scores in JSON format as follows:
{'score_accuracy': score_acc, 'score_relevance': score_rele, 'total_score': score_acc + score_rele}
""" # noqa
system_prompt_summary = """
##TASK DESCRIPTION:
You are required to evaluate the performance of the respondent in the video summarization task based on the standard answer and the respondent's answer. You should provide two scores. The first is the COMPLETENESS score, which should range from 1 to 5. The second is the RELIABILITY score, which should also range from 1 to 5. Below are the criteria for each scoring category:
##COMPLETENESS Scoring Criteria:
The completeness score focuses on whether the summary covers all key points and main information from the video.
Score 1: The summary hardly covers any of the main content or key points of the video.
Score 2: The summary covers some of the main content and key points but misses many.
Score 3: The summary covers most of the main content and key points.
Score 4: The summary is very comprehensive, covering most to nearly all of the main content and key points.
Score 5: The summary completely covers all the main content and key points of the video.
##RELIABILITY Scoring Criteria:
The reliability score evaluates the correctness and clarity of the video summary. It checks for factual errors, misleading statements, and contradictions with the video content. If the respondent's answer includes details that are not present in the standard answer, as long as these details do not conflict with the correct answer and are reasonable, points should not be deducted.
Score 1: Contains multiple factual errors and contradictions; presentation is confusing.
Score 2: Includes several errors and some contradictions; needs clearer presentation.
Score 3: Generally accurate with minor errors; minimal contradictions; reasonably clear presentation.
Score 4: Very accurate with negligible inaccuracies; no contradictions; clear and fluent presentation.
Score 5: Completely accurate with no errors or contradictions; presentation is clear and easy to understand.
----
##INSTRUCTION:
1. Evaluate COMPLETENESS: First, analyze the respondent's answer according to the scoring criteria, then provide an integer score between 1 and 5 based on sufficient evidence.
2. Evaluate RELIABILITY: First, analyze the respondent's answer according to the scoring criteria, then provide an integer score between 1 and 5 based on sufficient evidence.
3. Output Scores in JSON Format: Present the scores in JSON format as follows:
{'score_completeness': score_comp, 'score_reliability': score_reli, 'total_score': score_comp + score_reli}
""" # noqa
def check_ans_with_model(pred, gt, model, item, dataset_name='MLVU_MCQ'):
flag = False
index = gt.index("(") # noqa
index2 = gt.index(")") # noqa
gt_option = gt[index + 1: index2]
if ")" in pred:
index3 = pred.index(")")
pred = pred[index3 - 1: index3]
if pred == gt_option:
flag = True
elif extract_answer_from_item(model, item, dataset_name)['opt'] == item['answer']:
flag = True
return flag
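# The open-ended judges are instructed (see the system prompts above) to end
# with a JSON dict of scores; extract_scores_summary / extract_scores_sub_scene
# recover the numbers by plain substring search rather than json.loads, so
# single quotes and surrounding prose are tolerated. Illustrative sketch:
#   extract_scores_summary("{'score_completeness': 4, 'score_reliability': 5, 'total_score': 9}")
#   # -> [4.0, 5.0]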
def extract_scores_summary(text):
# Define the keys to locate in the text
keys = ["score_completeness", "score_reliability"]
scores = []
for key in keys:
# Find the index where each key starts
start_index = text.find(key)
if start_index == -1:
continue # Skip if key is not found
# Find the start of the number which is after the colon and space
start_number_index = text.find(":", start_index) + 2
end_number_index = text.find(",", start_number_index) # Assuming the number ends before a comma
# Extract and convert the number to float
score = float(text[start_number_index:end_number_index])
scores.append(score)
return scores
def check_ans_with_model_summary(pred, gt, model, item, dataset_name='MLVU_OpenEnded'):
user_prompt = f"""
Please score the respondent's answer according to the steps in the Instructions. You must end with a JSON dict to store the scores.
Standard Answer: {gt}
Respondent's Answer: {pred}
""" # noqa
result = model.generate(user_prompt)
result = extract_scores_summary(result)
result = np.sum(result)
return result
def extract_scores_sub_scene(text):
# Define the keys to locate in the text
keys = ["score_accuracy", "score_relevance"]
scores = []
for key in keys:
# Find the index where each key starts
start_index = text.find(key)
if start_index == -1:
continue # Skip if key is not found
# Find the start of the number which is after the colon and space
start_number_index = text.find(":", start_index) + 2
end_number_index = text.find(",", start_number_index) # Assuming the number ends before a comma
# Extract and convert the number to float
score = float(text[start_number_index:end_number_index])
scores.append(score)
return scores
def check_ans_with_model_sub_scene(pred, gt, model, item, dataset_name='MLVU_OpenEnded'):
user_prompt = f"""
Please score the respondent's answer according to the steps in the Instructions. You must end with a JSON dict to store the scores.
Question: {item['question']}
Scoring Points: {item['scoring_points']}
Respondent's Answer: {pred}
""" # noqa
result = model.generate(user_prompt)
result = extract_scores_sub_scene(result)
result = np.sum(result)
return result
def MLVU_OpenEnded_generate(model, line):
task_type = line['task_type']
if task_type == 'summary':
user_prompt = (
f"Please score the respondent's answer according to the steps in the Instructions. "
f"You must end with a JSON dict to store the scores.\n"
f"Standard Answer: {line['answer']}\n"
f"Respondent's Answer: {line['prediction']}\n"
)
elif task_type == 'sub_scene':
user_prompt = (
f"Please score the respondent's answer according to the steps in the Instructions. "
f"You must end with a JSON dict to store the scores.\n"
f"Question: {line['question']}\n"
f"Scoring Points: {line['scoring_points']}\n"
f"Respondent's Answer: {line['prediction']}\n"
)
else:
        raise AssertionError(f"MLVU does not have an open-ended task of type '{task_type}'!")
result = model.generate(user_prompt)
return result
def MLVU_OpenEnded_extract(gpt_generate_data, org_data):
extract_func = {
'sub_scene': extract_scores_sub_scene,
'summary': extract_scores_summary
}
for idx, item in org_data.iterrows():
func = extract_func[item['task_type']]
text = gpt_generate_data[idx]
org_data.loc[idx, 'score'] = np.sum(func(text))
return org_data
def get_dimension_rating(data_path):
data = load(data_path)
result_dict = {}
for idx, item in data.iterrows():
if item['task_type'] not in result_dict:
result_dict[item['task_type']] = [0,0]
result_dict[item['task_type']][0] += int(item['score'])
result_dict[item['task_type']][1] += 1
return result_dict
from ...smp import *
import numpy as np
FAIL_MSG = 'Failed to obtain answer via API.'
system_prompt = """
As an AI assistant, your task is to evaluate a candidate answer in comparison to a given correct answer.
The question itself, the correct 'groundtruth' answer, and the candidate answer will be provided to you.
Your assessment should range from 0 to 3, \
based solely on the semantic similarity between the groundtruth and the candidate answer, \
disregarding any grammatical differences.
A rating of 0 suggests no similarity, implying the candidate answer is entirely incorrect.
A rating of 1 suggests low similarity, meaning the candidate answer is largely incorrect.
A rating of 2 suggests high similarity, meaning the candidate answer is largely correct.
Lastly, a rating of 3 indicates complete similarity, which means the candidate answer is entirely correct.
Your response should be a single integer from 0, 1, 2, or 3.
"""
MMV_DIMENSIONS = {
'CP': ['Video Topic', 'Video Emotion', 'Video Scene', 'Video Style'],
'FP-S': ['OCR', 'Object Recognition', 'Attribute Recognition', 'Event Recognition', 'Human Motion', 'Counting'],
'FP-C': ['Spatial Relationship', 'Human-object Interaction', 'Human Interaction'],
'HL': ['Hallucination'],
'LR': ['Structuralized Image-Text Understanding', 'Mathematical Calculation'],
'AR': ['Physical Property', 'Function Reasoning', 'Identity Reasoning'],
'RR': ['Natural Relation', 'Physical Relation', 'Social Relation'],
'CSR': ['Common Sense Reasoning'],
'TR': ['Counterfactual Reasoning', 'Causal Reasoning', 'Future Prediction'],
}
L3_DIMS = []
for k, v in MMV_DIMENSIONS.items():
L3_DIMS.extend(v)
MMV_DIMENSIONS['Perception'] = []
MMV_DIMENSIONS['Reasoning'] = []
MMV_DIMENSIONS['Overall'] = []
for k in ['CP', 'FP-C', 'FP-S', 'HL']:
MMV_DIMENSIONS['Perception'].extend(MMV_DIMENSIONS[k])
MMV_DIMENSIONS['Overall'].extend(MMV_DIMENSIONS[k])
for k in ['LR', 'AR', 'RR', 'CSR', 'TR']:
MMV_DIMENSIONS['Reasoning'].extend(MMV_DIMENSIONS[k])
MMV_DIMENSIONS['Overall'].extend(MMV_DIMENSIONS[k])
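# Coarse ratings cover the L2 dimensions plus the Perception / Reasoning /
# Overall rollups assembled above; fine ratings cover every L3 dimension.
# get_dimension_rating reports two views of each: the '*_all' entries treat
# failed judgements (score < 0) as 0, while the '*_valid' entries simply drop
# them from the average.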
def get_dimension_rating(data_path):
data = load(data_path)
coarse_rating = {k: [] for k in MMV_DIMENSIONS}
fine_rating = {k: [] for k in L3_DIMS}
for i in range(len(data)):
cate = data.iloc[i]['dimensions']
cates = eval(cate)
for c in cates:
fine_rating[c].append(data.iloc[i]['score'])
for d in MMV_DIMENSIONS:
if np.any([x in MMV_DIMENSIONS[d] for x in cates]):
coarse_rating[d].append(data.iloc[i]['score'])
coarse_all = {k: f'{np.mean([max(x, 0) for x in v]):.2f}' for k, v in coarse_rating.items()}
coarse_valid = {k: f'{np.mean([x for x in v if x >= 0]):.2f}' for k, v in coarse_rating.items()}
fine_all = {k: f'{np.mean([max(x, 0) for x in v]):.2f}' for k, v in fine_rating.items()}
fine_valid = {k: f'{np.mean([x for x in v if x >= 0]):.2f}' for k, v in fine_rating.items()}
return dict(coarse_all=coarse_all, coarse_valid=coarse_valid, fine_all=fine_all, fine_valid=fine_valid)
def build_prompt(item):
tmpl = 'Question: {}\nGroundtruth answer: {}\nCandidate answer: {}\nYour response: '
return tmpl.format(item['question'], item['answer'], item['prediction'])
from ...smp import *
meta_prompt = """
You are an assistant skilled at evaluating the quality of creative text.
Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to \
the user question displayed below. You'll need to assess the response on the following dimensions: \
Creativity, Richness, Visual Perception, Logical Coherence, Answer Accuracy and Image Relationship Understanding. \
We will provide you with a creative question and the AI model's response and a reference answer for your evaluation. \
As you begin your assessment, follow this process:
1. Evaluate the AI model's answers on different dimensions, pointing out its strengths or weaknesses \
in each dimension and assigning a score of 1 to 10 for each.
2. Finally, based on the assessments across dimensions, \
provide an overall score of 1 to 10 for the AI model's response.
3. Your scoring should be as stringent as possible and follow the scoring rules below:
In general, the higher the quality of the model's response and its strict adherence to user needs, \
the higher the score. Responses that do not meet user needs will receive lower scores.
Scoring rules:
Creativity:
Scores 1-2 when there is no innovation or uniqueness in the content.
Scores 3-4 when providing partially original content but with low creative quality.
Scores 5-6 when mostly creative but lacks significant novelty, with moderate quality.
Scores 7-8 when having novelty and high-quality content.
Scores 9-10 when highly novel and of exceptional quality compared to the reference answer.
Richness:
Scores 1-2 when lacking depth and breadth, with very limited information.
Scores 3-4 when limited in depth and breadth, with fewer explanations and examples, showing low diversity.
Scores 5-6 when limited in depth and breadth but provides basic necessary information.
Scores 7-8 when providing depth and useful additional information.
Scores 9-10 when providing exceptional depth, breadth, and high diversity compared to the reference answer.
Visual Perception:
Scores 1-2 when the description of the visual information in the image contains errors or \
is significantly inconsistent with the content of the image.
Scores 3-4 when the description of the visual information in the image reflects only a small amount \
of the image's information and contains some errors.
Scores 5-6 when the description of the visual information in the image includes the basic information \
of the image but contains minimal information.
Scores 7-8 when the description of the visual information in the image matches the image well and is rich in content, \
providing a substantial amount of information about the image.
Scores 9-10 when the description of the visual information in the image not only matches the image \
but also is more detailed and informative compared to the reference answer, providing more information about the image.
Logical Coherence:
Scores 1-2 when entirely incoherent, lacking any logic, and not matching the question or known information.
Scores 3-4 when somewhat coherent but with many logical errors or inconsistencies.
Scores 5-6 when mostly coherent, with few errors, but may struggle to maintain complete coherence in complex situations.
Scores 7-8 when excellent logical handling, very few errors.
Scores 9-10 when flawless logic, impeccable in handling complexity, \
and significantly higher logical coherence compared to the reference answer.
Answer Accuracy:
Scores 1-2 when the answer is significantly inconsistent with the question or contains obvious errors.
Scores 3-4 when the answer is partially correct but contains some errors or is incomplete.
Scores 5-6 when the answer is basically correct but lacks details or is not sufficiently detailed.
Scores 7-8 when the answer is accurate and detailed, fully corresponding to the question.
Scores 9-10 when the answer is not only accurate and detailed but also provides additional useful information, \
exceeding expectations.
Image Relationship Understanding:
Scores 1-2 when there are significant errors or confusion in distinguishing and describing different images, \
unable to correctly identify and relate the content of the images.
Scores 3-4 when the description of different images reflects only minimal distinguishing information, \
contains some errors and confusion, and fails to clearly differentiate and relate the images.
Scores 5-6 when the description of different images includes basic distinguishing information, \
is able to correctly identify and relate the images in a basic manner, \
but the information provided is minimal and lacks detail.
Scores 7-8 when the description of different images is accurate and detailed, \
clearly distinguishing and relating the images, \
with rich content that points out the main commonalities and differences between the images.
Scores 9-10 when the description of different images is not only accurate and detailed but also \
provides richer information and analysis, clearly distinguishing and relating the images, \
more comprehensively pointing out the commonalities and differences \
between the images compared to the reference answer.
Overall Score:
Scores 1-2 when irrelevant to the question, factually incorrect, or generates harmful content.
Scores 3-4 when no serious errors, mostly harmless, but of low quality and does not meet requirements.
Scores 5-6 when basically meeting requirements but performing poorly in some dimensions, with moderate quality.
Scores 7-8 when performing well in all dimensions.
Scores 9-10 when fully addressing user questions and all requirements, significantly surpassing the reference answer.
Please remember, you must evaluate and explain before scoring. After your explanation for each dimension, \
add the score for that dimension. Finally, at the end of your response, \
in the format of the dictionary (including brackets), return all your scoring results, \
ensuring your scores are integers:
{'Dimension One': Score, 'Dimension Two': Score, ..., 'Overall Score': Score}, \
for example: {'Creativity': 9, 'Richness': 6, ..., 'Overall Score': 7}.\n
"""
question_begin_prompt = '[Question]'
reference_begin_prompt = '[The Start of Reference Answer]'
reference_end_prompt = '[The End of Reference Answer]'
answers_begin_prompt = '[The Start of Assistant’s Answer]'
answers_end_prompt = '[The End of Assistant’s Answer]'
def mmdu_score(model, line):
question = eval(line['question'])
gt = eval(line['answer'])
prediction = eval(line['prediction'])
DIMS = [
'Creativity', 'Richness', 'Visual Perception', 'Logical Coherence',
'Answer Accuracy', 'Image Relationship Understanding', 'Overall Score'
]
all_result_dict = []
logs = []
for j in range(len(question)):
try:
prompt = meta_prompt + question_begin_prompt + '\n' + question[j] + '\n\n' + \
reference_begin_prompt + '\n' + gt[j] + '\n' + reference_end_prompt + '\n\n' + \
answers_begin_prompt + '\n' + prediction[j] + '\n' + answers_end_prompt
response = model.generate(prompt)
start_index = response.find('{')
end_index = response.rfind('}') + 1
dictionary_str = response[start_index: end_index]
result_dict = eval(dictionary_str)
all_result_dict.append(result_dict)
if all([x in result_dict for x in DIMS]):
logs.append('Succeed')
else:
logs.append(
f'Following Dims are not in results of turn {j}: '
f'{",".join([x for x in DIMS if x not in result_dict])}'
)
except Exception as e:
logging.warning(str(e))
all_result_dict.append({d: None for d in DIMS})
logs.append(str(e))
df = pd.DataFrame(all_result_dict)
return dict(res=df, log='\n'.join(logs))
import re
import json
def has_word(sentence, word):
pattern = r'\b' + re.escape(word) + r'\b'
match = re.search(pattern, sentence)
if match:
return True
else:
return False
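# VQAEval follows the standard VQA normalization pipeline: strip punctuation,
# map number words to digits, drop articles, and expand common contractions,
# then check whether the normalized ground-truth answer appears as a whole
# word inside the normalized prediction. Illustrative sketch:
#   VQAEval().evaluate('There are two dogs.', '2')        # -> 1
#   VQAEval().evaluate_MRR('a red car', ['blue', 'red'])  # -> 0.5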
class VQAEval:
def __init__(self):
self.contractions = {
'aint': "ain't",
'arent': "aren't",
'cant': "can't",
'couldve': "could've",
'couldnt': "couldn't",
"couldn'tve": "couldn't've",
"couldnt've": "couldn't've",
'didnt': "didn't",
'doesnt': "doesn't",
'dont': "don't",
'hadnt': "hadn't",
"hadnt've": "hadn't've",
"hadn'tve": "hadn't've",
'hasnt': "hasn't",
'havent': "haven't",
'hed': "he'd",
"hed've": "he'd've",
"he'dve": "he'd've",
'hes': "he's",
'howd': "how'd",
'howll': "how'll",
'hows': "how's",
"Id've": "I'd've",
"I'dve": "I'd've",
'Im': "I'm",
'Ive': "I've",
'isnt': "isn't",
'itd': "it'd",
"itd've": "it'd've",
"it'dve": "it'd've",
'itll': "it'll",
"let's": "let's",
'maam': "ma'am",
'mightnt': "mightn't",
"mightnt've": "mightn't've",
"mightn'tve": "mightn't've",
'mightve': "might've",
'mustnt': "mustn't",
'mustve': "must've",
'neednt': "needn't",
'notve': "not've",
'oclock': "o'clock",
'oughtnt': "oughtn't",
"ow's'at": "'ow's'at",
"'ows'at": "'ow's'at",
"'ow'sat": "'ow's'at",
'shant': "shan't",
"shed've": "she'd've",
"she'dve": "she'd've",
"she's": "she's",
'shouldve': "should've",
'shouldnt': "shouldn't",
"shouldnt've": "shouldn't've",
"shouldn'tve": "shouldn't've",
"somebody'd": 'somebodyd',
"somebodyd've": "somebody'd've",
"somebody'dve": "somebody'd've",
'somebodyll': "somebody'll",
'somebodys': "somebody's",
'someoned': "someone'd",
"someoned've": "someone'd've",
"someone'dve": "someone'd've",
'someonell': "someone'll",
'someones': "someone's",
'somethingd': "something'd",
"somethingd've": "something'd've",
"something'dve": "something'd've",
'somethingll': "something'll",
'thats': "that's",
'thered': "there'd",
"thered've": "there'd've",
"there'dve": "there'd've",
'therere': "there're",
'theres': "there's",
'theyd': "they'd",
"theyd've": "they'd've",
"they'dve": "they'd've",
'theyll': "they'll",
'theyre': "they're",
'theyve': "they've",
'twas': "'twas",
'wasnt': "wasn't",
"wed've": "we'd've",
"we'dve": "we'd've",
'weve': "we've",
'werent': "weren't",
'whatll': "what'll",
'whatre': "what're",
'whats': "what's",
'whatve': "what've",
'whens': "when's",
'whered': "where'd",
'wheres': "where's",
'whereve': "where've",
'whod': "who'd",
"whod've": "who'd've",
"who'dve": "who'd've",
'wholl': "who'll",
'whos': "who's",
'whove': "who've",
'whyll': "why'll",
'whyre': "why're",
'whys': "why's",
'wont': "won't",
'wouldve': "would've",
'wouldnt': "wouldn't",
"wouldnt've": "wouldn't've",
"wouldn'tve": "wouldn't've",
'yall': "y'all",
"yall'll": "y'all'll",
"y'allll": "y'all'll",
"yall'd've": "y'all'd've",
"y'alld've": "y'all'd've",
"y'all'dve": "y'all'd've",
'youd': "you'd",
"youd've": "you'd've",
"you'dve": "you'd've",
'youll': "you'll",
'youre': "you're",
'youve': "you've",
}
self.manualMap = {
'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9,
'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13,
'fourteen': 14, 'fifteen': 15, 'sixteen': 16,
'seventeen': 17, 'eighteen': 18, 'nineteen': 19,
'twenty': 20, 'thirty': 30, 'forty': 40, 'fifty': 50,
'sixty': 60, 'seventy': 70, 'eighty': 80, 'ninety': 90}
self.articles = ['a', 'an', 'the']
self.periodStrip = re.compile('(?!<=\\d)(\\.)(?!\\d)')
self.commaStrip = re.compile('(\\d)(\\,)(\\d)')
self.punct = [
';',
r'/',
'[',
']',
'"',
'{',
'}',
'(',
')',
'=',
'+',
'\\',
'_',
'-',
'>',
'<',
'@',
'`',
',',
'?',
'!',
]
def evaluate(self, answer, gt_answers):
answer = answer.replace('\n', ' ')
answer = answer.replace('\t', ' ')
answer = answer.strip()
answer = self.processPunctuation(answer)
answer = self.processDigitArticle(answer)
if isinstance(gt_answers, list):
for i in range(len(gt_answers)):
gt_answers[i] = str(gt_answers[i])
gt_answers[i] = gt_answers[i].replace('\n', ' ')
gt_answers[i] = gt_answers[i].replace('\t', ' ')
gt_answers[i] = gt_answers[i].strip()
gt_answers[i] = self.processPunctuation(gt_answers[i])
gt_answers[i] = self.processDigitArticle(gt_answers[i])
if has_word(answer, gt_answers[i]):
return 1
return 0
else:
gt_answers = gt_answers.replace('\n', ' ')
gt_answers = gt_answers.replace('\t', ' ')
gt_answers = gt_answers.strip()
gt_answers = self.processPunctuation(gt_answers)
gt_answers = self.processDigitArticle(gt_answers)
if has_word(answer, gt_answers):
return 1
else:
return 0
def evaluate_MRR(self, answer, gt_answers):
answer = answer.replace('\n', ' ')
answer = answer.replace('\t', ' ')
answer = answer.strip()
answer = self.processPunctuation(answer)
answer = self.processDigitArticle(answer)
assert isinstance(gt_answers, list)
for i in range(len(gt_answers)):
gt_answers[i] = gt_answers[i].replace('\n', ' ')
gt_answers[i] = gt_answers[i].replace('\t', ' ')
gt_answers[i] = gt_answers[i].strip()
gt_answers[i] = self.processPunctuation(gt_answers[i])
gt_answers[i] = self.processDigitArticle(gt_answers[i])
if has_word(answer, gt_answers[i]):
return 1 / (i + 1)
return 0.0
def processPunctuation(self, inText):
outText = inText
for p in self.punct:
if (p + ' ' in inText or ' ' + p in inText) or (
re.search(self.commaStrip, inText) is not None
):
outText = outText.replace(p, '')
else:
outText = outText.replace(p, ' ')
outText = self.periodStrip.sub('', outText, re.UNICODE)
return outText
def processDigitArticle(self, inText):
outText = []
tempText = inText.lower().split()
for word in tempText:
word = self.manualMap.setdefault(word, word)
if word not in self.articles:
outText.append(word)
else:
pass
for wordId, word in enumerate(outText):
if word in self.contractions:
outText[wordId] = self.contractions[word]
outText = [str(text) for text in outText]
outText = ' '.join(outText)
return outText
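# is_correct dispatches on the ground-truth type: integer answers are treated
# as option indices (0 -> 'a', 1 -> 'b', ...) or literal digits, list answers
# are JSON-parsed and matched element-wise (returning the fraction of matching
# elements), and anything else falls back to VQAEval word matching.
# Illustrative sketch: is_correct(1, 'The answer is B.')  # -> True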
def is_correct(answer, response):
# response_orig = response
response = response.strip('.')
if isinstance(answer, int):
if response.isdigit():
return int(int(response) == answer)
response = response.lower()
response = response.replace('the answer is', '')
response = response.replace('*', '') # parse **A**
if response.find('.') != -1:
response = response.split('.')[0]
response = response.replace(',', '')
        response = response.strip()
if response == 'none':
return 0
if 'the camera is moving left' in response:
response = 'a'
elif 'the camera is moving right' in response:
response = 'b'
if len(response) != 1:
# print(f"Fail to parse {response_orig}")
return 0
return (ord(response) - ord('a')) == answer
if isinstance(answer, list):
try:
response = response.replace('json', '').replace('```', '').strip()
response = json.loads(response)
if isinstance(response, dict):
response = sum(list(response.values()), start=[])
except:
# print(f"Fail to parse {response_orig} Exception: {e}")
return 0
if not isinstance(response, (list, tuple)):
# print(f"Fail to parse {response_orig} Exception: not a list!")
return 0
match = 0
for res, ans in zip(response, answer):
match += res == ans
return match / len(answer)
return VQAEval().evaluate(response, answer)
from ...smp import *
def build_mmvet_gpt4_prompt(line):
question = line['question']
gt = str(line['answer'])
prediction = str(line['prediction'])
prompt = """
Compare the ground truth and prediction from AI models, to give a correctness score for the prediction.
<AND> in the ground truth means it is totally right
only when all elements in the ground truth are present in the prediction,
and <OR> means it is totally right when any one element in the ground truth is present in the prediction.
The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right).
Just complete the last space of the correctness score.
Question | Ground truth | Prediction | Correctness
--- | --- | --- | ---
What is x in the equation? | -1 <AND> -5 | x = 3 | 0.0
What is x in the equation? | -1 <AND> -5 | x = -1 | 0.5
What is x in the equation? | -1 <AND> -5 | x = -5 | 0.5
What is x in the equation? | -1 <AND> -5 | x = -5 or 5 | 0.5
What is x in the equation? | -1 <AND> -5 | x = -1 or x = -5 | 1.0
Can you explain this meme? | This meme is poking fun at the fact that the names of the countries
Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes,
while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues
because the names of these countries do not accurately represent their landscapes. |
The meme talks about Iceland and Greenland. It's pointing out that despite their names,
Iceland is not very icy and Greenland isn't very green. | 0.4
Can you explain this meme? | This meme is poking fun at the fact that the names of the countries
Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes,
while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues
because the names of these countries do not accurately represent their landscapes. |
The meme is using humor to point out the misleading nature of Iceland's and Greenland's names.
Iceland, despite its name, has lush green landscapes while Greenland is mostly covered in ice and snow.
The text 'This is why I have trust issues' is a playful way to suggest
that these contradictions can lead to distrust or confusion.
The humor in this meme is derived from the unexpected contrast between the names of the countries
and their actual physical characteristics. | 1.0
"""
gpt4_prompt = prompt + '\n' + ' | '.join(
[question, gt.replace('<AND>', ' <AND> ').replace('<OR>', ' <OR> '), prediction, ''])
return gpt4_prompt
def MMVet_auxeval(model, line):
def float_cvt(s):
try:
return float(s)
except ValueError:
return None
prompt = build_mmvet_gpt4_prompt(line)
log = ''
retry = 5
for i in range(retry):
output = model.generate(prompt, temperature=i * 0.5)
score = float_cvt(output)
if score is None:
log += f'Try {i}: output is {output}, failed to parse.\n'
elif score < 0 or score > 1:
log += f'Try {i}: output is {output}, invalid score: {score}.\n'
else:
log += 'Succeed'
return dict(log=log, score=score)
log += 'All 5 retries failed.\n'
return dict(log=log, score=0.0)
def MMVet_acc(result_file):
data = load(result_file)
tot = defaultdict(lambda: 0)
score = defaultdict(lambda: 0)
lt = len(data)
cate2_list = []
for i in range(lt):
item = data.iloc[i]
cate = item['category']
cate2 = cate.replace(',', '_')
if cate2 not in cate2_list:
cate2_list.append(cate2)
grade = float(item['score'])
cate_list = ['rec', 'ocr', 'know', 'gen', 'spat', 'math']
for capa in cate_list:
if capa in cate:
tot[capa] += 1
score[capa] += grade
tot['Overall'] += 1
tot[cate2] += 1
score['Overall'] += grade
score[cate2] += grade
res = defaultdict(list)
res2 = defaultdict(list)
cate_list.append('Overall')
cate2_list.append('Overall')
for k in cate_list:
res['Category'].append(k)
res['tot'].append(tot[k])
res['acc'].append(score[k] / tot[k] * 100)
for v in cate2_list:
res2['Category'].append(v)
res2['tot'].append(tot[v])
res2['acc'].append(score[v] / tot[v] * 100)
res = pd.DataFrame(res)
res2 = pd.DataFrame(res2)
return res, res2
import pandas as pd
from ...utils import can_infer, track_progress_rich
from ...smp import *
import numpy as np
import re
MMB_abbrs = {
'coarse_perception': 'CP',
'finegrained_perception (instance-level)': 'FP-S',
'finegrained_perception (cross-instance)': 'FP-C',
'logic_reasoning': 'LR',
'relation_reasoning': 'RR',
'attribute_reasoning': 'AR'
}
MMT_abbrs = {
'visual_recognition': 'VR',
'localization': 'Loc',
'ocr': 'OCR',
'counting': 'Count',
'hallucination': 'HLN',
'image_retrieval': 'IR',
'threed': '3D',
'visual_captioning': 'VC',
'visual_grounding': 'VG',
'doc_understanding': 'DU',
'action_recognition': 'AR',
'pixel_level_perception': 'PLP',
'image-to-image_translation': 'I2IT',
'relation_reasoning': 'RR',
'intelligence_quotient_test': 'IQT',
'emotion': 'Emo',
'visual_illusion': 'VI',
'meme_understanding': 'MemU',
'visual_prompt_understanding': 'VPU',
'anomaly_detection': 'AND',
'keypoint_detection': 'KD',
'visual_commonsense_reasoning': 'VCR',
'image_evaluation_judgement': 'IEJ',
'multiple_image_analysis': 'MIA',
'cross_image_matching': 'CIM',
'temporal_understanding': 'TU',
'visual_code': 'VP',
'medical_understanding': 'MedU',
'autonomous_driving': 'AUD',
'discipline_knowledge_reasoning': 'DKR',
'embodied_ai': 'EA',
'gui_navigation': 'GN'
}
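# MMMU_preproc: MMMU open-ended questions (rows with no option A) are rewritten
# into a two-way multiple choice, with the ground truth as option A and a
# generic 'Other Answers' as option B, so the standard MCQ grading path can be
# reused for them.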
def MMMU_preproc(data):
logger = get_logger('Evaluation')
cnt = 0
As, Bs, Ans = list(data['A']), list(data['B']), list(data['answer'])
lt = len(data)
for i in range(lt):
if pd.isna(As[i]):
As[i] = Ans[i]
Bs[i] = 'Other Answers'
cnt += 1
logger.info(f'During MMMU_preproc in Evaluation, {cnt} open questions are re-formulated to multi-choice ones. ')
data['A'] = As
data['B'] = Bs
return data
def report_acc(df):
# assert group in [None, 'category', 'l2-category']
res = defaultdict(list)
if 'split' in df:
splits = list(set(df['split']))
res['split'] = splits
else:
df['split'] = ['none'] * len(df)
res['split'] = ['none']
for group in [None, 'l2-category', 'category']:
if group is None:
res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']]
elif group not in df:
continue
else:
abilities = list(set(df[group]))
abilities.sort()
for ab in abilities:
ab_name = MMB_abbrs[ab] if ab in MMB_abbrs else ab
sub_df = df[df[group] == ab]
res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]
return pd.DataFrame(res)
def report_acc_MMT(df):
# assert group in [None, 'category', 'l2-category']
res = defaultdict(list)
res['split'] = list()
res['Overall'] = list()
for _, name in MMT_abbrs.items():
res[name] = list()
if 'split' in df:
splits = list(set(df['split']))
res['split'] = splits
else:
df['split'] = ['none'] * len(df)
res['split'] = ['none']
for group in [None, 'category', 'l2-category']:
if group is None:
res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']]
res['Overall'].extend([np.mean(df['hit'])])
elif group not in df:
continue
elif group == 'category':
abilities = list(set(df[group]))
abilities.sort()
for ab in abilities:
ab_name = ab
sub_df = df[df[group] == ab]
res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]
res[ab_name].extend([np.mean(sub_df['hit'])])
else:
abilities = list(set(df[group]))
abilities.sort()
for ab in abilities:
sub_task_name_list = df[df['l2-category'] == ab]['category'].unique()
sub_task_acc = []
for sub_task_name in sub_task_name_list:
sub_df = df[df['category'] == sub_task_name]
sub_task_acc.append([np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']])
new_acc = []
for i in range(len(sub_task_acc[0])):
new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc]))
ab_name = MMT_abbrs[ab] if ab in MMT_abbrs else ab
res[ab_name] = new_acc
sub_task_acc = []
for sub_task_name in sub_task_name_list:
sub_df = df[df['category'] == sub_task_name]
sub_task_acc.append([np.mean(sub_df['hit'])])
new_acc = []
for i in range(len(sub_task_acc[0])):
new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc]))
res[ab_name].extend(new_acc)
res['split'].append('ALL')
return pd.DataFrame(res)
def build_prompt(question, options, prediction):
tmpl = (
'You are an AI assistant who will help me to match '
'an answer with several options of a single-choice question. '
'You are provided with a question, several options, and an answer, '
'and you need to find which option is most similar to the answer. '
        'If the meaning of all options is significantly different from the answer, output Z. '
        'You should output a single uppercase character in A, B, C, D (if they are valid options), and Z. \n'
'Example 1: \n'
'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
'Answer: a cute teddy bear\nYour output: A\n'
'Example 2: \n'
'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
'Answer: Spider\nYour output: Z\n'
'Example 3: \n'
'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: '
)
return tmpl.format(question, options, prediction)
def build_prompt_blink(question, options, prediction):
tmpl = (
'You are an AI assistant who will help me to match an answer with several options of a single-choice question. '
'You are provided with a question, several options, and an answer, '
'and you need to find which option is most similar to the answer. '
"If the answer says things like refuse to answer, I'm sorry cannot help, etc., output Z."
        'If the meaning of all options is significantly different from the answer, '
        'or the answer does not select any option, output Z. '
        'You should output one of the choices, A, B, C, D (if they are valid options), or Z.\n'
'Example 1: \n'
'Question: Which point is closer to the camera?\nSelect from the following choices.\n'
'Options: A. Point A\nB. Point B\n(Z) Failed\n'
'Answer: Point B, where the child is sitting, is closer to the camera.\nYour output: (B)\n'
'Example 2: \n'
'Question: Which point is closer to the camera?\nSelect from the following choices.\n'
'Options: (A) Point A\n(B) Point B\n(Z) Failed\n'
"Answer: I'm sorry, but I can't assist with that request.\nYour output: (Z)\n"
'Example 3: \n'
'Question: Which point is corresponding to the reference point?\nSelect from the following choices.\n'
'Options: (A) Point A\n(B) Point B\n(Z) Failed\n'
'Answer:The reference point (REF) on the first image is at the tip of the pot, '
'which is the part used to Poke if the pots were used for that action. Looking at the second image, '
'we need to find the part of the object that would correspond to poking.\n'
"(A) Point A is at the tip of the spoon's handle, which is not used for poking.\n"
'(B) Point B is at the bottom of the spoon, which is not used for poking.\n'
'(C) Point C is on the side of the spoon, which is not used for poking.\n'
'(D) Point D is at the tip of the spoon, which is not used for poking.\n'
'\nTherefore, there is no correct answer in the choices\nYour output: (Z)\n'
'Example 4: \n'
'Question: {}?\nOptions: {}\n(Z) Failed\nAnswer: {}\nYour output: '
)
return tmpl.format(question, options, prediction)
def build_prompt_cn(question, options, prediction):
tmpl = (
'你是一个帮助我匹配答案与单选题中多个选项的 AI 助手。'
'你会被提供:一个问题,多个选项,一个答案。你的任务是找到与答案意义最相近的选项。'
'如果所有选项的意义都与答案显著不同,则输出 Z。'
'你应该输出一个单个的大写字母,例如 A, B, C, D(如果它们是有效选项),或 Z。\n'
'例 1: \n'
'问题: 图中最主要的物体是什么?\n选项: A. 泰迪熊 B. 兔子 C. 猫 D. 狗\n答案: 一只可爱的泰迪熊\n输出: A\n'
'例 2: \n'
'问题: 图中最主要的物体是什么?\n选项: A. 泰迪熊 B. 兔子 C. 猫 D. 狗\n答案: 蜘蛛\n输出: Z\n'
'例 3: \n'
'问题: {}?\n选项: {}\n答案: {}\n输出: '
)
return tmpl.format(question, options, prediction)
def build_choices(item):
ret = {}
for ch in string.ascii_uppercase:
if ch in item and (not pd.isna(item[ch])):
ret[ch] = item[ch]
return ret
def prefetch_answer(item):
choices = build_choices(item)
return can_infer(item['prediction'], choices)
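# --- Hedged usage sketch (not part of the original module) ---
# Shows the expected record layout: one column per option letter plus a 'prediction'
# column. `build_choices` keeps only the non-NaN option columns; `prefetch_answer` then
# tries exact matching via `can_infer`. The record below is fabricated.
def _demo_prefetch_answer():
    import pandas as pd
    item = pd.Series({
        'A': 'teddy bear', 'B': 'rabbit', 'C': None, 'D': None,
        'prediction': 'A',
    })
    # build_choices(item) -> {'A': 'teddy bear', 'B': 'rabbit'}
    # prefetch_answer(item) is expected to resolve the bare-letter prediction to 'A'
    return build_choices(item), prefetch_answer(item)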
def extract_answer_from_item(model, item, dataset_name=None):
logger = get_logger('Evaluation')
# Returns dict(opt=<matched option letter, or 'Z'>, log=<matching log / raw judge output>)
choices = build_choices(item)
option_str = build_option_str(choices)
if dataset_name == 'BLINK':
prompt = build_prompt_blink(item['question'], option_str, item['prediction'])
elif cn_string(item['question']):
prompt = build_prompt_cn(item['question'], option_str, item['prediction'])
else:
prompt = build_prompt(item['question'], option_str, item['prediction'])
retry = 3
ret = can_infer(item['prediction'], choices)
if ret:
return dict(opt=ret, log=item['prediction'])
if model is None:
return dict(opt='Z', log='Failed in Prefetch, no GPT-based answer matching under `exact_matching` policy.')
while retry:
ans = model.generate(prompt)
if 'Failed to obtain answer via API' in ans:
logger.warning('GPT API failed to answer. ')
else:
ret = can_infer(ans, choices)
if ret:
return dict(opt=ret, log=ans)
else:
logger.warning(f'Output includes 0 / > 1 letter among candidates {set(choices)} and Z: {ans}')
retry -= 1
if retry == 0:
options = list(choices) + (['Z'] if 'Z' not in choices else [])
return dict(opt=rd.choice(options), log='Failed to predict, thus randomly generate one. ')
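# --- Hedged usage sketch (not part of the original module) ---
# With model=None (the `exact_matching` policy) no judge model is queried: the answer
# comes from prefetching alone, and an unmatched prediction falls back to 'Z'. The item
# below is fabricated; a 'question' field is required because the judge prompt is built
# before the prefetch shortcut returns.
def _demo_exact_matching():
    import pandas as pd
    item = pd.Series({
        'question': 'What is the main object in image',
        'A': 'teddy bear', 'B': 'rabbit',
        'prediction': 'The answer is A. teddy bear',
    })
    return extract_answer_from_item(None, item)  # expected: {'opt': 'A', 'log': ...}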
# For Circular Evaluation
def prefetch_circular_group(sub_data, verbose=False):
lt = len(sub_data)
GT, PRED = [], []
for i in range(lt):
item = sub_data.iloc[i]
GT.append(item['GT'])
PRED.append(prefetch_answer(item))
if PRED[-1] and (GT[-1] != PRED[-1]):
log = (
f'Failed in Prefetching Rolling {i}: Answer is {GT[-1]}, '
f"Prediction is {item['prediction']}, Pre-fetched is {PRED[-1]}. "
)
return dict(hit=0, log=log)
flag = True
for g, p in zip(GT, PRED):
if g != p:
flag = False
ret = (dict(hit=1, log='Succeed During Pre-fetching'), ) if flag else (None, )
ret = ret + (GT, PRED) if verbose else ret
return ret if len(ret) > 1 else ret[0]
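# --- Hedged usage sketch (not part of the original module) ---
# In circular evaluation every question appears several times with rotated options; the
# group only scores a hit when every rotated copy prefetches to its own ground truth.
# The two-row group below is fabricated.
def _demo_prefetch_circular_group():
    import pandas as pd
    sub_data = pd.DataFrame({
        'A': ['teddy bear', 'rabbit'],
        'B': ['rabbit', 'teddy bear'],
        'GT': ['A', 'B'],
        'prediction': ['A', 'B'],
    })
    # expected: dict(hit=1, log='Succeed During Pre-fetching') since both copies match
    return prefetch_circular_group(sub_data, verbose=False)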
def eval_vanilla(model, item, dataset_name=None):
res = extract_answer_from_item(model, item, dataset_name=dataset_name)
opt, match_log = res['opt'], res['log']
if opt == item['GT']:
return dict(hit=1, log=f'Match Log: {match_log}. ')
else:
return dict(hit=0, log=f'Match Log: {match_log}. ')
# For Circular Evaluation
def eval_circular_group(model, sub_data, dataset_name=None):
res, GT, PRED = prefetch_circular_group(sub_data, verbose=True)
if res is not None:
return res
lt = len(sub_data)
log = ''
for i in range(lt):
if PRED[i]:
log += f'Rolling {i} Matched.\n'
else:
res = extract_answer_from_item(model, sub_data.iloc[i], dataset_name=dataset_name)
opt, match_log = res['opt'], res['log']
PRED[i] = opt
if PRED[i] != GT[i]:
log += (
f"Failed in Rolling {i}: Answer is {GT[i]}; Prediction is {sub_data.iloc[i]['prediction']}; "
f'Pre-fetched is {PRED[i]}; Match Log is {match_log}.\n'
)
return dict(hit=0, log=log)
else:
log += (
f"Rolling {i}: Answer is {GT[i]}, Prediction is {sub_data.iloc[i]['prediction']}, "
f'Pre-fetched is {PRED[i]}.\n'
)
return dict(hit=1, log=log)
# data, meta are pd.DataFrame, result_file is a path
def mcq_vanilla_eval(model, data, meta, nproc, result_file, dataset_name=None):
result = {}
if osp.exists(result_file):
result = load(result_file)
answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])}
if dataset_name is not None and 'MMMU' in dataset_name:
data = MMMU_preproc(data)
answer_map = {k: (v if v in list(string.ascii_uppercase) else 'A') for k, v in answer_map.items()}
data = data[data['index'].isin(answer_map)]
data['GT'] = [answer_map[idx] for idx in data['index']]
items = []
for i in range(len(data)):
# Dealing with the normal part
item = data.iloc[i]
if item['index'] not in result:
items.append(item)
tups = [dict(model=model, item=x, dataset_name=dataset_name) for x in items]
keys = [x['index'] for x in items]
if len(tups):
res = track_progress_rich(eval_vanilla, tups, nproc=nproc, chunksize=nproc, save=result_file, keys=keys)
result = load(result_file)
for k, v in zip(keys, res):
if k not in result:
result[k] = v
data['hit'] = [result[i]['hit'] for i in data['index']]
data['log'] = [result[i]['log'] for i in data['index']]
if 'GT' in data:
data.pop('GT')
return data
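# --- Hedged usage sketch (not part of the original module) ---
# Rough shape of the inputs expected by `mcq_vanilla_eval`: `data` carries the model
# predictions and the option columns, `meta` carries the ground-truth letters keyed by
# the same 'index'. The frames below are fabricated; in a real run they would be passed
# as mcq_vanilla_eval(judge_model, data, meta, nproc, result_file, dataset_name), with
# `result_file` typically a .pkl path used to cache per-item judge results.
def _demo_vanilla_eval_inputs():
    import pandas as pd
    data = pd.DataFrame({
        'index': [0, 1],
        'question': ['What is the main object in image'] * 2,
        'A': ['teddy bear', 'cat'],
        'B': ['rabbit', 'dog'],
        'prediction': ['A', 'The answer is B. dog'],
    })
    meta = pd.DataFrame({'index': [0, 1], 'answer': ['A', 'B']})
    return data, meta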
# data, meta are pd.DataFrame, result_file is a path
def mcq_circular_eval(model, data, meta, nproc, result_file, dataset_name=None):
result = {}
if osp.exists(result_file):
result = load(result_file)
# Build Answer Map
answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])}
for idx in list(meta['index']) + list(data['index']):
assert istype(idx, int)
# Only keep those lines in the meta data
data = data[data['index'].isin(answer_map)]
data['GT'] = [answer_map[idx] for idx in data['index']]
data_main = data[data['index'] < int(1e6)]
data_groups = []
for i in range(len(data_main)):
# Dealing with the normal part
idx = data_main.iloc[i]['index']
if idx not in result:
sub_data = data[data['index'] % int(1e6) == idx]
data_groups.append(sub_data)
if len(data_groups):
prefetched = [prefetch_circular_group(g, verbose=False) for g in data_groups]
remain = []
for dg, pf in zip(data_groups, prefetched):
if pf is not None:
result[dg.iloc[0]['index'] % int(1e6)] = pf
else:
remain.append(dg)
dump(result, result_file)
tups = [dict(model=model, sub_data=x, dataset_name=dataset_name) for x in remain]
keys = [x.iloc[0]['index'] % int(1e6) for x in remain]
if len(tups) == 0:
pass
elif model is None:
logger = get_logger('Evaluation')
logger.warning('Exact Matching mode, will not do GPT-based answer matching. ')
for k in keys:
result[k] = dict(
hit=0, log='Failed in Prefetch, no GPT-based answer matching under `exact_matching` policy.')
else:
res = track_progress_rich(
eval_circular_group,
tups,
nproc=nproc,
chunksize=nproc,
save=result_file,
keys=keys)
result = load(result_file)
for k, v in zip(keys, res):
if k not in result:
result[k] = v
# Round-trip through a temporary file to materialize data_main as an independent DataFrame before adding columns
tmp_pth = f'/tmp/{timestr()}.xlsx'
dump(data_main, tmp_pth)
data_main = load(tmp_pth)
indices = data_main['index']
data_main['hit'] = [result[i]['hit'] for i in indices]
data_main['log'] = [result[i]['log'] for i in indices]
if 'GT' in data_main:
data_main.pop('GT')
return data_main
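# --- Hedged note (not part of the original module) ---
# Circular copies are linked to their base record through the index: the base question
# keeps an index below 1e6 and each rotated copy typically adds a multiple of 1e6, so
# `index % int(1e6)` recovers the group id used above. Fabricated example indices:
def _demo_circular_index_convention():
    base_index = 42
    rotated_copies = [base_index + k * int(1e6) for k in range(1, 4)]  # 1000042, 2000042, 3000042
    assert all(i % int(1e6) == base_index for i in rotated_copies)
    return rotated_copies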
def extract_characters_regex(s, choices=['(A)', '(B)', '(C)', '(D)', '(E)']):
if type(s) is dict:
s = ''
s = s.strip()
answer_prefixes = [
'The best answer is',
'The correct answer is',
'The answer is',
'The answer',
'The best option is',
'The correct option is',
'Best answer:',
'Best option:',
]
for answer_prefix in answer_prefixes:
s = s.replace(answer_prefix, '')
if len(s.split()) > 10 and not re.search('[ABCDE]', s):
return ''
matches = re.search(r'[ABCDE]', s)
if matches is None:
for choice in choices:
if s and s.lower() in choice.lower():
return choice[1]
return ''
return matches[0]
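# --- Hedged usage sketch (not part of the original module) ---
# Fabricated inputs showing how a letter is recovered from a free-form answer.
def _demo_extract_characters_regex():
    return (
        extract_characters_regex('The best answer is (C).'),      # expected 'C'
        extract_characters_regex('b'),                            # expected 'B' via the choice fallback
        extract_characters_regex({'error': 'API call failed'}),   # dict input is treated as empty -> ''
    )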
def get_dimension_rating(data_path):
TASKS = [
'Reasoning',
'Perception',
]
SUBTASKS = [
'Monitoring',
'Autonomous_Driving',
'OCR with Complex Context',
'Diagram and Table',
'Remote Sensing',
]
data = load(data_path)
results = {}
results['Overall'] = {}
for task in TASKS:
results[f'{task}'] = {}
for subtask in SUBTASKS:
results[f'{task}'][f'{subtask}'] = {}
for i in range(len(data)):
question = data.iloc[i]
Task = question['category'].split('/')[0]
Subtask = question['category'].split('/')[1]
Category = question['l2-category'].lower()
if 'attribute' in Category.lower():
Category = Category.split('/')[0] + '/attribute'
if question['score'] >= 0:
cnt = question['score']
if Category not in results[Task][Subtask].keys():
results[Task][Subtask][f'{Category}'] = {'true': cnt, 'false': 1 - cnt}
else:
results[Task][Subtask][f'{Category}']['true'] += cnt
results[Task][Subtask][f'{Category}']['false'] += 1 - cnt
sum_all, succ_all = 0, 0
for task, tasks_values in results.items():
cnt_task, sum_task = 0, 0
for subtask, subtask_value in tasks_values.items():
cnt_subtask, sum_subtask = 0, 0
for category, category_dict in subtask_value.items():
cnt_subtask += category_dict['true']
sum_subtask += category_dict['false'] + category_dict['true']
acc = category_dict['true'] / (category_dict['false'] + category_dict['true'])
results[task][subtask][category] = acc
if sum_subtask == 0:
acc_subtasks = 0
else:
acc_subtasks = cnt_subtask / sum_subtask
cnt_task += cnt_subtask
sum_task += sum_subtask
results[task][subtask]['Avg'] = acc_subtasks
if sum_task == 0:
acc_task = 0
else:
acc_task = cnt_task / sum_task
succ_all += cnt_task
sum_all += sum_task
results[task]['Avg'] = acc_task
results['Overall'] = succ_all / sum_all
return results
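# --- Hedged usage sketch (not part of the original module) ---
# Writes a tiny fabricated score file and runs the aggregation above on it. Category
# labels follow the 'Task/Subtask' convention parsed above; the file name is arbitrary
# and `dump`/`load` from this package are assumed to round-trip a .tsv file.
def _demo_get_dimension_rating():
    import tempfile
    df = pd.DataFrame({
        'category': ['Perception/Monitoring', 'Perception/Monitoring'],
        'l2-category': ['vehicle_counting', 'vehicle_counting'],
        'score': [1, 0],
    })
    tmp_file = osp.join(tempfile.gettempdir(), 'demo_dimension_rating.tsv')
    dump(df, tmp_file)
    # expected for this fabricated file: a per-category accuracy of 0.5 and
    # results['Overall'] == 0.5
    return get_dimension_rating(tmp_file)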