from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
import os
import json
import base64  # used by the multimodal() helper below
def multimodal(images, text, url, key, temperature=0, max_tokens=1024, history=[]):
if images:
pics = []
for image in images:
with open(image, 'rb') as f:
pic = base64.b64encode(f.read()).decode('utf-8')
pics.append(pic)
data = {'images': pics, 'text': text, 'key': key, 'temperature': temperature, 'max_new_tokens': max_tokens}
else:
data = {'text': text, 'key': key, 'temperature': temperature, 'max_new_tokens': max_tokens}
response = requests.post(url, json=data, headers={'Content-Type': 'application/json'})
response = json.loads(response.text)
return response
class BlueLMWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'BlueLM-V-v3.0',
retry: int = 5,
wait: int = 5,
verbose: bool = True,
temperature: float = 0.0,
system_prompt: str = None,
max_tokens: int = 1024,
key: str = None,
url: str = 'http://api-ai.vivo.com.cn/multimodal',
**kwargs):
self.model = model
self.fail_msg = 'Failed to obtain answer via BlueLM-V API. '
self.max_tokens = max_tokens
self.temperature = temperature
self.url = url
self.key = key
if self.key is None:
self.key = os.environ.get('BLUELM_V_API_KEY', None)
assert self.key is not None, (
'Please set the API Key (to obtain one, contact shuai.ren@vivo.com by email).'
)
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
def message_to_promptimg(self, message, dataset=None):
num_images = len([x for x in message if x['type'] == 'image'])
if num_images == 0:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = None
elif num_images == 1:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = [x['value'] for x in message if x['type'] == 'image']
else:
prompt = '\n'.join([x['value'] if x['type'] == 'text' else '<image>' for x in message])
if dataset == 'BLINK':
image = concat_images_vlmeval(
[x['value'] for x in message if x['type'] == 'image'],
target_size=512)
else:
image = [x['value'] for x in message if x['type'] == 'image']
if dataset in ['MMBench_DEV_EN_V11', 'MMBench_DEV_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11',
'AI2D_TEST', 'AI2D_TEST_TO_MASK', 'MMMU_DEV_VAL']:
prompt = prompt.replace('Please select the correct answer from the options above.',
'Answer with the option’s letter from the given choices directly.')
elif dataset in ['ChartQA_TEST']:
prompt = prompt.replace('Answer the question using a single word or phrase.',
'Answer the question using a single number or phrase.')
elif dataset in ['DocVQA_VAL', 'DocVQA_TEST', ]:
prompt = prompt.replace('Answer the question using a single word or phrase.',
'Give the short answer directly.')
elif dataset in ['TextVQA_VAL']:
prompt = prompt.replace('Answer the question using a single word or phrase.',
'When the provided information is insufficient, respond with ’Unanswerable’. '
'Answer the question using a single word or phrase.')
elif dataset in ['MTVQA_TEST']:
prompt = prompt.replace('\nAnswer the question using a word or phrase in the language of the question.', '')
elif dataset in ['MathVista_MINI']:
if 'Choices:' in prompt:
prompt = prompt.replace('Choices:', 'Options:').replace('Hint:', 'Context:')
for i in range(1, 7): # replace A ~ F
prompt = prompt.replace(f'({chr(64 + i)})', f'{chr(64 + i)}.')
prompt += '\nAnswer with the option’s letter from the given choices directly.'
else:
prompt += '\nAnswer the question using a single word or phrase.'
return prompt, image
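# Illustrative example (added for reference, not from the original source): for a multi-image message such as
#   [dict(type='image', value='a.jpg'), dict(type='image', value='b.jpg'),
#    dict(type='text', value='Which image is brighter?')]
# the multi-image branch above yields prompt = '<image>\n<image>\nWhich image is brighter?' and, unless the
# dataset is BLINK (where the images are concatenated into one), image = ['a.jpg', 'b.jpg'].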
def generate_inner(self, inputs, **kwargs) -> str:
assert isinstance(inputs, str) or isinstance(inputs, list)
pure_text = np.all([x['type'] == 'text' for x in inputs])
assert not pure_text
prompt, image_path = self.message_to_promptimg(inputs, kwargs['dataset'])
try:
response = multimodal(image_path, prompt, self.url, self.key, self.temperature, self.max_tokens)
answer = response['result']
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(err)
self.logger.error(f'The input messages are {inputs}.')
return -1, '', ''
class BlueLM_V_API(BlueLMWrapper):
def generate(self, message, dataset=None):
return super(BlueLM_V_API, self).generate(message, dataset=dataset)
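# A minimal usage sketch (assumes BLUELM_V_API_KEY is exported, the endpoint above is reachable,
# and 'demo.jpg' is a placeholder image path):
#   model = BlueLM_V_API()
#   message = [dict(type='image', value='demo.jpg'),
#              dict(type='text', value='Describe the image.')]
#   answer = model.generate(message, dataset='MMBench_DEV_EN_V11')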
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from time import sleep
import base64
import mimetypes
from PIL import Image
url = 'https://openxlab.org.cn/gw/alles-apin-hub/v1/claude/v1/text/chat'
headers = {
'alles-apin-token': '',
'Content-Type': 'application/json'
}
class Claude_Wrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'claude-3-opus-20240229',
key: str = None,
retry: int = 10,
wait: int = 3,
system_prompt: str = None,
verbose: bool = True,
temperature: float = 0,
max_tokens: int = 1024,
**kwargs):
self.model = model
self.headers = headers
self.temperature = temperature
self.max_tokens = max_tokens
if key is not None:
self.key = key
else:
self.key = os.environ.get('ALLES', '')
self.headers['alles-apin-token'] = self.key
super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)
# inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
# content can be a string or a list of image & text
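# Illustrative shapes (added for reference, values are placeholders):
#   single turn, interleaved: [dict(type='text', value='Describe this.'), dict(type='image', value='cat.jpg')]
#   multi-turn, role-tagged:  [dict(role='user', content=[...]), dict(role='assistant', content=[...]),
#                              dict(role='user', content=[...])]  # the last turn must come from 'user'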
def prepare_itlist(self, inputs):
assert np.all([isinstance(x, dict) for x in inputs])
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text' and msg['value'] != '':
content_list.append(dict(type='text', text=msg['value']))
elif msg['type'] == 'image':
pth = msg['value']
suffix = osp.splitext(pth)[-1].lower()
media_type = mimetypes.types_map.get(suffix, None)
assert media_type is not None
content_list.append(dict(
type='image',
source={
'type': 'base64',
'media_type': media_type,
'data': encode_image_file_to_base64(pth, target_size=4096)
}))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
content_list = [dict(type='text', text=text)]
return content_list
def prepare_inputs(self, inputs):
input_msgs = []
assert isinstance(inputs, list) and isinstance(inputs[0], dict)
assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
if 'role' in inputs[0]:
assert inputs[-1]['role'] == 'user', inputs[-1]
for item in inputs:
input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
else:
input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
return input_msgs
def generate_inner(self, inputs, **kwargs) -> str:
payload = json.dumps({
'model': self.model,
'max_tokens': self.max_tokens,
'messages': self.prepare_inputs(inputs),
'system': self.system_prompt,
**kwargs
})
response = requests.request('POST', url, headers=headers, data=payload)
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
try:
resp_struct = json.loads(response.text)
answer = resp_struct['data']['content'][0]['text'].strip()
except:
pass
return ret_code, answer, response
class Claude3V(Claude_Wrapper):
def generate(self, message, dataset=None):
return super(Claude_Wrapper, self).generate(message)
from ..smp import *
import os
from .base import BaseAPI
class CWWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'cw-congrong-v1.5',
retry: int = 10,
wait: int = 5,
key: str = None,
verbose: bool = True,
system_prompt: str = None,
temperature: float = 0,
timeout: int = 600,
api_base: str = 'http://cwapi-vlm01.cw_rb.azurebot.tk/v1/chat/completions',
max_tokens: int = 1024,
img_size: int = 512,
img_detail: str = 'low',
**kwargs):
self.model = model
self.cur_idx = 0
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
base = os.environ.get('CW_API_BASE', None)
self.api_base = base if base is not None else api_base
env_key = os.environ.get('CW_API_KEY', None)
self.key = env_key if env_key is not None else key
assert self.key is not None, 'API key not provided. Please set CW_API_KEY environment variable or \
pass it to the constructor.'
assert img_size > 0 or img_size == -1
self.img_size = -1  # always send the full-size image
assert img_detail in ['high', 'low']
self.img_detail = img_detail
self.vision = True
self.timeout = timeout
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
# inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
# content can be a string or a list of image & text
def prepare_inputs(self, inputs):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(role='system', content=self.system_prompt))
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text':
content_list.append(dict(type='text', text=msg['value']))
elif msg['type'] == 'image':
from PIL import Image
img = Image.open(msg['value'])
b64 = encode_image_to_base64(img, target_size=self.img_size)
img_struct = dict(url=f'data:image/jpeg;base64,{b64}', detail=self.img_detail)
content_list.append(dict(type='image_url', image_url=img_struct))
input_msgs.append(dict(role='user', content=content_list))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
input_msgs.append(dict(role='user', content=text))
return input_msgs
def generate_inner(self, inputs, **kwargs) -> str:
input_msgs = self.prepare_inputs(inputs)
temperature = kwargs.pop('temperature', self.temperature)
max_tokens = kwargs.pop('max_tokens', self.max_tokens)
if 0 < max_tokens <= 100:
self.logger.warning(
'Less than 100 tokens left, '
'may exceed the context window with some additional meta symbols. '
)
if max_tokens <= 0:
return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. '
headers = {'Content-Type': 'application/json', 'Authorization': f'{self.key}'}
payload = dict(
model=self.model,
messages=input_msgs,
max_tokens=max_tokens,
n=1,
temperature=temperature,
**kwargs)
response = requests.post(self.api_base, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
try:
resp_struct = json.loads(response.text)
answer = resp_struct['choices'][0]['message']['content'].strip()
except:
pass
return ret_code, answer, response
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
headers = 'Content-Type: application/json'
class GeminiWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'gemini-1.0-pro',
retry: int = 5,
wait: int = 5,
key: str = None,
verbose: bool = True,
temperature: float = 0.0,
system_prompt: str = None,
max_tokens: int = 1024,
proxy: str = None,
backend='genai',
project_id='vlmeval',
**kwargs):
assert model in ['gemini-1.0-pro', 'gemini-1.5-pro', 'gemini-1.5-flash']
self.model = model
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
if key is None:
key = os.environ.get('GOOGLE_API_KEY', None)
# Try to load backend from environment variable
be = os.environ.get('GOOGLE_API_BACKEND', None)
if be is not None and be in ['genai', 'vertex']:
backend = be
assert backend in ['genai', 'vertex']
if backend == 'genai':
# We have not evaluated Gemini-1.5 w. GenAI backend
assert key is not None # Vertex does not require API Key
self.backend = backend
self.project_id = project_id
self.api_key = key
if proxy is not None:
proxy_set(proxy)
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
def build_msgs_genai(self, inputs):
messages = [] if self.system_prompt is None else [self.system_prompt]
for inp in inputs:
if inp['type'] == 'text':
messages.append(inp['value'])
elif inp['type'] == 'image':
messages.append(Image.open(inp['value']))
return messages
def build_msgs_vertex(self, inputs):
from vertexai.generative_models import Part, Image
messages = [] if self.system_prompt is None else [self.system_prompt]
for inp in inputs:
if inp['type'] == 'text':
messages.append(inp['value'])
elif inp['type'] == 'image':
messages.append(Part.from_image(Image.load_from_file(inp['value'])))
return messages
def generate_inner(self, inputs, **kwargs) -> str:
if self.backend == 'genai':
import google.generativeai as genai
assert isinstance(inputs, list)
pure_text = np.all([x['type'] == 'text' for x in inputs])
genai.configure(api_key=self.api_key)
if pure_text and self.model == 'gemini-1.0-pro':
model = genai.GenerativeModel('gemini-1.0-pro')
else:
assert self.model in ['gemini-1.5-pro', 'gemini-1.5-flash']
model = genai.GenerativeModel(self.model)
messages = self.build_msgs_genai(inputs)
gen_config = dict(max_output_tokens=self.max_tokens, temperature=self.temperature)
gen_config.update(kwargs)
try:
answer = model.generate_content(
messages,
generation_config=genai.types.GenerationConfig(**gen_config)).text
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(err)
self.logger.error(f'The input messages are {inputs}.')
return -1, '', ''
elif self.backend == 'vertex':
import vertexai
from vertexai.generative_models import GenerativeModel
vertexai.init(project=self.project_id, location='us-central1')
model_name = 'gemini-1.0-pro-vision' if self.model == 'gemini-1.0-pro' else self.model
model = GenerativeModel(model_name=model_name)
messages = self.build_msgs_vertex(inputs)
try:
resp = model.generate_content(messages)
answer = resp.text
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(err)
self.logger.error(f'The input messages are {inputs}.')
return -1, '', ''
class GeminiProVision(GeminiWrapper):
def generate(self, message, dataset=None):
return super(GeminiProVision, self).generate(message)
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from vlmeval.dataset import DATASET_TYPE
from vlmeval.smp.vlm import encode_image_file_to_base64
class GLMVisionWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str,
retry: int = 5,
wait: int = 5,
key: str = None,
verbose: bool = True,
system_prompt: str = None,
max_tokens: int = 1024,
proxy: str = None,
**kwargs):
self.model = model
self.fail_msg = 'Failed to obtain answer via API. '
self.default_params = {
'top_p': 0.6,
'top_k': 2,
'temperature': 0.8,
'repetition_penalty': 1.1,
'best_of': 1,
'do_sample': True,
'stream': False,
'max_tokens': max_tokens
}
if key is None:
key = os.environ.get('GLMV_API_KEY', None)
assert key is not None, (
'Please set the API Key (obtain it here: '
'https://open.bigmodel.cn/dev/howuse/introduction)'
)
self.key = key
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
def image_to_base64(self, image_path):
import base64
with open(image_path, 'rb') as image_file:
encoded_string = base64.b64encode(image_file.read())
return encoded_string.decode('utf-8')
def build_msgs(self, msgs_raw, system_prompt=None, dataset=None):
msgs = cp.deepcopy(msgs_raw)
content = []
text = ''
for i, msg in enumerate(msgs):
if msg['type'] == 'text':
text += msg['value']
elif msg['type'] == 'image':
content.append(dict(type='image_url', image_url=dict(url=encode_image_file_to_base64(msg['value']))))
if dataset is not None and DATASET_TYPE(dataset) in ['MCQ', 'Y/N']:
text += '\nShort Answer.'
content.append(dict(type='text', text=text))
ret = [dict(role='user', content=content)]
return ret
def generate_inner(self, inputs, **kwargs) -> str:
assert isinstance(inputs, str) or isinstance(inputs, list)
inputs = [inputs] if isinstance(inputs, str) else inputs
messages = self.build_msgs(msgs_raw=inputs, dataset=kwargs.get('dataset', None))
url = 'https://api.chatglm.cn/v1/chat/completions'
headers = {
'Content-Type': 'application/json',
'Request-Id': 'remote-test',
'Authorization': f'Bearer {self.key}'
}
payload = {
'model': self.model,
'messages': messages,
**self.default_params
}
response = requests.post(url, headers=headers, data=json.dumps(payload), verify=False)
output = []
try:
assert response.status_code == 200
for line in response.iter_lines():
data = json.loads(line.decode('utf-8').lstrip('data: '))
output.append(data['choices'][0]['message']['content'])
answer = ''.join(output).replace('</s>', '')
if self.verbose:
self.logger.info(f'inputs: {inputs}\nanswer: {answer}')
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(err)
self.logger.error(f'The input messages are {inputs}.')
return -1, self.fail_msg, ''
class GLMVisionAPI(GLMVisionWrapper):
def generate(self, message, dataset=None):
return super(GLMVisionAPI, self).generate(message, dataset=dataset)
from ..smp import *
import os
import sys
from .base import BaseAPI
APIBASES = {
'OFFICIAL': 'https://api.openai.com/v1/chat/completions',
}
def GPT_context_window(model):
length_map = {
'gpt-4': 8192,
'gpt-4-0613': 8192,
'gpt-4-turbo-preview': 128000,
'gpt-4-1106-preview': 128000,
'gpt-4-0125-preview': 128000,
'gpt-4-vision-preview': 128000,
'gpt-4-turbo': 128000,
'gpt-4-turbo-2024-04-09': 128000,
'gpt-3.5-turbo': 16385,
'gpt-3.5-turbo-0125': 16385,
'gpt-3.5-turbo-1106': 16385,
'gpt-3.5-turbo-instruct': 4096,
}
if model in length_map:
return length_map[model]
else:
return 128000
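# For reference: GPT_context_window('gpt-4-0613') -> 8192 and GPT_context_window('gpt-4-turbo') -> 128000;
# any model name missing from the table falls back to 128000.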
class OpenAIWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'gpt-3.5-turbo-0613',
retry: int = 5,
wait: int = 5,
key: str = None,
verbose: bool = True,
system_prompt: str = None,
temperature: float = 0,
timeout: int = 60,
api_base: str = None,
max_tokens: int = 1024,
img_size: int = 512,
img_detail: str = 'low',
use_azure: bool = False,
**kwargs):
self.model = model
self.cur_idx = 0
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
self.use_azure = use_azure
if 'step-1v' in model:
env_key = os.environ.get('STEPAI_API_KEY', '')
if key is None:
key = env_key
elif 'yi-vision' in model:
env_key = os.environ.get('YI_API_KEY', '')
if key is None:
key = env_key
else:
if use_azure:
env_key = os.environ.get('AZURE_OPENAI_API_KEY', None)
assert env_key is not None, 'Please set the environment variable AZURE_OPENAI_API_KEY. '
if key is None:
key = env_key
assert isinstance(key, str), (
'Please set the environment variable AZURE_OPENAI_API_KEY to your openai key. '
)
else:
env_key = os.environ.get('OPENAI_API_KEY', '')
if key is None:
key = env_key
assert isinstance(key, str) and key.startswith('sk-'), (
f'Illegal openai_key {key}. '
'Please set the environment variable OPENAI_API_KEY to your openai key. '
)
self.key = key
assert img_size > 0 or img_size == -1
self.img_size = img_size
assert img_detail in ['high', 'low']
self.img_detail = img_detail
self.timeout = timeout
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
if use_azure:
api_base_template = (
'{endpoint}openai/deployments/{deployment_name}/chat/completions?api-version={api_version}'
)
endpoint = os.getenv('AZURE_OPENAI_ENDPOINT', None)
assert endpoint is not None, 'Please set the environment variable AZURE_OPENAI_ENDPOINT. '
deployment_name = os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME', None)
assert deployment_name is not None, 'Please set the environment variable AZURE_OPENAI_DEPLOYMENT_NAME. '
api_version = os.getenv('OPENAI_API_VERSION', None)
assert api_version is not None, 'Please set the environment variable OPENAI_API_VERSION. '
self.api_base = api_base_template.format(
endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
deployment_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME'),
api_version=os.getenv('OPENAI_API_VERSION')
)
else:
if api_base is None:
if 'OPENAI_API_BASE' in os.environ and os.environ['OPENAI_API_BASE'] != '':
self.logger.info('Environment variable OPENAI_API_BASE is set. Will use it as api_base. ')
api_base = os.environ['OPENAI_API_BASE']
else:
api_base = 'OFFICIAL'
assert api_base is not None
if api_base in APIBASES:
self.api_base = APIBASES[api_base]
elif api_base.startswith('http'):
self.api_base = api_base
else:
self.logger.error('Unknown API Base. ')
sys.exit(-1)
self.logger.info(f'Using API Base: {self.api_base}; API Key: {self.key}')
# inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
# content can be a string or a list of image & text
def prepare_itlist(self, inputs):
assert np.all([isinstance(x, dict) for x in inputs])
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text':
content_list.append(dict(type='text', text=msg['value']))
elif msg['type'] == 'image':
from PIL import Image
img = Image.open(msg['value'])
b64 = encode_image_to_base64(img, target_size=self.img_size)
img_struct = dict(url=f'data:image/jpeg;base64,{b64}', detail=self.img_detail)
content_list.append(dict(type='image_url', image_url=img_struct))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
content_list = [dict(type='text', text=text)]
return content_list
def prepare_inputs(self, inputs):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(role='system', content=self.system_prompt))
assert isinstance(inputs, list) and isinstance(inputs[0], dict)
assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
if 'role' in inputs[0]:
assert inputs[-1]['role'] == 'user', inputs[-1]
for item in inputs:
input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
else:
input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
return input_msgs
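# Illustrative result (values are placeholders): for a single-turn text+image input, prepare_inputs
# produces an OpenAI-style message list such as
#   [{'role': 'user', 'content': [
#       {'type': 'text', 'text': 'Describe the image.'},
#       {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,<...>', 'detail': 'low'}}]}]
# with an optional leading {'role': 'system', 'content': ...} entry when system_prompt is set.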
def generate_inner(self, inputs, **kwargs) -> str:
input_msgs = self.prepare_inputs(inputs)
temperature = kwargs.pop('temperature', self.temperature)
max_tokens = kwargs.pop('max_tokens', self.max_tokens)
context_window = GPT_context_window(self.model)
max_tokens = min(max_tokens, context_window - self.get_token_len(inputs))
if 0 < max_tokens <= 100:
self.logger.warning(
'Less than 100 tokens left, '
'may exceed the context window with some additional meta symbols. '
)
if max_tokens <= 0:
return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. '
# For Azure we send the raw HTTP request directly; unclear how to route this through the openai client
if self.use_azure:
headers = {'Content-Type': 'application/json', 'api-key': self.key}
else:
headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'}
payload = dict(
model=self.model,
messages=input_msgs,
max_tokens=max_tokens,
n=1,
temperature=temperature,
**kwargs)
response = requests.post(
self.api_base,
headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
try:
resp_struct = json.loads(response.text)
answer = resp_struct['choices'][0]['message']['content'].strip()
except:
pass
return ret_code, answer, response
def get_image_token_len(self, img_path, detail='low'):
import math
if detail == 'low':
return 85
im = Image.open(img_path)
width, height = im.size  # PIL's Image.size is (width, height)
if width > 1024 or height > 1024:
if width > height:
height = int(height * 1024 / width)
width = 1024
else:
width = int(width * 1024 / height)
height = 1024
h = math.ceil(height / 512)
w = math.ceil(width / 512)
total = 85 + 170 * h * w
return total
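# Worked example of the estimate above: a 2048x1024 image in 'high' detail is first scaled so its longer
# side is 1024 (-> 1024x512), giving ceil(512/512) * ceil(1024/512) = 1 * 2 tiles and an estimated
# 85 + 170 * 2 = 425 tokens; any image in 'low' detail costs a flat 85 tokens.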
def get_token_len(self, inputs) -> int:
import tiktoken
try:
enc = tiktoken.encoding_for_model(self.model)
except:
enc = tiktoken.encoding_for_model('gpt-4')
assert isinstance(inputs, list)
tot = 0
for item in inputs:
if 'role' in item:
tot += self.get_token_len(item['content'])
elif item['type'] == 'text':
tot += len(enc.encode(item['value']))
elif item['type'] == 'image':
tot += self.get_image_token_len(item['value'], detail=self.img_detail)
return tot
class GPT4V(OpenAIWrapper):
def generate(self, message, dataset=None):
return super(GPT4V, self).generate(message)
import os
import sys
import os.path as osp
import torch
from ..smp import *
def get_gpu_num(model_name):
model_name = model_name.lower()
kws = {
8: ['65b', '70b'],
4: ['30b', '33b', '35b', '40b'],
2: ['13b', '14b', '20b'],
1: ['6b', '7b', 'moss'],
}
for k in [8, 4, 2, 1]:
for keyword in kws[k]:
if keyword in model_name:
return k
return 8
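# For reference: get_gpu_num('Qwen/Qwen-14B-Chat') -> 2 and get_gpu_num('lmsys/vicuna-7b-v1.5') -> 1,
# while names without a recognized size keyword fall back to 8 GPUs.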
validated_llms = [
'internlm/internlm-chat-7b', 'internlm/internlm-chat-7b-8k', 'internlm/internlm-chat-20b',
'Qwen/Qwen-7B-Chat', 'Qwen/Qwen-14B-Chat',
'THUDM/chatglm2-6b', 'THUDM/chatglm2-6b-32k', 'THUDM/chatglm3-6b', 'THUDM/chatglm3-6b-32k',
'baichuan-inc/Baichuan2-7B-Chat', 'baichuan-inc/Baichuan2-13B-Chat',
'lmsys/vicuna-7b-v1.5', 'lmsys/vicuna-13b-v1.5',
'meta-llama/Llama-2-7b-chat-hf'
]
Auto_model = ['chatglm']
class HFChatModel:
def _get_context_length(self, model, model_path):
# By default, we use model.config.seq_length
model_path = model_path.lower()
if 'baichuan' in model_path:
context_window = model.config.model_max_length
elif 'internlm' in model_path or 'llama' in model_path:
context_window = model.config.max_position_embeddings
elif 'vicuna' in model_path:
context_window = model.generation_config.max_length
else:
# chatglm & qwen
context_window = model.config.seq_length
return context_window
def _get_context_length_robust(self, model, model_path):
try:
context_window = self._get_context_length(model, model_path)
return context_window
except:
self.logger.critical(
'Failed to extract context_window information from config / generation_config. '
'Please read the above code and check whether the logic works for your model path. '
)
raise NotImplementedError
def __init__(self,
model_path,
system_prompt: str = None,
**kwargs):
self.logger = get_logger('HFChatModel')
if 'vicuna' in model_path.lower():
try:
from fastchat.model import get_conversation_template
except:
self.logger.critical('Please install fastchat first to use vicuna. ')
sys.exit(-1)
self.explicit_device = kwargs.pop('device', None)
if self.explicit_device is None:
# If CUDA_VISIBLE_DEVICES is not properly set
if 'CUDA_VISIBLE_DEVICES' not in os.environ or os.environ['CUDA_VISIBLE_DEVICES'] == '0,1,2,3,4,5,6,7':
num_gpu = get_gpu_num(model_path)
gpu_offset = kwargs.pop('gpu_offset', 0)
cuda_visible_devices = ','.join([str(i) for i in range(gpu_offset, gpu_offset + num_gpu)])
os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
from transformers.generation import GenerationConfig
if model_path not in validated_llms:
self.logger.warning(f'{model_path} not in validated LLMs, may have inference troubles. ')
self.model_path = model_path
if listinstr(Auto_model, model_path):
LoadModel = AutoModel
else:
LoadModel = AutoModelForCausalLM
assert osp.exists(model_path) or len(model_path.split('/')) == 2
device = self.explicit_device if self.explicit_device else 'auto'
precision = {}
if 'internlm-chat-7b' in model_path:
precision = {'torch_dtype': torch.float16}
elif 'internlm-chat-20b' in model_path:
precision = {'torch_dtype': torch.bfloat16}
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = LoadModel.from_pretrained(model_path, trust_remote_code=True, device_map='cpu', **precision)
model = model.eval()
if device != 'cpu':
model = model.to(f'cuda:{device}' if isinstance(device, int) else 'cuda')
try:
model.generation_config = GenerationConfig.from_pretrained(
model_path, trust_remote_code=True, device_map=device)
except:
pass
torch.cuda.empty_cache()
self.model = model
self.context_length = self._get_context_length_robust(model=model, model_path=model_path)
self.answer_buffer = 192
self.system_prompt = system_prompt
for k, v in kwargs.items():
self.logger.info(f'Following args will be used for generation (If not set specifically), {k}: {v}. ')
self.kwargs = kwargs
def generate_str(self, input, **kwargs):
if 'baichuan' in self.model_path.lower():
messages = []
messages.append({'role': 'user', 'content': input})
resp = self.model.chat(self.tokenizer, messages, **kwargs)
elif 'vicuna' in self.model_path.lower():
from fastchat.model import get_conversation_template
conv = get_conversation_template('vicuna')
conv.append_message(conv.roles[0], input)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
inputs = self.tokenizer([prompt], return_tensors='pt')
if torch.cuda.is_available():
for k in inputs:
inputs[k] = inputs[k].cuda()
params = dict(do_sample=True, temperature=0.7, repetition_penalty=1.0, max_new_tokens=512)
params.update(self.kwargs)
params.update(kwargs)
outputs = self.model.generate(**inputs, **params)
resp = self.tokenizer.decode(
outputs[0][len(inputs['input_ids'][0]):],
skip_special_tokens=True,
spaces_between_special_tokens=False)
else:
params = self.kwargs.copy()  # copy so per-call kwargs do not leak into self.kwargs
params.update(kwargs)
resp, _ = self.model.chat(self.tokenizer, input, history=[], **params)
return resp
def length_ok(self, inputs):
tot = len(self.tokenizer.encode(self.system_prompt)) if self.system_prompt is not None else 0
for s in inputs:
tot += len(self.tokenizer.encode(s))
return tot + self.answer_buffer < self.context_length
def generate_list(self, full_inputs, offset=0, **kwargs):
assert isinstance(full_inputs, list)
inputs = full_inputs[offset:]
if not self.length_ok(inputs):
return self.generate_list(full_inputs, offset + 1, **kwargs)  # drop the earliest turn and retry
model_path = self.model_path.lower()
if sum([x in model_path for x in ['baichuan']]):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(role='user', content=self.system_prompt))
if len(inputs):
assert isinstance(inputs, list) and isinstance(inputs[0], str)
roles = ['user', 'assistant'] if len(inputs) % 2 == 1 else ['assistant', 'user']
roles = roles * len(inputs)
for role, msg in zip(roles, inputs):
input_msgs.append(dict(role=role, content=msg))
response = self.model.chat(self.tokenizer, input_msgs)
elif sum([x in model_path for x in ['vicuna']]):
from fastchat.model import get_conversation_template
conv = get_conversation_template('vicuna')
assert isinstance(inputs, list) and isinstance(inputs[0], str)
if len(inputs) % 2 == 1:
if self.system_prompt is not None:
conv.append_message(conv.roles[0], self.system_prompt)
for i in range(len(inputs) // 2):
conv.append_message(conv.roles[0], inputs[2 * i])
conv.append_message(conv.roles[1], inputs[2 * i + 1])
else:
assert self.system_prompt is not None
conv.append_message(conv.roles[0], self.system_prompt)
conv.append_message(conv.roles[1], inputs[0])
for i in range(len(inputs) // 2 - 1):
conv.append_message(conv.roles[0], inputs[2 * i + 1])
conv.append_message(conv.roles[1], inputs[2 * i + 2])
conv.append_message(conv.roles[0], inputs[-1])
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
inputs = self.tokenizer([prompt], return_tensors='pt')
if torch.cuda.is_available():
for k in inputs:
inputs[k] = inputs[k].cuda()
params = dict(do_sample=True, temperature=0.7, repetition_penalty=1.0, max_new_tokens=512)
params.update(self.kwargs)
params.update(kwargs)
outputs = self.model.generate(**inputs, **params)
response = self.tokenizer.decode(
outputs[0][len(inputs['input_ids'][0]):],
skip_special_tokens=True,
spaces_between_special_tokens=False)
response = response.lstrip('\n')
else:
# The default option, support internlm, chatglm, qwen
history, msg = [], None
if len(inputs) % 2 == 1:
if self.system_prompt is not None:
history = [(self.system_prompt, '')]
for i in range(len(inputs) // 2):
history.append((inputs[2 * i], inputs[2 * i + 1]))
else:
assert self.system_prompt is not None
history = [(self.system_prompt, inputs[0])]
for i in range(len(inputs) // 2 - 1):
history.append((inputs[2 * i + 1], inputs[2 * i + 2]))
msg = inputs[-1]
params = self.kwargs.copy()  # copy so per-call kwargs do not leak into self.kwargs
params.update(kwargs)
response, _ = self.model.chat(self.tokenizer, msg, history=history, **params)
return response, offset
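# Note (added for clarity, not in the original): generate_list truncates turns from the front whenever the
# conversation no longer fits the context window, and returns both the response and the offset it ended up
# using, e.g.:
#   response, used_offset = model.generate_list(['Q1', 'A1', 'Q2'])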
def generate(self, inputs, **kwargs):
if isinstance(inputs, str):
return self.generate_str(inputs, **kwargs)
elif isinstance(inputs, list):
return self.generate_list(inputs, **kwargs)
from vlmeval.smp import *
import os
import sys
from vlmeval.api.base import BaseAPI
class HunyuanWrapper(BaseAPI):
is_api: bool = True
_apiVersion = '2023-09-01'
_service = 'hunyuan'
def __init__(self,
model: str = 'hunyuan-vision',
retry: int = 5,
wait: int = 5,
secret_key: str = None,
secret_id: str = None,
verbose: bool = True,
system_prompt: str = None,
temperature: float = 0,
timeout: int = 60,
api_base: str = 'hunyuan.tencentcloudapi.com',
**kwargs):
self.model = model
self.cur_idx = 0
self.fail_msg = 'Failed to obtain answer via API. '
self.temperature = temperature
warnings.warn('You may need to set the env variable HUNYUAN_SECRET_ID & HUNYUAN_SECRET_KEY to use Hunyuan. ')
secret_key = os.environ.get('HUNYUAN_SECRET_KEY', secret_key)
assert secret_key is not None, 'Please set the environment variable HUNYUAN_SECRET_KEY. '
secret_id = os.environ.get('HUNYUAN_SECRET_ID', secret_id)
assert secret_id is not None, 'Please set the environment variable HUNYUAN_SECRET_ID. '
self.model = model
self.endpoint = api_base
self.secret_id = secret_id
self.secret_key = secret_key
self.timeout = timeout
try:
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.hunyuan.v20230901 import hunyuan_client
except ImportError:
warnings.warn('Please install tencentcloud-sdk-python to use Hunyuan API. ')
exit(-1)
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
cred = credential.Credential(self.secret_id, self.secret_key)
httpProfile = HttpProfile()
httpProfile.endpoint = self.endpoint
clientProfile = ClientProfile()
clientProfile.httpProfile = httpProfile
self.client = hunyuan_client.HunyuanClient(cred, 'ap-beijing', clientProfile)
self.logger.info(
f'Using Endpoint: {self.endpoint}; API Secret ID: {self.secret_id}; API Secret Key: {self.secret_key}'
)
# inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
# content can be a string or a list of image & text
def prepare_itlist(self, inputs):
assert np.all([isinstance(x, dict) for x in inputs])
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text':
content_list.append(dict(Type='text', Text=msg['value']))
elif msg['type'] == 'image':
from PIL import Image
img = Image.open(msg['value'])
b64 = encode_image_to_base64(img)
img_struct = dict(Url=f'data:image/jpeg;base64,{b64}')
content_list.append(dict(Type='image_url', ImageUrl=img_struct))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
content_list = [dict(Type='text', Text=text)]
return content_list
def prepare_inputs(self, inputs):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(Role='system', Content=self.system_prompt))
assert isinstance(inputs, list) and isinstance(inputs[0], dict)
assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
if 'role' in inputs[0]:
assert inputs[-1]['role'] == 'user', inputs[-1]
for item in inputs:
input_msgs.append(dict(Role=item['role'], Contents=self.prepare_itlist(item['content'])))
else:
input_msgs.append(dict(Role='user', Contents=self.prepare_itlist(inputs)))
return input_msgs
def generate_inner(self, inputs, **kwargs) -> str:
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.hunyuan.v20230901 import models
input_msgs = self.prepare_inputs(inputs)
temperature = kwargs.pop('temperature', self.temperature)
payload = dict(
Model=self.model,
Messages=input_msgs,
Temperature=temperature,
**kwargs)
retry_counter = 0
while retry_counter < 3:
try:
req = models.ChatCompletionsRequest()
req.from_json_string(json.dumps(payload))
resp = self.client.ChatCompletions(req)
resp = json.loads(resp.to_json_string())
answer = resp['Choices'][0]['Message']['Content']
return 0, answer, resp
except TencentCloudSDKException as e:
self.logger.error(f'Got error code: {e.get_code()}')
if e.get_code() == 'ClientNetworkError':
return -1, self.fail_msg + e.get_code(), None
elif e.get_code() in ['InternalError', 'ServerNetworkError']:
if retry_counter == 3:
return -1, self.fail_msg + e.get_code(), None
retry_counter += 1
continue
elif e.get_code() in ['LimitExceeded']:
time.sleep(5)
if retry_counter == 3:
return -1, self.fail_msg + e.get_code(), None
retry_counter += 1
continue
else:
return -1, self.fail_msg + str(e), None
return -1, self.fail_msg, None
class HunyuanVision(HunyuanWrapper):
def generate(self, message, dataset=None):
return super(HunyuanVision, self).generate(message)
from http import HTTPStatus
import os
from vlmeval.api.base import BaseAPI
from vlmeval.smp import *
# Note: This is a pure language model API.
class QwenAPI(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'qwen-max-1201',
retry: int = 5,
wait: int = 5,
verbose: bool = True,
seed: int = 2680,
temperature: float = 0.0,
system_prompt: str = None,
key: str = None,
max_tokens: int = 1024,
proxy: str = None,
**kwargs):
assert model in ['qwen-turbo', 'qwen-plus', 'qwen-max', 'qwen-max-1201', 'qwen-max-longcontext']
self.model = model
import dashscope
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
self.seed = seed
if key is None:
key = os.environ.get('DASHSCOPE_API_KEY', None)
assert key is not None, (
'Please set the API Key (obtain it here: '
'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
)
dashscope.api_key = key
if proxy is not None:
proxy_set(proxy)
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
@staticmethod
def build_msgs(msgs_raw, system_prompt=None):
msgs = cp.deepcopy(msgs_raw)
ret = []
if system_prompt is not None:
ret.append(dict(role='system', content=system_prompt))
for i, msg in enumerate(msgs):
role = 'user' if i % 2 == 0 else 'assistant'
ret.append(dict(role=role, content=msg))
return ret
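# For reference, an assumed two-turn history ['Hi', 'Hello!', 'What is 2+2?'] becomes
#   [{'role': 'user', 'content': 'Hi'}, {'role': 'assistant', 'content': 'Hello!'},
#    {'role': 'user', 'content': 'What is 2+2?'}]
# with an optional leading system message when system_prompt is given.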
def generate_inner(self, inputs, **kwargs) -> str:
from dashscope import MultiModalConversation
assert isinstance(inputs, str) or isinstance(inputs, list)
inputs = [inputs] if isinstance(inputs, str) else inputs
messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt)
import dashscope
response = dashscope.Generation.call(
model=self.model,
messages=messages,
seed=self.seed,
temperature=self.temperature,
max_tokens=self.max_tokens,
result_format='message', # set the result to be "message" format.
)
if response.status_code != HTTPStatus.OK:
return -1, 'Error: Bad Response Status Code. ', f'The response status code is {response.status_code}. '
try:
return 0, response['output']['choices'][0]['message']['content'].strip(), 'Succeeded! '
except Exception as err:
return -1, f'Error: Failed to parse the response. {err}', response
from __future__ import annotations
import os
import warnings
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from vlmeval.vlm.qwen2_vl.prompt import Qwen2VLPromptMixin
def ensure_image_url(image: str) -> str:
prefixes = ['http://', 'https://', 'file://', 'data:image;']
if any(image.startswith(prefix) for prefix in prefixes):
return image
if os.path.exists(image):
return 'file://' + image
raise ValueError(f'Invalid image: {image}')
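# For reference: ensure_image_url('https://x.test/cat.jpg') returns the URL unchanged, while a local path
# such as '/tmp/cat.jpg' becomes 'file:///tmp/cat.jpg' (provided the file exists); anything else raises
# ValueError.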
class Qwen2VLAPI(Qwen2VLPromptMixin, BaseAPI):
is_api: bool = True
def __init__(
self,
model: str = 'qwen-vl-max-0809',
key: str | None = None,
min_pixels: int | None = None,
max_pixels: int | None = None,
max_length=2048,
top_p=0.001,
top_k=1,
temperature=0.01,
repetition_penalty=1.0,
presence_penalty=0.0,
seed=3407,
use_custom_prompt: bool = True,
**kwargs,
):
import dashscope
self.model = model
self.min_pixels = min_pixels
self.max_pixels = max_pixels
self.generate_kwargs = dict(
max_length=max_length,
top_p=top_p,
top_k=top_k,
temperature=temperature,
repetition_penalty=repetition_penalty,
presence_penalty=presence_penalty,
seed=seed,
)
key = os.environ.get('DASHSCOPE_API_KEY', None) if key is None else key
assert key is not None, (
'Please set the API Key (obtain it here: '
'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
)
dashscope.api_key = key
super().__init__(use_custom_prompt=use_custom_prompt, **kwargs)
def _prepare_content(self, inputs: list[dict[str, str]], dataset: str | None = None) -> list[dict[str, str]]:
"""
inputs list[dict[str, str]], each dict has keys: ['type', 'value']
"""
content = []
for s in inputs:
if s['type'] == 'image':
item = {'type': 'image', 'image': ensure_image_url(s['value'])}
if dataset == 'OCRBench':
item['min_pixels'] = 10 * 10 * 28 * 28
warnings.warn(f"OCRBench dataset uses custom min_pixels={item['min_pixels']}")
if self.max_pixels is not None:
item['max_pixels'] = self.max_pixels
else:
if self.min_pixels is not None:
item['min_pixels'] = self.min_pixels
if self.max_pixels is not None:
item['max_pixels'] = self.max_pixels
elif s['type'] == 'text':
item = {'type': 'text', 'text': s['value']}
else:
raise ValueError(f"Invalid message type: {s['type']}, {s}")
content.append(item)
return content
def generate_inner(self, inputs, **kwargs) -> str:
import dashscope
messages = []
if self.system_prompt is not None:
messages.append({'role': 'system', 'content': self.system_prompt})
messages.append(
{'role': 'user', 'content': self._prepare_content(inputs, dataset=kwargs.get('dataset', None))}
)
if self.verbose:
print(f'\033[31m{messages}\033[0m')
# generate
generation_kwargs = self.generate_kwargs.copy()
kwargs.pop('dataset', None)
generation_kwargs.update(kwargs)
try:
response = dashscope.MultiModalConversation.call(
model=self.model,
messages=messages,
**generation_kwargs,
)
if self.verbose:
print(response)
answer = response.output.choices[0]['message']['content'][0]['text']
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(err)
self.logger.error(f'The input messages are {inputs}.')
return -1, '', ''
class QwenVLWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'qwen-vl-plus',
retry: int = 5,
wait: int = 5,
key: str = None,
verbose: bool = True,
temperature: float = 0.0,
system_prompt: str = None,
max_tokens: int = 1024,
proxy: str = None,
**kwargs):
assert model in ['qwen-vl-plus', 'qwen-vl-max']
self.model = model
import dashscope
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
if key is None:
key = os.environ.get('DASHSCOPE_API_KEY', None)
assert key is not None, (
'Please set the API Key (obtain it here: '
'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
)
dashscope.api_key = key
if proxy is not None:
proxy_set(proxy)
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
# inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
# content can be a string or a list of image & text
def prepare_itlist(self, inputs):
assert np.all([isinstance(x, dict) for x in inputs])
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text':
content_list.append(dict(text=msg['value']))
elif msg['type'] == 'image':
content_list.append(dict(image='file://' + msg['value']))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
content_list = [dict(text=text)]
return content_list
def prepare_inputs(self, inputs):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(role='system', content=self.system_prompt))
assert isinstance(inputs, list) and isinstance(inputs[0], dict)
assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
if 'role' in inputs[0]:
assert inputs[-1]['role'] == 'user', inputs[-1]
for item in inputs:
input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
else:
input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
return input_msgs
def generate_inner(self, inputs, **kwargs) -> str:
from dashscope import MultiModalConversation
assert isinstance(inputs, str) or isinstance(inputs, list)
if 'type' in inputs[0]:
pure_text = np.all([x['type'] == 'text' for x in inputs])
else:
pure_text = True
for inp in inputs:
if not np.all([x['type'] == 'text' for x in inp['content']]):
pure_text = False
break
assert not pure_text
messages = self.prepare_inputs(inputs)
gen_config = dict(max_output_tokens=self.max_tokens, temperature=self.temperature)
gen_config.update(kwargs)
try:
response = MultiModalConversation.call(model=self.model, messages=messages)
if self.verbose:
print(response)
answer = response.output.choices[0]['message']['content'][0]['text']
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(err)
self.logger.error(f'The input messages are {inputs}.')
return -1, '', ''
class QwenVLAPI(QwenVLWrapper):
def generate(self, message, dataset=None):
return super(QwenVLAPI, self).generate(message)
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from time import sleep
import mimetypes
class Reka_Wrapper(BaseAPI):
is_api: bool = True
INTERLEAVE: bool = False
def __init__(self,
model: str = 'reka-flash-20240226',
key: str = None,
retry: int = 10,
wait: int = 3,
system_prompt: str = None,
verbose: bool = True,
temperature: float = 0,
max_tokens: int = 1024,
**kwargs):
try:
import reka
except ImportError:
raise ImportError('Please install reka by running "pip install reka-api"')
self.model = model
default_kwargs = dict(temperature=temperature, request_output_len=max_tokens)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
if key is not None:
self.key = key
else:
self.key = os.environ.get('REKA_API_KEY', '')
super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)
def generate_inner(self, inputs, **kwargs) -> str:
import reka
reka.API_KEY = self.key
dataset = kwargs.pop('dataset', None)
prompt, image_path = self.message_to_promptimg(inputs, dataset=dataset)
image_b64 = encode_image_file_to_base64(image_path)
response = reka.chat(
model_name=self.model,
human=prompt,
media_url=f'data:image/jpeg;base64,{image_b64}',
**self.kwargs)
try:
return 0, response['text'], response
except:
return -1, self.fail_msg, response
class Reka(Reka_Wrapper):
def generate(self, message, dataset=None):
return super(Reka_Wrapper, self).generate(message)
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from vlmeval.dataset import img_root_map
from vlmeval.dataset import DATASET_TYPE
class SenseChatVisionWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'SenseChat-5-Vision',
retry: int = 5,
wait: int = 5,
ak: str = None,
sk: str = None,
verbose: bool = True,
system_prompt: str = None,
max_tokens: int = 1024,
proxy: str = None,
**kwargs):
self.model = model
self.fail_msg = 'Failed to obtain answer via API. '
self.ak = os.environ.get('SENSECHAT_AK', None) if ak is None else ak
self.sk = os.environ.get('SENSECHAT_SK', None) if sk is None else sk
assert self.ak is not None and self.sk is not None
self.max_new_tokens = max_tokens
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
def dump_image(self, line, dataset):
"""Dump the image(s) of the input line to the corresponding dataset folder.
Args:
line (line of pd.DataFrame): The raw input line.
dataset (str): The name of the dataset.
Returns:
str | list[str]: The paths of the dumped images.
"""
ROOT = LMUDataRoot()
assert isinstance(dataset, str)
img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset)
os.makedirs(img_root, exist_ok=True)
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
def image_to_base64(self, image_path):
import base64
with open(image_path, 'rb') as image_file:
encoded_string = base64.b64encode(image_file.read())
return encoded_string.decode('utf-8')
def encode_jwt_token(self, ak, sk):
import jwt
headers = {'alg': 'HS256', 'typ': 'JWT'}
payload = {
'iss': ak,
'exp': int(time.time()) + 1800,  # desired expiry time; here, current time + 30 minutes
'nbf': int(time.time()) - 5,  # desired start of validity; here, current time - 5 seconds
}
token = jwt.encode(payload, sk, headers=headers)
return token
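# Illustrative use of the token above (mirrors generate_inner below; requires the PyJWT package):
#   token = self.encode_jwt_token(self.ak, self.sk)
#   headers = {'Content-type': 'application/json', 'Authorization': 'Bearer ' + token}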
def use_custom_prompt(self, dataset):
return True
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += '\n请直接回答选项字母。' if cn_string(
prompt) else "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
return prompt
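# Illustrative example (hypothetical line contents): for a row with question 'Which is larger?',
# hint 'Compare the animals.', A='Cat', B='Whale', the prompt becomes
#   'Compare the animals.\nWhich is larger?\nA. Cat\nB. Whale'
# followed by the English or Chinese "answer with the option's letter" instruction, depending on whether
# the prompt contains Chinese characters.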
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if dataset is not None and listinstr(['MME'], dataset):
question = line['question']
prompt = question + ' Answer the question using a single word or phrase.'
elif dataset is not None and listinstr(['HallusionBench'], dataset):
question = line['question']
prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ' and 'MMMU' not in dataset:
prompt = self.build_multi_choice_prompt(line, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
if 'MathVista' in dataset:
prompt = line['question']
elif listinstr(['LLaVABench'], dataset):
question = line['question']
prompt = question + '\nAnswer this question in detail.'
elif listinstr(['MMVet'], dataset):
prompt = line['question']
else:
question = line['question']
prompt = question + '\nAnswer the question using a single word or phrase.'
elif dataset is not None and 'MMMU' in dataset:
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = {
'multiple-choice': 'You are an expert in {}. Please solve the university-level {} examination question, which includes interleaved images and text. Your output should be divided into two parts: First, reason about the correct answer. Then write the answer in the following format where X is exactly one of the choices given by the problem: "ANSWER: X". If you are uncertain of the correct answer, guess the most likely one.', # noqa: E501
'open': 'You are an expert in {}. Please solve the university-level {} examination question, which includes interleaved images and text. Your output should be divided into two parts: First, reason about the correct answer. Then write the answer in the following format where X is only the answer and nothing else: "ANSWER: X"' # noqa: E501
}
subject = '_'.join(line['id'].split('_')[1:-1])
prompt = prompt[line['question_type']].format(subject, subject) + '\n' + question
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def message_to_promptimg(self, message, dataset=None):
if dataset is None or listinstr(['MMMU', 'BLINK'], dataset):
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = [[x['value'] for x in message if x['type'] == 'image'][0]]
else:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = [x['value'] for x in message if x['type'] == 'image']
return prompt, image
def generate_inner(self, inputs, **kwargs) -> str:
assert isinstance(inputs, str) or isinstance(inputs, list)
inputs = [inputs] if isinstance(inputs, str) else inputs
dataset = kwargs.get('dataset', None)
if dataset is not None and listinstr(['ChartQA_TEST'], dataset):
self.max_num = 12
elif dataset is not None and listinstr(['DocVQA_VAL', 'DocVQA_TEST'], dataset):
self.max_num = 18
elif dataset is not None and listinstr(['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench'], dataset):
self.max_num = 24
else:
self.max_num = 6
if dataset is None:
pass
elif listinstr(['AI2D_TEST'], dataset):
self.max_new_tokens = 10
elif 'MMMU' in dataset:
self.max_new_tokens = 1024
elif 'MMBench' in dataset:
self.max_new_tokens = 100
prompt, image = self.message_to_promptimg(message=inputs, dataset=dataset)
url = 'https://api.sensenova.cn/v1/llm/chat-completions'
api_secret_key = self.encode_jwt_token(self.ak, self.sk)
content = [{
'image_base64': self.image_to_base64(item),
'image_file_id': '',
'image_url': '',
'text': '',
'type': 'image_base64'
} for item in image]
content.append({
'image_base64': '',
'image_file_id': '',
'image_url': '',
'text': prompt,
'type': 'text'
})
message = [{'content': content, 'role': 'user'}]
data = {
'messages': message,
'max_new_tokens': self.max_new_tokens,
'model': self.model,
'stream': False,
}
headers = {
'Content-type': 'application/json',
'Authorization': 'Bearer ' + api_secret_key
}
response = requests.post(
url,
headers=headers,
json=data,
)
request_id = response.headers['x-request-id']
time.sleep(1)
try:
assert response.status_code == 200
response = response.json()['data']['choices'][0]['message'].strip()
if dataset is not None and 'MMMU' in dataset:
response = response.split('ANSWER: ')[-1].strip()
if self.verbose:
self.logger.info(f'inputs: {inputs}\nanswer: {response}')
return 0, response, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error('---------------------------ERROR---------------------------')
self.logger.error(response.json())
self.logger.error(err)
self.logger.error('---------------------------request_id---------------------------' + request_id)
self.logger.error(
'api error' + response.json()['error']['message']
+ str([input['value'] if input['type'] == 'image' else None for input in inputs])
)
self.logger.error(f'The input messages are {inputs}.')
return -1, response.json()['error']['message'], ''
class SenseChatVisionAPI(SenseChatVisionWrapper):
def generate(self, message, dataset=None):
return super(SenseChatVisionAPI, self).generate(message, dataset=dataset)
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
url = 'https://api.stepfun.com/v1/chat/completions'
headers = {
'Content-Type': 'application/json',
'Authorization': 'Bearer {}',
}
class StepAPI_INT(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'step-1v-8k',
retry: int = 10,
wait: int = 3,
key: str = None,
temperature: float = 0,
max_tokens: int = 300,
verbose: bool = True,
system_prompt: str = None,
**kwargs):
self.model = model
self.fail_msg = 'Failed to obtain answer via API. '
self.headers = headers
self.temperature = temperature
self.max_tokens = max_tokens
self.system_prompt = system_prompt
if key is not None:
self.key = key
else:
self.key = os.environ.get('STEPAI_API_KEY', '')
headers['Authorization'] = headers['Authorization'].format(self.key)
super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)
@staticmethod
def build_msgs(msgs_raw):
messages = []
message = {'role': 'user', 'content': []}
for msg in msgs_raw:
if msg['type'] == 'image':
image_b64 = encode_image_file_to_base64(msg['value'])
message['content'].append({
'image_url': {'url': 'data:image/webp;base64,%s' % (image_b64)},
'type': 'image_url'
})
elif msg['type'] == 'text':
message['content'].append({
'text': msg['value'],
'type': 'text'
})
messages.append(message)
return messages
def generate_inner(self, inputs, **kwargs) -> str:
print(inputs, '\n')
payload = dict(
model=self.model,
max_tokens=self.max_tokens,
temperature=self.temperature,
messages=self.build_msgs(msgs_raw=inputs),
**kwargs)
response = requests.post(url, headers=headers, data=json.dumps(payload))
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
try:
resp_struct = json.loads(response.text)
answer = resp_struct['choices'][0]['message']['content'].strip()
except:
pass
return ret_code, answer, response
class Step1V_INT(StepAPI_INT):
def generate(self, message, dataset=None):
return super(StepAPI_INT, self).generate(message)
from vlmeval.vlm import *
from vlmeval.api import *
from functools import partial
PandaGPT_ROOT = None
MiniGPT4_ROOT = None
TransCore_ROOT = None
Yi_ROOT = None
OmniLMM_ROOT = None
Mini_Gemini_ROOT = None
VXVERSE_ROOT = None
VideoChat2_ROOT = None
VideoChatGPT_ROOT = None
PLLaVA_ROOT = None
RBDash_ROOT = None
LLAVA_V1_7B_MODEL_PTH = 'Please set your local path to LLaVA-7B-v1.1 here, the model weight is obtained by merging LLaVA delta weight based on vicuna-7b-v1.1 in https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md with vicuna-7b-v1.1. '
video_models = {
'Video-LLaVA-7B':partial(VideoLLaVA, model_path='LanguageBind/Video-LLaVA-7B'),
'Video-LLaVA-7B-HF':partial(VideoLLaVA_HF, model_path='LanguageBind/Video-LLaVA-7B-hf'),
'VideoChat2-HD':partial(VideoChat2_HD, model_path='OpenGVLab/VideoChat2_HD_stage4_Mistral_7B', root=VideoChat2_ROOT, config_file='./vlmeval/vlm/video_llm/configs/videochat2_hd.json'),
'Chat-UniVi-7B': partial(Chatunivi, model_path="Chat-UniVi/Chat-UniVi"),
'Chat-UniVi-7B-v1.5': partial(Chatunivi, model_path="Chat-UniVi/Chat-UniVi-7B-v1.5"),
'LLaMA-VID-7B': partial(LLaMAVID, model_path='YanweiLi/llama-vid-7b-full-224-video-fps-1'),
'Video-ChatGPT': partial(VideoChatGPT, model_path='MBZUAI/Video-ChatGPT-7B', dir_root=VideoChatGPT_ROOT),
'PLLaVA-7B': partial(PLLaVA, model_path='ermu2001/pllava-7b', dir_root=PLLaVA_ROOT),
'PLLaVA-13B': partial(PLLaVA, model_path='ermu2001/pllava-13b', dir_root=PLLaVA_ROOT),
'PLLaVA-34B': partial(PLLaVA, model_path='ermu2001/pllava-34b', dir_root=PLLaVA_ROOT),
}
ungrouped = {
'TransCore_M': partial(TransCoreM, root=TransCore_ROOT),
'PandaGPT_13B': partial(PandaGPT, name='PandaGPT_13B', root=PandaGPT_ROOT),
'flamingov2': partial(OpenFlamingo, name='v2', mpt_pth='anas-awadalla/mpt-7b', ckpt_pth='openflamingo/OpenFlamingo-9B-vitl-mpt7b'),
'VisualGLM_6b': partial(VisualGLM, model_path='THUDM/visualglm-6b'),
'mPLUG-Owl2': partial(mPLUG_Owl2, model_path='MAGAer13/mplug-owl2-llama2-7b'),
'mPLUG-Owl3': partial(mPLUG_Owl3, model_path='mPLUG/mPLUG-Owl3-7B-240728'),
'emu2_chat': partial(Emu, model_path='BAAI/Emu2-Chat'),
'OmniLMM_12B': partial(OmniLMM12B, model_path='openbmb/OmniLMM-12B', root=OmniLMM_ROOT),
'MGM_7B': partial(Mini_Gemini, model_path='YanweiLi/MGM-7B-HD', root=Mini_Gemini_ROOT),
'Bunny-llama3-8B': partial(BunnyLLama3, model_path='BAAI/Bunny-v1_1-Llama-3-8B-V'),
'VXVERSE': partial(VXVERSE, model_name='XVERSE-V-13B', root=VXVERSE_ROOT),
'paligemma-3b-mix-448': partial(PaliGemma, model_path='google/paligemma-3b-mix-448'),
'360VL-70B': partial(QH_360VL, model_path='qihoo360/360VL-70B'),
'Llama-3-MixSenseV1_1': partial(LLama3Mixsense, model_path='Zero-Vision/Llama-3-MixSenseV1_1'),
'Parrot': partial(Parrot, model_path='AIDC-AI/Parrot-7B'),
'OmChat': partial(OmChat, model_path='omlab/omchat-v2.0-13B-single-beta_hf'),
'RBDash_72b': partial(RBDash, model_path='RBDash-Team/RBDash-v1.2-72b', root=RBDash_ROOT),
'Pixtral-12B': partial(Pixtral, model_path="mistralai/Pixtral-12B-2409")
}
api_models = {
# GPT
'GPT4V': partial(GPT4V, model='gpt-4-1106-vision-preview', temperature=0, img_size=512, img_detail='low', retry=10),
'GPT4V_HIGH': partial(GPT4V, model='gpt-4-1106-vision-preview', temperature=0, img_size=-1, img_detail='high', retry=10),
'GPT4V_20240409': partial(GPT4V, model='gpt-4-turbo-2024-04-09', temperature=0, img_size=512, img_detail='low', retry=10),
'GPT4V_20240409_HIGH': partial(GPT4V, model='gpt-4-turbo-2024-04-09', temperature=0, img_size=-1, img_detail='high', retry=10),
'GPT4o': partial(GPT4V, model='gpt-4o-2024-05-13', temperature=0, img_size=512, img_detail='low', retry=10),
'GPT4o_HIGH': partial(GPT4V, model='gpt-4o-2024-05-13', temperature=0, img_size=-1, img_detail='high', retry=10),
'GPT4o_20240806': partial(GPT4V, model='gpt-4o-2024-08-06', temperature=0, img_size=-1, img_detail='high', retry=10),
'GPT4o_MINI': partial(GPT4V, model='gpt-4o-mini-2024-07-18', temperature=0, img_size=-1, img_detail='high', retry=10),
# Gemini
'GeminiPro1-0': partial(GeminiProVision, model='gemini-1.0-pro', temperature=0, retry=10), # now GeminiPro1-0 is only supported by vertex backend
'GeminiPro1-5': partial(GeminiProVision, model='gemini-1.5-pro', temperature=0, retry=10),
'GeminiFlash1-5': partial(GeminiProVision, model='gemini-1.5-flash', temperature=0, retry=10),
# Qwen-VL
'QwenVLPlus': partial(QwenVLAPI, model='qwen-vl-plus', temperature=0, retry=10),
'QwenVLMax': partial(QwenVLAPI, model='qwen-vl-max', temperature=0, retry=10),
# Reka
'RekaEdge': partial(Reka, model='reka-edge-20240208'),
'RekaFlash': partial(Reka, model='reka-flash-20240226'),
'RekaCore': partial(Reka, model='reka-core-20240415'),
# Step1V
'Step1V': partial(GPT4V, model='step-1v-8k', api_base="https://api.stepfun.com/v1/chat/completions", temperature=0, retry=10, img_detail='high'),
# Yi-Vision
'Yi-Vision': partial(GPT4V, model='yi-vision', api_base="https://api.lingyiwanwu.com/v1/chat/completions", temperature=0, retry=10),
# Claude
'Claude3V_Opus': partial(Claude3V, model='claude-3-opus-20240229', temperature=0, retry=10),
'Claude3V_Sonnet': partial(Claude3V, model='claude-3-sonnet-20240229', temperature=0, retry=10),
'Claude3V_Haiku': partial(Claude3V, model='claude-3-haiku-20240307', temperature=0, retry=10),
'Claude3-5V_Sonnet': partial(Claude3V, model='claude-3-5-sonnet-20240620', temperature=0, retry=10),
# GLM4V
'GLM4V': partial(GLMVisionAPI, model='glm4v-biz-eval', temperature=0, retry=10),
# CongRong
'CloudWalk': partial(CWWrapper, model='cw-congrong-v1.5', temperature=0, retry=10),
# SenseChat-V
'SenseChat-5-Vision': partial(SenseChatVisionAPI, model='SenseChat-5-Vision', temperature=0, retry=10),
'HunYuan-Vision': partial(HunyuanVision, model='hunyuan-vision', temperature=0, retry=10),
# BlueLM-V
"BlueLM_V": partial(BlueLM_V_API, model='BlueLM-VL-v3.0', temperature=0, retry=10)
}
mmalaya_series = {
'MMAlaya': partial(MMAlaya, model_path='DataCanvas/MMAlaya'),
'MMAlaya2': partial(MMAlaya2, model_path='DataCanvas/MMAlaya2'),
}
minicpm_series = {
'MiniCPM-V': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V'),
'MiniCPM-V-2': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V-2'),
'MiniCPM-Llama3-V-2_5': partial(MiniCPM_Llama3_V, model_path='openbmb/MiniCPM-Llama3-V-2_5'),
'MiniCPM-V-2_6': partial(MiniCPM_V_2_6, model_path='openbmb/MiniCPM-V-2_6'),
}
xtuner_series = {
'llava-internlm2-7b': partial(LLaVA_XTuner, llm_path='internlm/internlm2-chat-7b', llava_path='xtuner/llava-internlm2-7b', visual_select_layer=-2, prompt_template='internlm2_chat'),
'llava-internlm2-20b': partial(LLaVA_XTuner, llm_path='internlm/internlm2-chat-20b', llava_path='xtuner/llava-internlm2-20b', visual_select_layer=-2, prompt_template='internlm2_chat'),
'llava-internlm-7b': partial(LLaVA_XTuner, llm_path='internlm/internlm-chat-7b', llava_path='xtuner/llava-internlm-7b', visual_select_layer=-2, prompt_template='internlm_chat'),
'llava-v1.5-7b-xtuner': partial(LLaVA_XTuner, llm_path='lmsys/vicuna-7b-v1.5', llava_path='xtuner/llava-v1.5-7b-xtuner', visual_select_layer=-2, prompt_template='vicuna'),
'llava-v1.5-13b-xtuner': partial(LLaVA_XTuner, llm_path='lmsys/vicuna-13b-v1.5', llava_path='xtuner/llava-v1.5-13b-xtuner', visual_select_layer=-2, prompt_template='vicuna'),
'llava-llama-3-8b': partial(LLaVA_XTuner, llm_path='xtuner/llava-llama-3-8b-v1_1', llava_path='xtuner/llava-llama-3-8b-v1_1', visual_select_layer=-2, prompt_template='llama3_chat'),
}
qwen_series = {
'qwen_base': partial(QwenVL, model_path='Qwen/Qwen-VL'),
'qwen_chat': partial(QwenVLChat, model_path='Qwen/Qwen-VL-Chat'),
'monkey': partial(Monkey, model_path='echo840/Monkey'),
'monkey-chat': partial(MonkeyChat, model_path='echo840/Monkey-Chat'),
'minimonkey': partial(MiniMonkey, model_path='mx262/MiniMokney')
}
llava_series = {
'llava_v1.5_7b': partial(LLaVA, model_path='liuhaotian/llava-v1.5-7b'),
'llava_v1.5_13b': partial(LLaVA, model_path='liuhaotian/llava-v1.5-13b'),
'llava_v1_7b': partial(LLaVA, model_path=LLAVA_V1_7B_MODEL_PTH),
'sharegpt4v_7b': partial(LLaVA, model_path='Lin-Chen/ShareGPT4V-7B'),
'sharegpt4v_13b': partial(LLaVA, model_path='Lin-Chen/ShareGPT4V-13B'),
'llava_next_vicuna_7b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-vicuna-7b-hf'),
'llava_next_vicuna_13b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-vicuna-13b-hf'),
'llava_next_mistral_7b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-mistral-7b-hf'),
'llava_next_yi_34b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-34b-hf'),
'llava_next_llama3': partial(LLaVA_Next, model_path='llava-hf/llama3-llava-next-8b-hf'),
'llava_next_72b': partial(LLaVA_Next, model_path='llava-hf/llava-next-72b-hf'),
'llava_next_110b': partial(LLaVA_Next, model_path='llava-hf/llava-next-110b-hf'),
'llava_next_qwen_32b': partial(LLaVA_Next2, model_path='lmms-lab/llava-next-qwen-32b'),
'llava_next_interleave_7b': partial(LLaVA_Next, model_path='llava-hf/llava-interleave-qwen-7b-hf'),
'llava_next_interleave_7b_dpo': partial(LLaVA_Next, model_path='llava-hf/llava-interleave-qwen-7b-dpo-hf'),
'llava_onevision_qwen2_0.5b_si': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-0.5b-si'),
'llava_onevision_qwen2_7b_si': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-7b-si'),
'llava_onevision_qwen2_72b_si': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-72b-si'),
'llava_onevision_qwen2_0.5b_ov': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-0.5b-ov'),
'llava_onevision_qwen2_7b_ov': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-7b-ov'),
'llava_onevision_qwen2_72b_ov': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-72b-ov'),
}
internvl_series = {
'InternVL-Chat-V1-1': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-1', version='V1.1'),
'InternVL-Chat-V1-2': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-2', version='V1.2'),
'InternVL-Chat-V1-2-Plus': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-2-Plus', version='V1.2'),
'InternVL-Chat-V1-5': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-5', version='V1.5'),
'Mini-InternVL-Chat-2B-V1-5': partial(InternVLChat, model_path='OpenGVLab/Mini-InternVL-Chat-2B-V1-5', version='V1.5'),
'Mini-InternVL-Chat-4B-V1-5': partial(InternVLChat, model_path='OpenGVLab/Mini-InternVL-Chat-4B-V1-5', version='V1.5'),
# InternVL2 series
'InternVL2-1B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-1B', version='V2.0'),
'InternVL2-2B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-2B', version='V2.0'),
'InternVL2-4B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-4B', version='V2.0'),
'InternVL2-8B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-8B', version='V2.0'),
'InternVL2-26B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-26B', version='V2.0'),
'InternVL2-40B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-40B', version='V2.0', load_in_8bit=True),
'InternVL2-76B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-Llama3-76B', version='V2.0'),
}
yivl_series = {
'Yi_VL_6B': partial(Yi_VL, model_path='01-ai/Yi-VL-6B', root=Yi_ROOT),
'Yi_VL_34B': partial(Yi_VL, model_path='01-ai/Yi-VL-34B', root=Yi_ROOT),
}
xcomposer_series = {
'XComposer': partial(XComposer, model_path='internlm/internlm-xcomposer-vl-7b'),
'sharecaptioner': partial(ShareCaptioner, model_path='Lin-Chen/ShareCaptioner'),
'XComposer2': partial(XComposer2, model_path='internlm/internlm-xcomposer2-vl-7b'),
'XComposer2_1.8b': partial(XComposer2, model_path='internlm/internlm-xcomposer2-vl-1_8b'),
'XComposer2_4KHD': partial(XComposer2_4KHD, model_path='internlm/internlm-xcomposer2-4khd-7b'),
'XComposer2d5': partial(XComposer2d5, model_path='internlm/internlm-xcomposer2d5-7b'),
}
minigpt4_series = {
'MiniGPT-4-v2': partial(MiniGPT4, mode='v2', root=MiniGPT4_ROOT),
'MiniGPT-4-v1-7B': partial(MiniGPT4, mode='v1_7b', root=MiniGPT4_ROOT),
'MiniGPT-4-v1-13B': partial(MiniGPT4, mode='v1_13b', root=MiniGPT4_ROOT),
}
idefics_series = {
'idefics_9b_instruct': partial(IDEFICS, model_path='HuggingFaceM4/idefics-9b-instruct'),
'idefics_80b_instruct': partial(IDEFICS, model_path='HuggingFaceM4/idefics-80b-instruct'),
'idefics2_8b': partial(IDEFICS2, model_path='HuggingFaceM4/idefics2-8b'),
# Idefics3 follows Idefics2 Pattern
'Idefics3-8B-Llama3': partial(IDEFICS2, model_path='HuggingFaceM4/Idefics3-8B-Llama3'),
}
instructblip_series = {
'instructblip_7b': partial(InstructBLIP, name='instructblip_7b'),
'instructblip_13b': partial(InstructBLIP, name='instructblip_13b'),
}
deepseekvl_series = {
'deepseek_vl_7b': partial(DeepSeekVL, model_path='deepseek-ai/deepseek-vl-7b-chat'),
'deepseek_vl_1.3b': partial(DeepSeekVL, model_path='deepseek-ai/deepseek-vl-1.3b-chat'),
}
cogvlm_series = {
'cogvlm-grounding-generalist': partial(CogVlm, model_path='THUDM/cogvlm-grounding-generalist-hf', tokenizer_name='lmsys/vicuna-7b-v1.5'),
'cogvlm-chat': partial(CogVlm, model_path='THUDM/cogvlm-chat-hf', tokenizer_name='lmsys/vicuna-7b-v1.5'),
'cogvlm2-llama3-chat-19B': partial(CogVlm, model_path='THUDM/cogvlm2-llama3-chat-19B'),
'glm-4v-9b': partial(GLM4v, model_path='THUDM/glm-4v-9b')
}
wemm_series = {
'WeMM': partial(WeMM, model_path='feipengma/WeMM'),
}
cambrian_series = {
'cambrian_8b': partial(Cambrian, model_path='nyu-visionx/cambrian-8b'),
'cambrian_13b': partial(Cambrian, model_path='nyu-visionx/cambrian-13b'),
'cambrian_34b': partial(Cambrian, model_path='nyu-visionx/cambrian-34b'),
}
chameleon_series = {
'chameleon_7b': partial(Chameleon, model_path='facebook/chameleon-7b'),
'chameleon_30b': partial(Chameleon, model_path='facebook/chameleon-30b'),
}
vila_series = {
'VILA1.5-3b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-3b'),
'Llama-3-VILA1.5-8b': partial(VILA, model_path='Efficient-Large-Model/Llama-3-VILA1.5-8b'),
'VILA1.5-13b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-13b'),
'VILA1.5-40b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-40b'),
}
ovis_series = {
'Ovis1.5-Llama3-8B': partial(Ovis, model_path='AIDC-AI/Ovis1.5-Llama3-8B'),
'Ovis1.5-Gemma2-9B': partial(Ovis, model_path='AIDC-AI/Ovis1.5-Gemma2-9B'),
'Ovis1.6-Gemma2-9B': partial(Ovis1_6, model_path='AIDC-AI/Ovis1.6-Gemma2-9B')
}
mantis_series = {
'Mantis-8B-siglip-llama3': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-siglip-llama3'),
'Mantis-8B-clip-llama3': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-clip-llama3'),
'Mantis-8B-Idefics2': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-Idefics2'),
'Mantis-8B-Fuyu': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-Fuyu')
}
phi3_series = {
'Phi-3-Vision': partial(Phi3Vision, model_path='microsoft/Phi-3-vision-128k-instruct'),
'Phi-3.5-Vision': partial(Phi3_5Vision, model_path='microsoft/Phi-3.5-vision-instruct')
}
xgen_mm_series = {
'xgen-mm-phi3-interleave-r-v1.5': partial(XGenMM, model_path='Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5'),
'xgen-mm-phi3-dpo-r-v1.5': partial(XGenMM, model_path='Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5'),
}
qwen2vl_series = {
'Qwen-VL-Max-0809': partial(Qwen2VLAPI, model='qwen-vl-max-0809', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen-VL-Plus-0809': partial(Qwen2VLAPI, model='qwen-vl-plus-0809', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct-AWQ': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-AWQ', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct-GPTQ-Int4': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-7B-Instruct-GPTQ-Int8': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-2B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-2B-Instruct-AWQ': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct-AWQ', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-2B-Instruct-GPTQ-Int4': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4', min_pixels=1280*28*28, max_pixels=16384*28*28),
'Qwen2-VL-2B-Instruct-GPTQ-Int8': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8', min_pixels=1280*28*28, max_pixels=16384*28*28),
}
slime_series = {
'Slime-7B': partial(SliME, model_path='yifanzhang114/SliME-vicuna-7B'),
'Slime-8B': partial(SliME, model_path='yifanzhang114/SliME-Llama3-8B'),
'Slime-13B': partial(SliME, model_path='yifanzhang114/SliME-vicuna-13B'),
}
eagle_series={
'Eagle-X4-8B-Plus': partial(Eagle, model_path='NVEagle/Eagle-X4-8B-Plus'),
'Eagle-X4-13B-Plus': partial(Eagle, model_path='NVEagle/Eagle-X4-13B-Plus'),
'Eagle-X5-7B': partial(Eagle, model_path='NVEagle/Eagle-X5-7B'),
'Eagle-X5-13B': partial(Eagle, model_path='NVEagle/Eagle-X5-13B'),
'Eagle-X5-13B-Chat': partial(Eagle, model_path='NVEagle/Eagle-X5-13B-Chat'),
'Eagle-X5-34B-Chat': partial(Eagle, model_path='NVEagle/Eagle-X5-34B-Chat'),
'Eagle-X5-34B-Plus': partial(Eagle, model_path='NVEagle/Eagle-X5-34B-Plus'),
}
moondream_series={
'Moondream1': partial(Moondream1, model_path='vikhyatk/moondream1'),
'Moondream2': partial(Moondream2, model_path='vikhyatk/moondream2'),
}
supported_VLM = {}
model_groups = [
ungrouped, api_models,
xtuner_series, qwen_series, llava_series, internvl_series, yivl_series,
xcomposer_series, minigpt4_series, idefics_series, instructblip_series,
deepseekvl_series, minicpm_series, cogvlm_series, wemm_series,
cambrian_series, chameleon_series, video_models, ovis_series, vila_series,
mantis_series, mmalaya_series, phi3_series, xgen_mm_series, qwen2vl_series,
slime_series, eagle_series, moondream_series
]
for grp in model_groups:
supported_VLM.update(grp)
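# Usage sketch (names are illustrative): every entry in supported_VLM is a partial constructor, so a
# model can be instantiated and queried as, e.g.
#   model = supported_VLM['llava_v1.5_7b']()
#   answer = model.generate([dict(type='image', value='demo.jpg'), dict(type='text', value='Describe the image.')])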
import warnings
from .image_base import img_root_map, ImageBaseDataset
from .image_caption import ImageCaptionDataset
from .image_yorn import ImageYORNDataset
from .image_mcq import (
ImageMCQDataset, MMMUDataset, CustomMCQDataset, MUIRDataset, GMAIMMBenchDataset, MMERealWorld, HRBenchDataset
)
from .image_mt import MMDUDataset
from .image_vqa import (
ImageVQADataset, MathVision, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
CustomVQADataset, CRPE, MathVerse
)
from .vcr import VCRDataset
from .mmlongbench import MMLongBench
from .dude import DUDE
from .slidevqa import SlideVQA
from .mmbench_video import MMBenchVideo
from .text_mcq import CustomTextMCQDataset, TextMCQDataset
from .videomme import VideoMME
from .mvbench import MVBench, MVBench_MP4
from .utils import *
from ..smp import *
class ConcatDataset(ImageBaseDataset):
    # This dataset takes multiple dataset names as input and aggregates them into a single dataset.
    # None of the sub-datasets should have a field named `SUB_DATASET`.
DATASET_SETS = {
'MMMB': ['MMMB_ar', 'MMMB_cn', 'MMMB_en', 'MMMB_pt', 'MMMB_ru', 'MMMB_tr'],
'MTL_MMBench_DEV': [
'MMBench_dev_ar', 'MMBench_dev_cn', 'MMBench_dev_en',
'MMBench_dev_pt', 'MMBench_dev_ru', 'MMBench_dev_tr'
]
}
def __init__(self, dataset):
datasets = self.DATASET_SETS[dataset]
self.dataset_map = {}
        # The name of the compilation
self.dataset_name = dataset
self.datasets = datasets
for dname in datasets:
dataset = build_dataset(dname)
assert dataset is not None, dataset
self.dataset_map[dname] = dataset
TYPES = [x.TYPE for x in self.dataset_map.values()]
MODALITIES = [x.MODALITY for x in self.dataset_map.values()]
assert np.all([x == TYPES[0] for x in TYPES]), (datasets, TYPES)
assert np.all([x == MODALITIES[0] for x in MODALITIES]), (datasets, MODALITIES)
self.TYPE = TYPES[0]
self.MODALITY = MODALITIES[0]
data_all = []
for dname in datasets:
data = self.dataset_map[dname].data
data['SUB_DATASET'] = [dname] * len(data)
data_new = localize_df(data, dname, nproc=16)
data_all.append(data_new)
data = pd.concat(data_all)
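        # Keep each record's per-dataset index as `original_index` and assign a fresh global `index`,
        # so build_prompt / evaluate can route a row back to the sub-dataset it came from.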
data['original_index'] = data.pop('index')
data['index'] = np.arange(len(data))
self.data = data
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
idx = line['original_index']
dname = line['SUB_DATASET']
org_data = self.dataset_map[dname].data
org_line = cp.deepcopy(org_data[org_data['index'] == idx]).iloc[0]
return self.dataset_map[dname].build_prompt(org_line)
def dump_image(self, line):
# Assert all images are pre-dumped
assert 'image' not in line
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
@classmethod
def supported_datasets(cls):
return list(cls.DATASET_SETS)
def evaluate(self, eval_file, **judge_kwargs):
suffix = eval_file.split('.')[-1]
# First, split the eval_file by dataset
data_all = load(eval_file)
for dname in self.datasets:
tgt = eval_file.replace(self.dataset_name, dname)
data_sub = data_all[data_all['SUB_DATASET'] == dname]
data_sub.pop('index')
data_sub['index'] = data_sub.pop('original_index')
data_sub.pop('SUB_DATASET')
dump(data_sub, tgt)
# Then, evaluate each dataset separately
results_all = []
for dname in self.datasets:
tgt = eval_file.replace(self.dataset_name, dname)
res = self.dataset_map[dname].evaluate(tgt, **judge_kwargs)
assert isinstance(res, pd.DataFrame)
res['DATASET'] = [dname] * len(res)
results_all.append(res)
result = pd.concat(results_all)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(result, score_file)
return result
# Add new supported dataset class here
IMAGE_DATASET = [
ImageCaptionDataset, ImageYORNDataset, ImageMCQDataset, ImageVQADataset, MathVision,
MMMUDataset, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset,
GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, CRPE, MathVerse
]
VIDEO_DATASET = [
MMBenchVideo, VideoMME, MVBench, MVBench_MP4
]
TEXT_DATASET = [
TextMCQDataset
]
CUSTOM_DATASET = [
CustomMCQDataset, CustomVQADataset, CustomTextMCQDataset
]
DATASET_COLLECTION = [ConcatDataset]
DATASET_CLASSES = IMAGE_DATASET + VIDEO_DATASET + TEXT_DATASET + CUSTOM_DATASET + DATASET_COLLECTION
SUPPORTED_DATASETS = []
for DATASET_CLS in DATASET_CLASSES:
SUPPORTED_DATASETS.extend(DATASET_CLS.supported_datasets())
def DATASET_TYPE(dataset, *, default: str = 'MCQ') -> str:
for cls in DATASET_CLASSES:
if dataset in cls.supported_datasets():
if hasattr(cls, 'TYPE'):
return cls.TYPE
# Have to add specific routine to handle ConcatDataset
if dataset in ConcatDataset.DATASET_SETS:
dataset_list = ConcatDataset.DATASET_SETS[dataset]
TYPES = [DATASET_TYPE(dname) for dname in dataset_list]
assert np.all([x == TYPES[0] for x in TYPES]), (dataset_list, TYPES)
return TYPES[0]
if 'openended' in dataset.lower():
return 'VQA'
warnings.warn(f'Dataset {dataset} is a custom one and not annotated as `openended`, will treat as {default}. ')
return default
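# Example (sketch): DATASET_TYPE('MMBench_DEV_EN') resolves to 'MCQ' via ImageMCQDataset, a custom name
# containing 'openended' falls back to 'VQA', and any other unknown name returns `default`.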
def build_dataset(dataset_name, **kwargs):
for cls in DATASET_CLASSES:
if dataset_name in cls.supported_datasets():
return cls(dataset=dataset_name, **kwargs)
warnings.warn(f'Dataset {dataset_name} is not officially supported. ')
data_file = osp.join(LMUDataRoot(), f'{dataset_name}.tsv')
if not osp.exists(data_file):
warnings.warn(f'Data file {data_file} does not exist. Dataset building failed. ')
return None
data = load(data_file)
if 'question' not in [x.lower() for x in data.columns]:
warnings.warn(f'Data file {data_file} does not have a `question` column. Dataset building failed. ')
return None
if 'A' in data and 'B' in data:
if 'image' in data or 'image_path' in data:
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom MCQ dataset. ')
return CustomMCQDataset(dataset=dataset_name, **kwargs)
else:
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom Text MCQ dataset. ')
return CustomTextMCQDataset(dataset=dataset_name, **kwargs)
else:
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom VQA dataset. ')
return CustomVQADataset(dataset=dataset_name, **kwargs)
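# Usage sketch: build_dataset('MMBench_DEV_EN') returns an ImageMCQDataset instance; an unrecognized name
# falls back to a local TSV under LMUDataRoot() and is wrapped as a custom MCQ / text-MCQ / VQA dataset.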
__all__ = [
'build_dataset', 'img_root_map', 'build_judge', 'extract_answer_from_item', 'prefetch_answer', 'DEBUG_MESSAGE'
] + [cls.__name__ for cls in DATASET_CLASSES]
import math
from typing import List
from .utils.judge_util import build_judge
from .image_base import ImageBaseDataset
from .mmlongbench import concat_images, MMLongBench_auxeval, anls_compute
from ..smp import *
FAIL_MSG = 'Failed to obtain answer via API.'
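# DUDE is scored with ANLS (Average Normalized Levenshtein Similarity): NaN answers are mapped to
# 'Not answerable', both answer and prediction are lower-cased, and the per-sample scores are averaged.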
def DUDE_acc(result_file):
data = load(result_file)
overall_score = 0.0
score_list = list()
for i in range(len(data)):
item = data.iloc[i]
if isinstance(item['answer'], float) and math.isnan(item['answer']):
item['answer'] = 'Not answerable'
item['answer'] = item['answer'].lower()
item['pred'] = item['pred'].lower()
score = anls_compute(item['answer'], item['pred'])
score_list.append(score)
overall_score += score
data['score'] = score_list
dump(data, result_file)
res = dict()
res['category'], res['num'], res['avg_score'] = ['anls'], [len(data)], [overall_score / len(data)]
res = pd.DataFrame(res)
return res
class DUDE(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'DUDE': 'https://opencompass.openxlab.space/utils/VLMEval/DUDE.tsv',
'DUDE_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/DUDE_MINI.tsv',
}
DATASET_MD5 = {
'DUDE': '130d860d08206e1e407cd77150c10d88',
'DUDE_MINI': 'e0c0d998114f0cca7516d12039d2b538',
}
SUPPORTED_MODELS = {
'GPT4': (1, 1),
'GPT4V': (1, 1),
'GPT4V_HIGH': (1, 1),
'GPT4o': (1, 1),
'GPT4o_HIGH': (1, 1),
'GPT4o_MINI': (1, 1),
'XComposer2d5': (1, -1),
'XComposer2_4KHD': (1, -1),
'MiniCPM-Llama3-V-2_5': (1, 5),
'InternVL-Chat-V1-5': (5, 2),
}
def __init__(self, dataset, **kwargs):
self.model_list = list(self.SUPPORTED_MODELS.keys())
model_name = kwargs['model']
if not listinstr(self.model_list, model_name):
raise AssertionError("{} doesn't support the evaluation on DUDE.".format(model_name))
super(DUDE, self).__init__(dataset)
self.is_api = True if listinstr(['GPT4'], model_name) else False
self.max_pages = 120
concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
self.concat_num = concat_num
self.column_num = column_num
def prepare_tsv(self, url, file_md5=None):
data_root = LMUDataRoot()
os.makedirs(data_root, exist_ok=True)
file_name = url.split('/')[-1]
data_path = osp.join(data_root, file_name)
if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
pass
else:
warnings.warn('The dataset tsv is not downloaded')
download_file(url, data_path)
return load(data_path)
def dump_image(self, origin_line):
os.makedirs(self.img_root, exist_ok=True)
try:
import fitz
        except ImportError:
warnings.warn('Please use `pip install pymupdf` to parse PDF files.')
line = origin_line.copy()
if not isinstance(line['image_path'], List):
line['image_path'] = [line['image_path']]
line['image_path'] = line['image_path'][:self.max_pages]
skip_pdf_parse = True
for im_name in line['image_path']:
path = osp.join(self.img_root, im_name)
if not read_ok(path):
skip_pdf_parse = False
break
        # Just for being compatible with the zipped loop: zip(line['image'], line['image_path'])
if skip_pdf_parse:
line['image'] = line['image_path']
else:
pdf_data = base64.b64decode(line['image'])
pdf_file = io.BytesIO(pdf_data)
encoded_images = []
with fitz.open(stream=pdf_file, filetype='pdf') as doc:
doc = doc[:self.max_pages]
for page in doc:
image = page.get_pixmap(dpi=144)
image_file = io.BytesIO(image.tobytes(output='png'))
image = Image.open(image_file)
encoded_image = encode_image_to_base64(image)
encoded_images.append(encoded_image)
line['image'] = encoded_images
print('process {}'.format(line['doc_id']))
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(self.img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
if self.concat_num > 0 and not self.is_api:
concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)
old_tgt_path = tgt_path
assert isinstance(old_tgt_path, list)
if self.column_num != -1:
tgt_path = [
'_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
for i in range(len(concatenated_images))
]
else:
tgt_path = ['_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all.jpg']
for path, concatenated_image in zip(tgt_path, concatenated_images):
if not read_ok(path):
decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
num_images, image_size = len(old_tgt_path), concatenated_image.size
print('concat {} images to a new one with size {}. save at {}'.format(num_images, image_size, path))
return tgt_path
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
logger = get_logger('Evaluation')
model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
if osp.exists(storage):
logger.warning(f'GPT scoring file {storage} already exists, will reuse it in DUDE_eval. ')
else:
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
if len(indices):
new_results = list()
for model, line in tqdm(tups):
res = MMLongBench_auxeval(model, line)
new_results.append(res)
log_map, res_map, pred_map = {}, {}, {}
all_inds = [line['index'] for line in lines]
for k, v in zip(all_inds, new_results):
log_map[k] = v['log']
res_map[k] = v['res']
pred_map[k] = v['pred']
data['res'] = [res_map[idx] for idx in data['index']]
data['log'] = [log_map[idx] for idx in data['index']]
data['pred'] = [pred_map[idx] for idx in data['index']]
dump(data, storage)
score = DUDE_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
dump(score, score_pth)
logger.info(f'DUDE successfully finished evaluating {eval_file}, results saved in {score_pth}')
logger.info('Score: ')
logger.info(score)
import pandas as pd
from abc import abstractmethod
from ..smp import *
def img_root_map(dataset):
if 'CRPE' in dataset:
return 'CRPE'
if 'OCRVQA' in dataset:
return 'OCRVQA'
if 'COCO_VAL' == dataset:
return 'COCO'
if 'MMMU' in dataset:
return 'MMMU'
mmbench_root_map = {
'MMBench_DEV_EN': 'MMBench', 'MMBench_TEST_EN': 'MMBench',
'MMBench_DEV_CN': 'MMBench', 'MMBench_TEST_CN': 'MMBench',
'MMBench': 'MMBench', 'MMBench_CN': 'MMBench',
'MMBench_DEV_EN_V11': 'MMBench_V11', 'MMBench_TEST_EN_V11': 'MMBench_V11',
'MMBench_DEV_CN_V11': 'MMBench_V11', 'MMBench_TEST_CN_V11': 'MMBench_V11',
'MMBench_V11': 'MMBench', 'MMBench_CN_V11': 'MMBench',
}
if dataset in mmbench_root_map:
return mmbench_root_map[dataset]
return dataset
class ImageBaseDataset:
MODALITY = 'IMAGE'
DATASET_URL = {}
DATASET_MD5 = {}
def __init__(self, dataset='MMBench', skip_noimg=True):
ROOT = LMUDataRoot()
# You can override this variable to save image files to a different directory
self.dataset_name = dataset
self.img_root = osp.join(ROOT, 'images', img_root_map(dataset))
data = self.load_data(dataset)
self.skip_noimg = skip_noimg
if skip_noimg and 'image' in data:
data = data[~pd.isna(data['image'])]
data['index'] = [str(x) for x in data['index']]
self.meta_only = True
# The image field can store the base64 encoded image or another question index (for saving space)
if 'image' in data:
data['image'] = [str(x) for x in data['image']]
image_map = {x: y for x, y in zip(data['index'], data['image'])}
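            # Values shorter than 64 characters are treated as references to another index rather than
            # base64 data (space-saving deduplication); resolve them to the actual encoded image.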
for k in image_map:
if len(image_map[k]) <= 64:
idx = image_map[k]
assert idx in image_map and len(image_map[idx]) > 64
image_map[k] = image_map[idx]
images = [toliststr(image_map[k]) for k in data['index']]
data['image'] = [x[0] if len(x) == 1 else x for x in images]
self.meta_only = False
if 'image_path' in data:
paths = [toliststr(x) for x in data['image_path']]
data['image_path'] = [x[0] if len(x) == 1 else x for x in paths]
if np.all([istype(x, int) for x in data['index']]):
data['index'] = [int(x) for x in data['index']]
self.data = data
self.post_build(dataset)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
return dict(self.data.iloc[idx])
def prepare_tsv(self, url, file_md5=None):
data_root = LMUDataRoot()
os.makedirs(data_root, exist_ok=True)
update_flag = False
file_name = url.split('/')[-1]
data_path = osp.join(data_root, file_name)
if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
pass
else:
warnings.warn('The dataset tsv is not downloaded')
download_file(url, data_path)
update_flag = True
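        # TSVs larger than 1 GB are converted to a localized copy via vlmeval.tools.LOCALIZE; the
        # conversion reruns when FORCE_LOCAL is set or the file was just (re)downloaded.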
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
def dump_image(self, line):
os.makedirs(self.img_root, exist_ok=True)
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(self.img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
def display(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
assert isinstance(line, pd.Series) or isinstance(line, dict)
mmqa_display(line)
# Return a list of dataset names that are supported by this class, can override
@classmethod
def supported_datasets(cls):
return list(cls.DATASET_URL)
# Given the dataset name, return the dataset as a pandas dataframe, can override
def load_data(self, dataset):
url = self.DATASET_URL[dataset]
file_md5 = self.DATASET_MD5[dataset] if dataset in self.DATASET_MD5 else None
return self.prepare_tsv(url, file_md5)
    # Post-build hook, called after the dataset is built, can override
def post_build(self, dataset):
pass
# Given one data record, return the built prompt (a multi-modal message), can override
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=question))
return msgs
# Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
@abstractmethod
def evaluate(self, eval_file, **judge_kwargs):
pass
from .image_base import ImageBaseDataset
from ..smp import *
class COCO_Caption_Scorer():
def __init__(self, ref, gt):
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
self.ref = ref
self.gt = gt
print('setting up scorers...')
self.scorers = [
(Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']),
(Rouge(), 'ROUGE_L'),
(Cider(), 'CIDEr'),
]
def compute_scores(self):
total_scores = {}
for scorer, method in self.scorers:
print('computing %s score...' % (scorer.method()))
score, scores = scorer.compute_score(self.gt, self.ref)
if isinstance(method, list):
for sc, scs, m in zip(score, scores, method):
print('%s: %0.3f' % (m, sc * 100))
total_scores['Bleu'] = [x * 100 for x in score]
else:
print('%s: %0.3f' % (method, score * 100))
total_scores[method] = score * 100
print('*****DONE*****')
for key, value in total_scores.items():
print('{}:{}'.format(key, value))
return total_scores
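# Usage sketch (hypothetical captions): both `ref` (predictions) and `gt` (references) map an id to a
# list of strings, e.g.
#   COCO_Caption_Scorer(ref={'0': ['a dog on the grass']}, gt={'0': ['a dog runs on the grass']}).compute_scores()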
class ImageCaptionDataset(ImageBaseDataset):
TYPE = 'Caption'
DATASET_URL = {
'COCO_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/COCO_VAL.tsv',
}
DATASET_MD5 = {
'COCO_VAL': '72a5079dead060269ac222c5aa5128af',
}
def load_data(self, dataset):
data = super().load_data(dataset)
if 'question' not in data:
data['question'] = [(
'Please describe this image in general. Directly provide the description, '
'do not include prefix like "This image depicts". '
)] * len(data)
return data
# It returns a dictionary of scores
@classmethod
def evaluate(self, eval_file, **kwargs):
data = load(eval_file)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
ref, gt = {}, {}
for i, line in enumerate(lines):
ref[str(i)] = [str(line['prediction'])]
gt[str(i)] = eval(line['answer'])
scorer = COCO_Caption_Scorer(ref, gt)
coco_caption_score_dict = scorer.compute_scores()
score_pth = eval_file.replace('.xlsx', '_score.json')
dump(coco_caption_score_dict, score_pth)
return coco_caption_score_dict
import warnings
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..smp import *
MMMB_URLS = {
'MMMB_ar': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_ar.tsv',
'MMMB_cn': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_cn.tsv',
'MMMB_en': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_en.tsv',
'MMMB_pt': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_pt.tsv',
'MMMB_ru': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_ru.tsv',
'MMMB_tr': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_tr.tsv',
}
MTL_MMBench_URLS = {
'MMBench_dev_ar': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_ar.tsv',
'MMBench_dev_cn': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_cn.tsv',
'MMBench_dev_en': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_en.tsv',
'MMBench_dev_pt': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_pt.tsv',
'MMBench_dev_tr': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_tr.tsv',
'MMBench_dev_ru': 'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_ru.tsv',
}
MMMB_MD5 = {
'MMMB_ar': 'f3a18b6385f1d9701840aa42de27aead', 'MMMB_cn': '13ed82fa89730037292fcaa27f08f430',
'MMMB_en': '1cd781a71ec5a2983c090b84105d6a01', 'MMMB_pt': '548ea2b3bb2da991790386f0015d30d1',
'MMMB_ru': 'ce1cc8a0533425ab0d86b326ebfc2984', 'MMMB_tr': '0733739d43090327975294292bc5cd67'
}
MTL_MMBench_MD5 = {
'MMBench_dev_ar': '4271b4a0d0200e1a86380a878e0d64a4', 'MMBench_dev_cn': '2ed5135326fed02c8e51ea50dda8222f',
'MMBench_dev_en': 'd9ab776fc018b3d45785e9a5c23431c2', 'MMBench_dev_pt': '4ddfbcd27ef12444b908c03831cd0295',
'MMBench_dev_tr': '4fab39d501389d3d6cc90264bb708f11', 'MMBench_dev_ru': '5ba1171ff2e68f80637bf78349e402a5'
}
class ImageMCQDataset(ImageBaseDataset):
TYPE = 'MCQ'
DATASET_URL = {
# MMBench v1.0
'MMBench_DEV_EN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_EN.tsv',
'MMBench_TEST_EN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_EN.tsv',
'MMBench_DEV_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_CN.tsv',
'MMBench_TEST_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_CN.tsv',
'MMBench': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench.tsv', # Internal Only
'MMBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN.tsv', # Internal Only
# MMBench v1.1
'MMBench_DEV_EN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_EN_V11.tsv',
'MMBench_TEST_EN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_EN_V11.tsv',
'MMBench_DEV_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_CN_V11.tsv',
'MMBench_TEST_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_CN_V11.tsv',
'MMBench_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_V11.tsv', # Internal Only
'MMBench_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN_V11.tsv', # Internal Only
# SEEDBench Series
'SEEDBench_IMG': 'https://opencompass.openxlab.space/utils/VLMEval/SEEDBench_IMG.tsv',
'SEEDBench2': 'https://huggingface.co/datasets/VLMEval/SEEDBench2/resolve/main/SEEDBench2.tsv',
'SEEDBench2_Plus': 'https://opencompass.openxlab.space/utils/VLMEval/SEEDBench2_Plus.tsv',
# ScienceQA Series
'ScienceQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/ScienceQA_VAL.tsv',
'ScienceQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/ScienceQA_TEST.tsv',
# MMT-Bench
'MMT-Bench_ALL_MI': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_ALL_MI.tsv',
'MMT-Bench_ALL': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_ALL.tsv',
'MMT-Bench_VAL_MI': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_VAL_MI.tsv',
'MMT-Bench_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_VAL.tsv',
# AesBench
'AesBench_VAL': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_VAL.tsv',
'AesBench_TEST': 'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_TEST.tsv',
# Q-Bench1
'Q-Bench1_VAL': 'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_VAL.tsv',
'Q-Bench1_TEST': 'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_TEST.tsv',
# A-Bench
'A-Bench_VAL': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_VAL.tsv',
'A-Bench_TEST': 'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_TEST.tsv',
# Other Benchmarks
'CCBench': 'https://opencompass.openxlab.space/utils/VLMEval/CCBench.tsv',
'AI2D_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST.tsv',
'AI2D_TEST_NO_MASK': 'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST_NO_MASK.tsv',
'MMStar': 'https://opencompass.openxlab.space/utils/VLMEval/MMStar.tsv',
'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv',
'MLLMGuard_DS': 'https://opencompass.openxlab.space/utils/VLMEval/MLLMGuard_DS.tsv',
'BLINK': 'https://opencompass.openxlab.space/utils/VLMEval/BLINK.tsv',
'TaskMeAnything_v1_imageqa_random': (
'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random/'
'resolve/main/TaskMeAnything-v1-imageqa-random.tsv'
)
}
DATASET_MD5 = {
# MMBench v1.0
'MMBench_DEV_EN': 'b6caf1133a01c6bb705cf753bb527ed8',
'MMBench_TEST_EN': '6939fadb0ce626fefc0bdc9c64efc528',
'MMBench_DEV_CN': '08b8fc3324a5ed74155350f57be69fbd',
'MMBench_TEST_CN': '7e1239baf0ee4c8b513e19705a0f317e',
'MMBench': '4115aea3383f3dd0083be6a633e0f820', # Internal Only
'MMBench_CN': '2e053ffc90ea598b1feae13c36dc13ee', # Internal Only
# MMBench v1.1
'MMBench_DEV_EN_V11': '30c05be8f2f347a50be25aa067248184',
'MMBench_TEST_EN_V11': '26f0f15381a21720255091d3e0316ce6',
'MMBench_DEV_CN_V11': '593f9b5f6bea453d870a798b34ae4f37',
'MMBench_TEST_CN_V11': '74bbe4556dac745613c7cbe5ad787050',
'MMBench_V11': 'b9276414f57af1308dcc4d0cd9b42e7c', # Internal Only
'MMBench_CN_V11': '95f6980dd1b4de38e3cbffe0305a3f25', # Internal Only
# SEEDBench
'SEEDBench_IMG': '68017231464752261a2526d6ca3a10c0',
'SEEDBench2': '4ec15cf864c4f16274112284f531813e',
'SEEDBench2_Plus': 'e32d3216dc4f452b0fe497a52015d1fd',
# ScienceQA
'ScienceQA_VAL': '96320d05e142e585e7204e72affd29f3',
'ScienceQA_TEST': 'e42e9e00f9c59a80d8a5db35bc32b71f',
# MMT-Bench
'MMT-Bench_ALL_MI': '5272157097e19cdd7cb41e412ab3b7c7',
'MMT-Bench_ALL': 'b273a2f4c596fe4f2605de0494cd632f',
'MMT-Bench_VAL_MI': 'c7d7b998eb5cd9aa36c7d4f721472462',
'MMT-Bench_VAL': '8dd4b730f53dbf9c3aed90ca31c928e0',
# AesBench
'AesBench_VAL': '3edb0c319e9187aa0b97fe7a11700a8c',
'AesBench_TEST': '58b1f7ba2cc32e1d68896d6ee716bbf8',
# Q-Bench1
'Q-Bench1_VAL': '837bdb6cd2da571713543462815187b7',
'Q-Bench1_TEST': '15e759bfd58c9d5f30b23a317d347153',
# A-Bench
'A-Bench_VAL': '218563ec50d34bb336c814143a5bb9c1',
'A-Bench_TEST': '567013fb033a20cf23f51d8e865bd16c',
# Other Benchmarks
'CCBench': 'f5dde47f24dc5a6fb6e595b409b466ac',
'AI2D_TEST': '0f593e0d1c7df9a3d69bf1f947e71975',
'AI2D_TEST_NO_MASK': 'fd8f463634d4fe9fbd23b876e8eea5be',
'MMStar': 'e1ecd2140806c1b1bbf54b43372efb9e',
'RealWorldQA': '92321028d2bc29040284b6674721e48f',
'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f',
'BLINK': '3b6649b6a662184ea046908e5506260e',
'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889'
}
DATASET_URL.update(MMMB_URLS)
DATASET_URL.update(MTL_MMBench_URLS)
DATASET_MD5.update(MMMB_MD5)
DATASET_MD5.update(MTL_MMBench_MD5)
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
prompt += 'Please select the correct answer from the options above. \n'
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import report_acc, report_acc_MMT, mcq_circular_eval, mcq_vanilla_eval
# assert dataset is not None
dataset_map = {
'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11',
'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11'
}
dataset = self.dataset_name
if dataset in dataset_map:
dataset = dataset_map[dataset]
nproc = judge_kwargs.pop('nproc', 4)
circular = False
if listinstr(['mmbench', 'ccbench'], dataset.lower()):
data = load(eval_file)
data['index'] = [int(x) for x in data['index']]
dump(data, eval_file)
circular = True
suffix = eval_file.split('.')[-1]
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
        # Lower-case all column names except the single-letter choice labels (A, B, ...)
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
meta = self.data
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
)
if circular:
data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name)
else:
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
# load split
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
# May have different report acc functions for different datasets
if 'MMT' in dataset:
acc = report_acc_MMT(data)
else:
acc = report_acc(data)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(acc, score_file)
if dataset == 'AesBench_VAL':
            warnings.warn(
                'Note that AesBench VAL is just a toy version of AesBench TEST. For full results, '
                'please evaluate on AesBench TEST. The AesBench TEST dataset is more than 20 times '
                'larger than the VAL dataset and the leaderboard results are based on AesBench TEST.'
            )
return acc
class MMMUDataset(ImageMCQDataset):
DATASET_URL = {
'MMMU_DEV_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv',
'MMMU_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_TEST.tsv',
}
DATASET_MD5 = {
'MMMU_DEV_VAL': '521afc0f3bf341e6654327792781644d',
'MMMU_TEST': 'c19875d11a2d348d07e5eb4bdf33166d',
}
@staticmethod
def split_MMMU(msgs):
text, images = None, []
for s in msgs:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
assert text is None
text = s['value']
text_segs = text.split('<image ')
if len(text_segs) == 1:
return msgs
segs = [dict(type='text', value=text_segs[0])]
for i, seg in enumerate(text_segs):
if i == 0:
continue
assert istype(seg[0], int) and seg[1] == '>'
image_idx = int(seg[0]) - 1
segs.append(dict(type='image', value=images[image_idx]))
segs.append(dict(type='text', value=seg[2:]))
return segs
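    # split_MMMU example (sketch): a prompt 'Compare <image 1> and <image 2>.' is rebuilt as
    # [text 'Compare ', image_1, text ' and ', image_2, text '.'], interleaving each image at its marker.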
def build_prompt(self, line):
msgs = super().build_prompt(line)
msgs = self.split_MMMU(msgs)
return msgs
class MUIRDataset(ImageMCQDataset):
DATASET_URL = {
'MUIRBench': 'http://opencompass.openxxlab.com/utils/VLMEval/MUIRBench.tsv'
}
DATASET_MD5 = {
'MUIRBench': '2e5e6fd7699761b08a7cb3ab8c0c2ec8'
}
@staticmethod
def split_MUIR(msgs):
text, images = None, []
# Separate images and text from msgs
for s in msgs:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
assert text is None # Ensure only one text entry is expected
text = s['value']
# Split text by <image> tags
text_segs = text.split('<image>')
# Initialize the segments list
segs = []
# Iterate through the text segments and images
for i, seg in enumerate(text_segs):
# Append the image if this is not the first segment and there are still images left
if i > 0 and i - 1 < len(images):
segs.append(dict(type='image', value=images[i - 1]))
# Append the text segment (if it's non-empty)
if len(seg) > 0:
segs.append(dict(type='text', value=seg))
return segs
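    # split_MUIR example (sketch): 'What differs between <image> and <image>?' becomes
    # [text 'What differs between ', image_1, text ' and ', image_2, text '?'], one image per <image> tag.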
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
# options_prompt = ''
options_prompt = '\n'.join([f'{key}. {item}' for key, item in options.items()])
# for key, item in options.items():
# options_prompt += f'{key}. {item}\n'
prompt = ''
prompt += f'{question}\n'
if len(options):
prompt += options_prompt
prompt += "\nAnswer with the option's letter from the given choices directly."
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
msgs = self.split_MUIR(msgs)
return msgs
class GMAIMMBenchDataset(ImageMCQDataset):
DATASET_URL = {
'GMAI-MMBench_VAL': 'https://huggingface.co/datasets/VLMEval/GMAI-MMBench/resolve/main/GMAI-MMBench_VAL.tsv'
}
DATASET_MD5 = {
'GMAI-MMBench_VAL': '254bd581627866f1c499d3d6b4422324'
}
def report_acc_by_groups(self, df, group_column):
res = defaultdict(list)
# Check for the 'split' column
if 'split' in df:
splits = list(set(df['split']))
res['split'] = splits
else:
df['split'] = ['none'] * len(df)
res['split'] = ['none']
res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']]
if group_column not in df:
raise ValueError(f"Column '{group_column}' not found in dataframe.")
abilities = list(set(df[group_column]))
abilities = ['None' if isinstance(ab, float) and pd.isna(ab) else ab for ab in abilities]
abilities.sort()
for ab in abilities:
ab_name = ab
sub_df = df[df[group_column] == ab]
res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]
return pd.DataFrame(res)
def evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import report_acc, mcq_vanilla_eval
nproc = judge_kwargs.pop('nproc', 4)
suffix = eval_file.split('.')[-1]
model = judge_kwargs.get('model', 'exact_matching')
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
        # Lower-case all column names except the single-letter choice labels (A, B, ...)
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
meta = self.data
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
)
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
# load split
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
acc = report_acc(data)
for group_col in ['clinical vqa task', 'department', 'perceptual granularity']:
acc_grouped = self.report_acc_by_groups(data, group_col)
score_file_grouped = eval_file.replace(f'.{suffix}', f'_{group_col}_acc.csv')
dump(acc_grouped, score_file_grouped)
return acc
class MMERealWorld(ImageMCQDataset):
TYPE = 'MMERealWorld'
DATASET_MD5 = {
'MME-RealWorld': '271c33ec814c39533c467ec6fb8a6f36',
'MME-RealWorld-CN': 'daaa763d52a760a38606d5dedb3fe444',
}
SYS = {
'MME-RealWorld': (
'Select the best answer to the above multiple-choice question based on the image. '
'Respond with only the letter (A, B, C, D, or E) of the correct option. \n'
'The best answer is:'
),
'MME-RealWorld-CN': (
'根据图像选择上述多项选择题的最佳答案。只需回答正确选项的字母(A, B, C, D 或 E)。\n'
'最佳答案为:'
),
}
@classmethod
def supported_datasets(cls):
return ['MME-RealWorld', 'MME-RealWorld-CN']
def load_data(self, dataset='MME-RealWorld', repo_id='yifanzhang114/MME-RealWorld-Base64'):
def check_integrity(pth):
data_file = osp.join(pth, f'{dataset}.tsv')
if not os.path.exists(data_file):
return False
if md5(data_file) != self.DATASET_MD5[dataset]:
return False
return True
def generate_tsv(pth):
tsv_file = os.path.join(pth, f'{dataset}.tsv')
if os.path.exists(tsv_file):
print(f'{tsv_file} already exists.')
return
json_dir = os.path.join(pth, dataset)
json_files = [f for f in os.listdir(json_dir) if f.endswith('.json')]
data_list = []
for json_file in json_files:
with open(os.path.join(json_dir, json_file), 'r') as f:
data = json.load(f)
for item in tqdm(data):
choice_prompt = 'The choices are listed below:\n' if dataset == 'MME-RealWorld' else '选项如下所示:\n'
data_list.append({
'index': item['index'],
'image': item['image'],
'question': item['question'],
'multi-choice options': choice_prompt + '\n'.join(item['multi-choice options']),
'A': item['multi-choice options'][0][4:],
'B': item['multi-choice options'][1][4:],
'C': item['multi-choice options'][2][4:],
'D': item['multi-choice options'][3][4:],
'E': item['multi-choice options'][4][4:],
'answer': item['answer'],
'category': item['category'],
'l2-category': item['l2-category']
})
df = pd.DataFrame(data_list)
df.to_csv(tsv_file, sep='\t', index=False)
print(f'TSV file saved to {tsv_file}')
# Check if dataset is cached and has integrity
update_flag = False
cache_path = get_cache_path(repo_id)
if cache_path is not None and check_integrity(cache_path):
dataset_path = cache_path
print(f'Using cached dataset from {cache_path}')
else:
from huggingface_hub import snapshot_download
# Download or find the dataset path
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
generate_tsv(dataset_path)
update_flag = True
data_path = os.path.join(dataset_path, f'{dataset}.tsv')
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
from vlmeval.tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
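    # A minimal usage sketch (constructor signature assumed from the ImageMCQDataset
    # parent; load_data is normally invoked by the base-class constructor):
    #   ds = MMERealWorld('MME-RealWorld')
    #   df = ds.data   # DataFrame built from the TSV generated / cached above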
def post_build(self, dataset):
self.TYPE = 'MMERealWorld'
    # Given one data record, return the built prompt (a multi-modal message); subclasses may override this.
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
question = line['question']
choice_prompt = line['multi-choice options'] + '\n'
question += ' ' + choice_prompt + self.SYS[self.dataset_name]
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=question))
return msgs
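    # Illustrative shape of the list returned by build_prompt above (values are
    # made up, not taken from the dataset):
    #   [{'type': 'image', 'value': '/path/to/xxx.jpg'},
    #    {'type': 'text', 'value': 'What is ...? The choices are listed below:\n(A) ... The best answer is:'}]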
    # Returns a dictionary of aggregated ratings (see get_dimension_rating)
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.multiple_choice import extract_characters_regex, get_dimension_rating
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
FAIL_MSG = 'Failed to obtain answer via API.'
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
tgt_file = eval_file.replace('.xlsx', '_rating.json')
score_file = eval_file.replace('.xlsx', '_score.xlsx')
if not osp.exists(score_file):
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if FAIL_MSG not in v}
data = load(eval_file)
cnt_rejected = 0
data_un = data[~pd.isna(data['prediction'])]
for idx in data['index']:
ans = data.loc[data['index'] == idx, 'answer'].values[0]
pred = data.loc[data['index'] == idx, 'prediction'].values[0]
extract_pred = extract_characters_regex(pred)
if extract_pred == '':
cnt_rejected += 1
data.loc[data['index'] == idx, 'score'] = 0
else:
data.loc[data['index'] == idx, 'score'] = int(extract_pred == ans)
print(
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
f'failed to obtain the score for another {cnt_rejected} questions. '
f'Those questions will be counted as 0 score in ALL rating.'
)
dump(data, score_file)
rating = get_dimension_rating(score_file)
dump(rating, tgt_file)
return rating
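        # Files produced by this evaluation (paths derived from eval_file):
        #   *_score.xlsx  -- per-question 0/1 scores; missing or unparsable predictions count as 0
        #   *_rating.json -- aggregated ratings returned by get_dimension_rating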
class HRBenchDataset(ImageMCQDataset):
DATASET_URL = {
'HRBench4K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_4k.tsv',
'HRBench8K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_8k.tsv',
}
DATASET_MD5 = {
'HRBench4K': 'f6b041b03d49543494b8a56d2e35be65',
'HRBench8K': '274c9c7f89329b804a4723178a00219c',
}
def evaluate(self, eval_file, **judge_kwargs):
assert os.path.exists(eval_file), '{} does not exist!'.format(eval_file)
from .utils.multiple_choice import mcq_vanilla_eval
from .utils.hrbench import report_acc_hrbench
nproc = judge_kwargs.pop('nproc', 4)
suffix = eval_file.split('.')[-1]
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
else:
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
        # Lowercase all column names except single-letter choice labels (A, B, C, ...)
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
meta = self.data
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
if osp.exists(score_file):
acc = load(score_file)
return acc
data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
acc = report_acc_hrbench(data)
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
dump(acc, score_file)
return acc
class CustomMCQDataset(ImageMCQDataset):
def load_data(self, dataset):
data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
if file_size(data_path, 'GB') > 1:
local_path = data_path.replace('.tsv', '_local.tsv')
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
from ..tools import LOCALIZE
LOCALIZE(data_path, local_path)
data_path = local_path
return load(data_path)
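    # A minimal usage sketch (the path is an assumption: LMUDataRoot() typically
    # resolves to ~/LMUData unless overridden, so a custom set named 'MyMCQ' is
    # read from ~/LMUData/MyMCQ.tsv):
    #   ds = CustomMCQDataset('MyMCQ')
    #   print(ds.data.iloc[0]['question'])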
from .image_base import ImageBaseDataset
from .utils.judge_util import build_judge
from ..smp import *
from ..utils import track_progress_rich
class ImageMTDataset(ImageBaseDataset):
TYPE = 'MT'
def build_prompt(self, line):
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
questions = toliststr(line['question'])
if 'answer' in line:
answers = toliststr(line['answer'])
else:
answers = [''] * len(questions)
assert len(questions) == len(answers)
dlgs, pics_number = [], 0
for i in range(len(questions)):
q, a = questions[i], answers[i]
if '<ImageHere>' in q:
content = []
tag_number = q.count('<ImageHere>')
images = tgt_path[pics_number: pics_number + tag_number]
pics_number += tag_number
q_split = q.split('<ImageHere>')
                for j in range(tag_number):
                    qsp, im = q_split[j], images[j]
if qsp != '':
content.append(dict(type='text', value=qsp))
content.append(dict(type='image', value=im))
if q_split[-1] != '':
content.append(dict(type='text', value=q_split[-1]))
else:
content = [dict(type='text', value=q)]
dlgs.append(dict(role='user', content=content))
assert '<ImageHere>' not in a, 'We currently do not support images in the answer. '
content = [dict(type='text', value=a)]
dlgs.append(dict(role='assistant', content=content))
return dlgs
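    # Illustrative shape of the dialog list returned above for a one-turn question
    # 'Describe <ImageHere> briefly.' (values are made up):
    #   [{'role': 'user', 'content': [{'type': 'text', 'value': 'Describe '},
    #                                 {'type': 'image', 'value': '/path/to/img.jpg'},
    #                                 {'type': 'text', 'value': ' briefly.'}]},
    #    {'role': 'assistant', 'content': [{'type': 'text', 'value': '<answer text>'}]}]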
class MMDUDataset(ImageMTDataset):
DATASET_URL = {'MMDU': 'https://opencompass.openxlab.space/utils/VLMEval/MMDU.tsv'}
DATASET_MD5 = {'MMDU': '848b635a88a078f49aebcc6e39792061'}
DIMS = [
'Creativity', 'Richness', 'Visual Perception', 'Logical Coherence',
'Answer Accuracy', 'Image Relationship Understanding', 'Overall Score'
]
def calculat_metric(self, ans):
all = defaultdict(lambda: 0)
tot = defaultdict(lambda: 0)
valid = defaultdict(lambda: 0)
        for key in ans:
            res = ans[key]['res']
assert isinstance(res, pd.DataFrame)
lt = len(res)
for i in range(lt):
line = res.iloc[i]
for k in self.DIMS:
tot[k] += 1
if k in line and line[k] is not None:
try:
score = int(line[k])
score = np.clip(score, 0, 10)
all[k] += score
valid[k] += 1
except Exception as e:
print(f'Failed to parse the score: {str(e)}')
sp1 = {'set': 'all'}
sp1.update({k: all[k] / tot[k] * 10 for k in self.DIMS})
sp2 = {'set': 'valid'}
sp2.update({k: all[k] / valid[k] * 10 for k in self.DIMS})
return pd.DataFrame([sp1, sp2])
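    # Illustrative output of calculat_metric (numbers are made up): 'all' averages over
    # every judged answer, 'valid' only over answers whose score parsed; per-sample
    # scores are clipped to 0-10, so the * 10 rescales the averages to a 0-100 range.
    #        set  Creativity  Richness  ...  Overall Score
    #   0    all        62.5      58.0  ...           60.3
    #   1  valid        66.1      61.2  ...           63.8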
def evaluate(self, eval_file, **judge_kwargs):
suffix = eval_file.split('.')[-1]
        model = judge_kwargs.get('model', 'gpt-4o')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
        nproc = judge_kwargs.pop('nproc', 4)
        data = load(eval_file)
        judge_kwargs.pop('model', None)
        judge_model = build_judge(model=model, **judge_kwargs)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(judge_model, line) for line in lines]
indices = [line['index'] for line in lines]
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]
from .utils.mmdu import mmdu_score
if len(indices):
new_results = track_progress_rich(
mmdu_score,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,)
ans = load(tmp_file)
for k, v in zip(indices, new_results):
assert k in ans
metric = self.calculat_metric(ans)
dump(metric, score_file)
return metric