Commit bc5ebf0f authored by luopl

Initial commit
try:
import torch
except ImportError:
pass
from .smp import *
from .api import *
from .dataset import *
from .utils import *
from .vlm import *
from .config import *
from .tools import cli
load_env()
__version__ = '0.2rc1'
from .gpt import OpenAIWrapper, GPT4V
from .hf_chat_model import HFChatModel
from .gemini import GeminiWrapper, GeminiProVision
from .qwen_vl_api import QwenVLWrapper, QwenVLAPI, Qwen2VLAPI
from .qwen_api import QwenAPI
from .claude import Claude_Wrapper, Claude3V
from .reka import Reka
from .glm_vision import GLMVisionAPI
from .cloudwalk import CWWrapper
from .sensechat_vision import SenseChatVisionAPI
from .siliconflow import SiliconFlowAPI, TeleMMAPI
from .hunyuan import HunyuanVision
from .bailingmm import bailingMMAPI
from .bluelm_v_api import BlueLMWrapper, BlueLM_V_API
from .jt_vl_chat import JTVLChatAPI
from .taiyi import TaiyiAPI
from .lmdeploy import LMDeployAPI
from .taichu import TaichuVLAPI
__all__ = [
'OpenAIWrapper', 'HFChatModel', 'GeminiWrapper', 'GPT4V',
'GeminiProVision', 'QwenVLWrapper', 'QwenVLAPI', 'QwenAPI',
'Claude3V', 'Claude_Wrapper', 'Reka', 'GLMVisionAPI',
'CWWrapper', 'SenseChatVisionAPI', 'HunyuanVision', 'Qwen2VLAPI',
'BlueLMWrapper', 'BlueLM_V_API', 'JTVLChatAPI', 'bailingMMAPI',
'TaiyiAPI', 'TeleMMAPI', 'SiliconFlowAPI', 'LMDeployAPI',
'TaichuVLAPI'
]
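
# Note (illustrative, not part of the original file): apart from the local HFChatModel,
# the names exported here wrap remote services behind the shared BaseAPI.generate()
# interface. Construct the wrapper (the API key is typically read from an environment
# variable) and call .generate() with a plain string or a list of dicts such as
#     [dict(type='image', value='/path/to/demo.jpg'), dict(type='text', value='Describe it.')]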
import base64
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from vlmeval.dataset import DATASET_TYPE
from vlmeval.smp.vlm import encode_image_file_to_base64
import time
class bailingMMWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str,
retry: int = 5,
wait: int = 5,
key: str = None,
verbose: bool = True,
system_prompt: str = None,
max_tokens: int = 1024,
proxy: str = None,
**kwargs):
self.model = model
self.fail_msg = 'Failed to obtain answer via bailingMM API.'
if key is None:
key = os.environ.get('BAILINGMM_API_KEY', None)
assert key is not None, ('Please set the API Key for bailingMM.')
self.key = key
self.headers = {"Content-Type": "application/json"}
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
def image_to_base64(self, image_path):
with open(image_path, 'rb') as image_file:
encoded_string = str(base64.b64encode(image_file.read()), 'utf-8')
return encoded_string
def prepare_inputs(self, inputs):
msgs = cp.deepcopy(inputs)
content = []
for i, msg in enumerate(msgs):
if msg['type'] == 'text':
pass
else:
try:
image_data = self.image_to_base64(msg['value'])
except Exception as e:
if self.verbose:
self.logger.error(e)
image_data = ''
msg['value'] = image_data
content.append(msg)
return content
def generate_inner(self, inputs, **kwargs) -> str:
assert isinstance(inputs, str) or isinstance(inputs, list)
start = time.time()
inputs = [inputs] if isinstance(inputs, str) else inputs
messages = self.prepare_inputs(inputs)
service_url = "https://bailingchat.alipay.com/api/proxy/eval/antgmm/completions"
payload = {
"structInput": messages,
"sk": self.key,
"timeout": 180000
}
response = requests.post(service_url, headers=self.headers, json=payload)
if self.verbose:
            self.logger.info(f'Time for requesting: {time.time() - start:.2f}s')
try:
assert response.status_code == 200
output = json.loads(response.text)
answer = output['preds']['pred']
if self.verbose:
self.logger.info(f'inputs: {inputs}\nanswer: {answer}')
return 0, answer, 'Succeeded! '
except Exception as e:
if self.verbose:
self.logger.error(e)
self.logger.error(f'The input messages are {inputs}.')
return -1, self.fail_msg, ''
class bailingMMAPI(bailingMMWrapper):
def generate(self, message, dataset=None):
return super(bailingMMAPI, self).generate(message, dataset=dataset)
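
# Usage sketch (illustrative, not shipped with the file): a single-image query through
# bailingMMAPI. The model name and image path are placeholders; the key is read from
# the BAILINGMM_API_KEY environment variable by the constructor.
if __name__ == '__main__':
    api = bailingMMAPI(model='bailingmm-demo')
    message = [
        dict(type='image', value='/path/to/demo.jpg'),
        dict(type='text', value='What is shown in this image?'),
    ]
    print(api.generate(message))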
import time
import random as rd
from abc import abstractmethod
import os.path as osp
import copy as cp
from ..smp import get_logger, parse_file, concat_images_vlmeval, LMUDataRoot, md5, decode_base64_to_image_file
class BaseAPI:
allowed_types = ['text', 'image']
INTERLEAVE = True
INSTALL_REQ = False
def __init__(self,
retry=10,
wait=3,
system_prompt=None,
verbose=True,
fail_msg='Failed to obtain answer via API.',
**kwargs):
"""Base Class for all APIs.
Args:
retry (int, optional): The retry times for `generate_inner`. Defaults to 10.
wait (int, optional): The wait time after each failed retry of `generate_inner`. Defaults to 3.
system_prompt (str, optional): Defaults to None.
verbose (bool, optional): Defaults to True.
            fail_msg (str, optional): The message to return when no answer can be obtained.
                Defaults to 'Failed to obtain answer via API.'.
**kwargs: Other kwargs for `generate_inner`.
"""
self.wait = wait
self.retry = retry
self.system_prompt = system_prompt
self.verbose = verbose
self.fail_msg = fail_msg
self.logger = get_logger('ChatAPI')
if len(kwargs):
self.logger.info(f'BaseAPI received the following kwargs: {kwargs}')
self.logger.info('Will try to use them as kwargs for `generate`. ')
self.default_kwargs = kwargs
@abstractmethod
def generate_inner(self, inputs, **kwargs):
"""The inner function to generate the answer.
Returns:
tuple(int, str, str): ret_code, response, log
"""
        self.logger.warning('For BaseAPI, generate_inner is an abstract method. ')
assert 0, 'generate_inner not defined'
ret_code, answer, log = None, None, None
        # if ret_code is 0, the call succeeded
return ret_code, answer, log
def working(self):
"""If the API model is working, return True, else return False.
Returns:
bool: If the API model is working, return True, else return False.
"""
self.old_timeout = None
if hasattr(self, 'timeout'):
self.old_timeout = self.timeout
self.timeout = 120
retry = 5
while retry > 0:
ret = self.generate('hello')
if ret is not None and ret != '' and self.fail_msg not in ret:
if self.old_timeout is not None:
self.timeout = self.old_timeout
return True
retry -= 1
if self.old_timeout is not None:
self.timeout = self.old_timeout
return False
def check_content(self, msgs):
"""Check the content type of the input. Four types are allowed: str, dict, liststr, listdict.
Args:
msgs: Raw input messages.
Returns:
str: The message type.
"""
if isinstance(msgs, str):
return 'str'
if isinstance(msgs, dict):
return 'dict'
if isinstance(msgs, list):
types = [self.check_content(m) for m in msgs]
if all(t == 'str' for t in types):
return 'liststr'
if all(t == 'dict' for t in types):
return 'listdict'
return 'unknown'
def preproc_content(self, inputs):
"""Convert the raw input messages to a list of dicts.
Args:
inputs: raw input messages.
Returns:
list(dict): The preprocessed input messages. Will return None if failed to preprocess the input.
"""
if self.check_content(inputs) == 'str':
return [dict(type='text', value=inputs)]
elif self.check_content(inputs) == 'dict':
assert 'type' in inputs and 'value' in inputs
return [inputs]
elif self.check_content(inputs) == 'liststr':
res = []
for s in inputs:
mime, pth = parse_file(s)
if mime is None or mime == 'unknown':
res.append(dict(type='text', value=s))
else:
res.append(dict(type=mime.split('/')[0], value=pth))
return res
elif self.check_content(inputs) == 'listdict':
for item in inputs:
assert 'type' in item and 'value' in item
mime, s = parse_file(item['value'])
if mime is None:
assert item['type'] == 'text', item['value']
else:
assert mime.split('/')[0] == item['type']
item['value'] = s
return inputs
else:
return None
    # May exceed the context window size, so try with different turn numbers.
def chat_inner(self, inputs, **kwargs):
_ = kwargs.pop('dataset', None)
while len(inputs):
try:
return self.generate_inner(inputs, **kwargs)
except Exception as e:
if self.verbose:
self.logger.info(f'{type(e)}: {e}')
inputs = inputs[1:]
while len(inputs) and inputs[0]['role'] != 'user':
inputs = inputs[1:]
continue
return -1, self.fail_msg + ': ' + 'Failed with all possible conversation turns.', None
def chat(self, messages, **kwargs1):
"""The main function for multi-turn chatting. Will call `chat_inner` with the preprocessed input messages."""
        assert hasattr(self, 'chat_inner'), 'The API model should have the `chat_inner` method. '
for msg in messages:
assert isinstance(msg, dict) and 'role' in msg and 'content' in msg, msg
assert self.check_content(msg['content']) in ['str', 'dict', 'liststr', 'listdict'], msg
msg['content'] = self.preproc_content(msg['content'])
# merge kwargs
kwargs = cp.deepcopy(self.default_kwargs)
kwargs.update(kwargs1)
answer = None
# a very small random delay [0s - 0.5s]
T = rd.random() * 0.5
time.sleep(T)
assert messages[-1]['role'] == 'user'
for i in range(self.retry):
try:
ret_code, answer, log = self.chat_inner(messages, **kwargs)
if ret_code == 0 and self.fail_msg not in answer and answer != '':
if self.verbose:
print(answer)
return answer
elif self.verbose:
if not isinstance(log, str):
try:
log = log.text
except Exception as e:
self.logger.warning(f'Failed to parse {log} as an http response: {str(e)}. ')
self.logger.info(f'RetCode: {ret_code}\nAnswer: {answer}\nLog: {log}')
except Exception as err:
if self.verbose:
                    self.logger.error(f'An error occurred during try {i}: ')
self.logger.error(f'{type(err)}: {err}')
# delay before each retry
T = rd.random() * self.wait * 2
time.sleep(T)
return self.fail_msg if answer in ['', None] else answer
def preprocess_message_with_role(self, message):
system_prompt = ''
new_message = []
for data in message:
assert isinstance(data, dict)
role = data.pop('role', 'user')
if role == 'system':
system_prompt += data['value'] + '\n'
else:
new_message.append(data)
if system_prompt != '':
if self.system_prompt is None:
self.system_prompt = system_prompt
else:
self.system_prompt += '\n' + system_prompt
return new_message
def generate(self, message, **kwargs1):
"""The main function to generate the answer. Will call `generate_inner` with the preprocessed input messages.
Args:
message: raw input messages.
Returns:
            str: The generated answer, or the fail message if no answer could be obtained.
"""
if self.check_content(message) == 'listdict':
message = self.preprocess_message_with_role(message)
assert self.check_content(message) in ['str', 'dict', 'liststr', 'listdict'], f'Invalid input type: {message}'
message = self.preproc_content(message)
assert message is not None and self.check_content(message) == 'listdict'
for item in message:
assert item['type'] in self.allowed_types, f'Invalid input type: {item["type"]}'
# merge kwargs
kwargs = cp.deepcopy(self.default_kwargs)
kwargs.update(kwargs1)
answer = None
# a very small random delay [0s - 0.5s]
T = rd.random() * 0.5
time.sleep(T)
for i in range(self.retry):
try:
ret_code, answer, log = self.generate_inner(message, **kwargs)
if ret_code == 0 and self.fail_msg not in answer and answer != '':
if self.verbose:
print(answer)
return answer
elif self.verbose:
if not isinstance(log, str):
try:
log = log.text
except Exception as e:
self.logger.warning(f'Failed to parse {log} as an http response: {str(e)}. ')
self.logger.info(f'RetCode: {ret_code}\nAnswer: {answer}\nLog: {log}')
except Exception as err:
if self.verbose:
                    self.logger.error(f'An error occurred during try {i}: ')
self.logger.error(f'{type(err)}: {err}')
# delay before each retry
T = rd.random() * self.wait * 2
time.sleep(T)
return self.fail_msg if answer in ['', None] else answer
def message_to_promptimg(self, message, dataset=None):
assert not self.INTERLEAVE
model_name = self.__class__.__name__
import warnings
warnings.warn(
f'Model {model_name} does not support interleaved input. '
'Will use the first image and aggregated texts as prompt. ')
num_images = len([x for x in message if x['type'] == 'image'])
if num_images == 0:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = None
elif num_images == 1:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = [x['value'] for x in message if x['type'] == 'image'][0]
else:
prompt = '\n'.join([x['value'] if x['type'] == 'text' else '<image>' for x in message])
if dataset == 'BLINK':
image = concat_images_vlmeval(
[x['value'] for x in message if x['type'] == 'image'],
target_size=512)
else:
image = [x['value'] for x in message if x['type'] == 'image'][0]
return prompt, image
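
# Minimal sketch of how BaseAPI is meant to be subclassed (hypothetical EchoAPI, not
# part of VLMEvalKit): `generate_inner` returns the (ret_code, answer, log) triple,
# while `generate` adds input preprocessing, retries and random backoff around it.
if __name__ == '__main__':
    class EchoAPI(BaseAPI):
        def generate_inner(self, inputs, **kwargs):
            # Echo back the text parts of the preprocessed message list.
            texts = [x['value'] for x in inputs if x['type'] == 'text']
            return 0, ' '.join(texts), 'Succeeded! '

    api = EchoAPI(retry=2, wait=0, verbose=False)
    print(api.generate('hello world'))  # prints 'hello world'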
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
import os
import json
def multimodal(images, text, url, key, temperature=0, max_tokens=1024, history=[]):
if images:
pics = []
for image in images:
with open(image, 'rb') as f:
pic = base64.b64encode(f.read()).decode('utf-8')
pics.append(pic)
data = {'images': pics, 'text': text, 'key': key, 'temperature': temperature, 'max_new_tokens': max_tokens}
else:
data = {'text': text, 'key': key, 'temperature': temperature, 'max_new_tokens': max_tokens}
response = requests.post(url, json=data, headers={'Content-Type': 'application/json'})
response = json.loads(response.text)
return response
class BlueLMWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'BlueLM-V-v3.0',
retry: int = 5,
wait: int = 5,
verbose: bool = True,
temperature: float = 0.0,
system_prompt: str = None,
max_tokens: int = 1024,
key: str = None,
url: str = 'http://api-ai.vivo.com.cn/multimodal',
**kwargs):
self.model = model
        self.fail_msg = 'Failed to obtain answer via BlueLM-V API. '
self.max_tokens = max_tokens
self.temperature = temperature
self.url = url
self.key = key
if self.key is None:
self.key = os.environ.get('BLUELM_V_API_KEY', None)
        assert self.key is not None, (
            'Please set the API Key (to obtain one, contact shuai.ren@vivo.com by email)'
        )
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
def message_to_promptimg(self, message, dataset=None):
num_images = len([x for x in message if x['type'] == 'image'])
if num_images == 0:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = None
elif num_images == 1:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = [x['value'] for x in message if x['type'] == 'image']
else:
prompt = '\n'.join([x['value'] if x['type'] == 'text' else '<image>' for x in message])
if dataset == 'BLINK':
image = concat_images_vlmeval(
[x['value'] for x in message if x['type'] == 'image'],
target_size=512)
else:
image = [x['value'] for x in message if x['type'] == 'image']
if dataset in ['MMBench_DEV_EN_V11', 'MMBench_DEV_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11',
'AI2D_TEST', 'AI2D_TEST_TO_MASK', 'MMMU_DEV_VAL']:
prompt = prompt.replace('Please select the correct answer from the options above.',
'Answer with the option’s letter from the given choices directly.')
elif dataset in ['ChartQA_TEST']:
prompt = prompt.replace('Answer the question using a single word or phrase.',
'Answer the question using a single number or phrase.')
elif dataset in ['DocVQA_VAL', 'DocVQA_TEST', ]:
prompt = prompt.replace('Answer the question using a single word or phrase.',
'Give the short answer directly.')
elif dataset in ['TextVQA_VAL']:
prompt = prompt.replace('Answer the question using a single word or phrase.',
'When the provided information is insufficient, respond with ’Unanswerable’.'
'Answer the question using a single word or phrase.')
elif dataset in ['MTVQA_TEST']:
prompt = prompt.replace('\nAnswer the question using a word or phrase in the language of the question.', '')
elif dataset in ['MathVista_MINI']:
if 'Choices:' in prompt:
prompt = prompt.replace('Choices:', 'Options:').replace('Hint:', 'Context:')
for i in range(1, 7): # replace A ~ F
prompt = prompt.replace(f'({chr(64 + i)})', f'{chr(64 + i)}.')
prompt += '\nAnswer with the option’s letter from the given choices directly.'
else:
prompt += '\nAnswer the question using a single word or phrase.'
return prompt, image
def generate_inner(self, inputs, **kwargs) -> str:
assert isinstance(inputs, str) or isinstance(inputs, list)
pure_text = np.all([x['type'] == 'text' for x in inputs])
assert not pure_text
prompt, image_path = self.message_to_promptimg(inputs, kwargs['dataset'])
try:
response = multimodal(image_path, prompt, self.url, self.key, self.temperature, self.max_tokens)
answer = response['result']
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(f'{type(err)}: {err}')
self.logger.error(f'The input messages are {inputs}.')
return -1, '', ''
class BlueLM_V_API(BlueLMWrapper):
def generate(self, message, dataset=None):
return super(BlueLM_V_API, self).generate(message, dataset=dataset)
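
# Sketch (illustrative): message_to_promptimg rewrites the generic VLMEvalKit
# instruction into dataset-specific wording. A dummy key is passed only to satisfy
# the constructor's key check; no request is sent in this snippet.
if __name__ == '__main__':
    wrapper = BlueLMWrapper(key='dummy-key-for-local-demo')
    message = [
        dict(type='image', value='/path/to/demo.jpg'),
        dict(type='text', value='What is the title of the document?\n'
                                'Answer the question using a single word or phrase.'),
    ]
    prompt, image = wrapper.message_to_promptimg(message, dataset='DocVQA_VAL')
    print(prompt)  # the instruction is replaced by 'Give the short answer directly.'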
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from time import sleep
import base64
import mimetypes
from PIL import Image
alles_url = 'https://openxlab.org.cn/gw/alles-apin-hub/v1/claude/v1/text/chat'
alles_headers = {
'alles-apin-token': '',
'Content-Type': 'application/json'
}
official_url = 'https://api.anthropic.com/v1/messages'
official_headers = {
'x-api-key': '',
'anthropic-version': '2023-06-01',
'content-type': 'application/json'
}
class Claude_Wrapper(BaseAPI):
is_api: bool = True
def __init__(self,
backend: str = 'alles',
model: str = 'claude-3-opus-20240229',
key: str = None,
retry: int = 10,
wait: int = 3,
system_prompt: str = None,
verbose: bool = True,
temperature: float = 0,
max_tokens: int = 1024,
**kwargs):
if os.environ.get('ANTHROPIC_BACKEND', '') == 'official':
backend = 'official'
assert backend in ['alles', 'official'], f'Invalid backend: {backend}'
self.backend = backend
self.url = alles_url if backend == 'alles' else official_url
self.model = model
self.temperature = temperature
self.max_tokens = max_tokens
self.headers = alles_headers if backend == 'alles' else official_headers
if key is not None:
self.key = key
else:
self.key = os.environ.get('ALLES', '') if self.backend == 'alles' else os.environ.get('ANTHROPIC_API_KEY', '') # noqa: E501
if self.backend == 'alles':
self.headers['alles-apin-token'] = self.key
else:
self.headers['x-api-key'] = self.key
super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)
# inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
# content can be a string or a list of image & text
def prepare_itlist(self, inputs):
assert np.all([isinstance(x, dict) for x in inputs])
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text' and msg['value'] != '':
content_list.append(dict(type='text', text=msg['value']))
elif msg['type'] == 'image':
pth = msg['value']
suffix = osp.splitext(pth)[-1].lower()
media_type = mimetypes.types_map.get(suffix, None)
assert media_type is not None
content_list.append(dict(
type='image',
source={
'type': 'base64',
'media_type': media_type,
'data': encode_image_file_to_base64(pth, target_size=4096)
}))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
content_list = [dict(type='text', text=text)]
return content_list
def prepare_inputs(self, inputs):
input_msgs = []
assert isinstance(inputs, list) and isinstance(inputs[0], dict)
assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
if 'role' in inputs[0]:
assert inputs[-1]['role'] == 'user', inputs[-1]
for item in inputs:
input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
else:
input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
return input_msgs
def generate_inner(self, inputs, **kwargs) -> str:
payload = {
'model': self.model,
'max_tokens': self.max_tokens,
'messages': self.prepare_inputs(inputs),
**kwargs
}
if self.system_prompt is not None:
payload['system'] = self.system_prompt
response = requests.request('POST', self.url, headers=self.headers, data=json.dumps(payload))
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
try:
resp_struct = json.loads(response.text)
answer = resp_struct['data']['content'][0]['text'].strip()
except Exception as err:
if self.verbose:
self.logger.error(f'{type(err)}: {err}')
self.logger.error(response.text if hasattr(response, 'text') else response)
return ret_code, answer, response
class Claude3V(Claude_Wrapper):
def generate(self, message, dataset=None):
return super(Claude_Wrapper, self).generate(message)
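
# Usage sketch (illustrative): Claude3V picks its backend from ANTHROPIC_BACKEND
# ('official' talks to api.anthropic.com with ANTHROPIC_API_KEY; otherwise the
# 'alles' proxy with the ALLES token is used). The image path is a placeholder.
if __name__ == '__main__':
    model = Claude3V(backend='official', model='claude-3-opus-20240229')
    print(model.generate([
        dict(type='image', value='/path/to/demo.jpg'),
        dict(type='text', value='Describe the image in one sentence.'),
    ]))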
from ..smp import *
import os
from .base import BaseAPI
class CWWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'cw-congrong-v1.5',
retry: int = 10,
wait: int = 5,
key: str = None,
verbose: bool = True,
system_prompt: str = None,
temperature: float = 0,
timeout: int = 600,
api_base: str = 'http://cwapi-vlm01.cw_rb.azurebot.tk/v1/chat/completions',
max_tokens: int = 1024,
img_size: int = 512,
img_detail: str = 'low',
**kwargs):
self.model = model
self.cur_idx = 0
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
base = os.environ.get('CW_API_BASE', None)
self.api_base = base if base is not None else api_base
env_key = os.environ.get('CW_API_KEY', None)
self.key = env_key if env_key is not None else key
        assert self.key is not None, ('API key not provided. Please set the CW_API_KEY environment variable '
                                      'or pass it to the constructor.')
assert img_size > 0 or img_size == -1
        self.img_size = -1  # always send full-size image
assert img_detail in ['high', 'low']
self.img_detail = img_detail
self.vision = True
self.timeout = timeout
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
# inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
# content can be a string or a list of image & text
def prepare_inputs(self, inputs):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(role='system', content=self.system_prompt))
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text':
content_list.append(dict(type='text', text=msg['value']))
elif msg['type'] == 'image':
from PIL import Image
img = Image.open(msg['value'])
b64 = encode_image_to_base64(img, target_size=self.img_size)
img_struct = dict(url=f"data:image/jpeg;base64,{b64}", detail=self.img_detail)
content_list.append(dict(type='image_url', image_url=img_struct))
input_msgs.append(dict(role='user', content=content_list))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
input_msgs.append(dict(role='user', content=text))
return input_msgs
def generate_inner(self, inputs, **kwargs) -> str:
input_msgs = self.prepare_inputs(inputs)
temperature = kwargs.pop('temperature', self.temperature)
max_tokens = kwargs.pop('max_tokens', self.max_tokens)
if 0 < max_tokens <= 100:
self.logger.warning(
'Less than 100 tokens left, '
'may exceed the context window with some additional meta symbols. '
)
if max_tokens <= 0:
return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. '
headers = {'Content-Type': 'application/json', 'Authorization': f'{self.key}'}
payload = dict(
model=self.model,
messages=input_msgs,
max_tokens=max_tokens,
n=1,
temperature=temperature,
**kwargs)
response = requests.post(self.api_base, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
try:
resp_struct = json.loads(response.text)
answer = resp_struct['choices'][0]['message']['content'].strip()
except Exception as err:
if self.verbose:
self.logger.error(f'{type(err)}: {err}')
self.logger.error(response.text if hasattr(response, 'text') else response)
return ret_code, answer, response
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
headers = 'Content-Type: application/json'
class GeminiWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'gemini-1.0-pro',
retry: int = 5,
wait: int = 5,
key: str = None,
verbose: bool = True,
temperature: float = 0.0,
system_prompt: str = None,
max_tokens: int = 1024,
proxy: str = None,
backend='genai',
project_id='vlmeval',
**kwargs):
self.model = model
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
if key is None:
key = os.environ.get('GOOGLE_API_KEY', None)
# Try to load backend from environment variable
be = os.environ.get('GOOGLE_API_BACKEND', None)
if be is not None and be in ['genai', 'vertex']:
backend = be
assert backend in ['genai', 'vertex']
if backend == 'genai':
            # We have not evaluated Gemini-1.5 with the GenAI backend
assert key is not None # Vertex does not require API Key
self.backend = backend
self.project_id = project_id
self.api_key = key
if proxy is not None:
proxy_set(proxy)
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
def build_msgs_genai(self, inputs):
messages = [] if self.system_prompt is None else [self.system_prompt]
for inp in inputs:
if inp['type'] == 'text':
messages.append(inp['value'])
elif inp['type'] == 'image':
messages.append(Image.open(inp['value']))
return messages
def build_msgs_vertex(self, inputs):
from vertexai.generative_models import Part, Image
messages = [] if self.system_prompt is None else [self.system_prompt]
for inp in inputs:
if inp['type'] == 'text':
messages.append(inp['value'])
elif inp['type'] == 'image':
messages.append(Part.from_image(Image.load_from_file(inp['value'])))
return messages
def generate_inner(self, inputs, **kwargs) -> str:
if self.backend == 'genai':
import google.generativeai as genai
assert isinstance(inputs, list)
pure_text = np.all([x['type'] == 'text' for x in inputs])
genai.configure(api_key=self.api_key)
if pure_text and self.model == 'gemini-1.0-pro':
model = genai.GenerativeModel('gemini-1.0-pro')
else:
model = genai.GenerativeModel(self.model)
messages = self.build_msgs_genai(inputs)
gen_config = dict(max_output_tokens=self.max_tokens, temperature=self.temperature)
gen_config.update(kwargs)
try:
answer = model.generate_content(
messages,
generation_config=genai.types.GenerationConfig(**gen_config)).text
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(f'{type(err)}: {err}')
self.logger.error(f'The input messages are {inputs}.')
return -1, '', ''
elif self.backend == 'vertex':
import vertexai
from vertexai.generative_models import GenerativeModel
vertexai.init(project=self.project_id, location='us-central1')
model_name = 'gemini-1.0-pro-vision' if self.model == 'gemini-1.0-pro' else self.model
model = GenerativeModel(model_name=model_name)
messages = self.build_msgs_vertex(inputs)
try:
resp = model.generate_content(messages)
answer = resp.text
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(f'{type(err)}: {err}')
self.logger.error(f'The input messages are {inputs}.')
return -1, '', ''
class GeminiProVision(GeminiWrapper):
def generate(self, message, dataset=None):
return super(GeminiProVision, self).generate(message)
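
# Usage sketch (illustrative): GeminiProVision defaults to the 'genai' backend, which
# requires GOOGLE_API_KEY; setting GOOGLE_API_BACKEND=vertex switches to Vertex AI,
# which uses a project id instead of a key. The image path is a placeholder.
if __name__ == '__main__':
    model = GeminiProVision(model='gemini-1.0-pro')
    print(model.generate([
        dict(type='image', value='/path/to/demo.jpg'),
        dict(type='text', value='What objects are visible in this image?'),
    ]))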
import requests
requests.packages.urllib3.disable_warnings()
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from vlmeval.dataset import DATASET_TYPE
from vlmeval.smp.vlm import encode_image_file_to_base64
class GLMVisionWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str,
retry: int = 5,
wait: int = 5,
key: str = None,
verbose: bool = True,
system_prompt: str = None,
max_tokens: int = 4096,
proxy: str = None,
**kwargs):
self.model = model
self.fail_msg = 'Failed to obtain answer via API. '
self.default_params = {
'top_k': 1,
'best_of': 1,
'do_sample': False,
'stream': False,
'max_tokens': max_tokens,
"skip_moderation": True
}
if key is None:
key = os.environ.get('GLMV_API_KEY', None)
assert key is not None, (
'Please set the API Key (obtain it here: '
'https://open.bigmodel.cn/dev/howuse/introduction)'
)
self.key = key
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
def build_msgs(self, msgs_raw, system_prompt=None, dataset=None):
msgs = cp.deepcopy(msgs_raw)
content = []
for i, msg in enumerate(msgs):
if msg['type'] == 'text':
content.append(dict(type='text', text=msg['value']))
elif msg['type'] == 'image':
content.append(dict(type='image_url', image_url=dict(url=encode_image_file_to_base64(msg['value']))))
if dataset in {'HallusionBench', 'POPE'}:
content.append(dict(type="text", text="Please answer yes or no."))
ret = [dict(role='user', content=content)]
return ret
def generate_inner(self, inputs, **kwargs) -> str:
assert isinstance(inputs, str) or isinstance(inputs, list)
inputs = [inputs] if isinstance(inputs, str) else inputs
messages = self.build_msgs(msgs_raw=inputs, dataset=kwargs.get('dataset', None))
url = 'https://api.chatglm.cn/v1/chat/completions'
headers = {
'Content-Type': 'application/json',
'Request-Id': 'remote-test',
'Authorization': f'Bearer {self.key}'
}
payload = {
'model': self.model,
'messages': messages,
**self.default_params
}
response = requests.post(url, headers=headers, data=json.dumps(payload), verify=False)
output = []
try:
assert response.status_code == 200
for line in response.iter_lines():
data = json.loads(line.decode('utf-8').lstrip('data: '))
output.append(data['choices'][0]['message']['content'])
answer = ''.join(output).replace('</s>', '')
if self.verbose:
self.logger.info(f'inputs: {inputs}\nanswer: {answer}')
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(f'{type(err)}: {err}')
self.logger.error(f'The input messages are {inputs}.')
return -1, self.fail_msg, ''
class GLMVisionAPI(GLMVisionWrapper):
def generate(self, message, dataset=None):
return super(GLMVisionAPI, self).generate(message, dataset=dataset)
from ..smp import *
import os
import sys
from .base import BaseAPI
APIBASES = {
'OFFICIAL': 'https://api.openai.com/v1/chat/completions',
}
def GPT_context_window(model):
length_map = {
'gpt-4': 8192,
'gpt-4-0613': 8192,
'gpt-4-turbo-preview': 128000,
'gpt-4-1106-preview': 128000,
'gpt-4-0125-preview': 128000,
'gpt-4-vision-preview': 128000,
'gpt-4-turbo': 128000,
'gpt-4-turbo-2024-04-09': 128000,
'gpt-3.5-turbo': 16385,
'gpt-3.5-turbo-0125': 16385,
'gpt-3.5-turbo-1106': 16385,
'gpt-3.5-turbo-instruct': 4096,
}
if model in length_map:
return length_map[model]
else:
return 128000
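# e.g. GPT_context_window('gpt-4') -> 8192, GPT_context_window('gpt-4-turbo') -> 128000;
# model names not in the table fall back to 128000.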
class OpenAIWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'gpt-3.5-turbo-0613',
retry: int = 5,
wait: int = 5,
key: str = None,
verbose: bool = False,
system_prompt: str = None,
temperature: float = 0,
timeout: int = 60,
api_base: str = None,
max_tokens: int = 1024,
img_size: int = 512,
img_detail: str = 'low',
use_azure: bool = False,
**kwargs):
self.model = model
self.cur_idx = 0
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
self.use_azure = use_azure
if 'step' in model:
env_key = os.environ.get('STEPAI_API_KEY', '')
if key is None:
key = env_key
elif 'yi-vision' in model:
env_key = os.environ.get('YI_API_KEY', '')
if key is None:
key = env_key
elif 'internvl2-pro' in model:
env_key = os.environ.get('InternVL2_PRO_KEY', '')
if key is None:
key = env_key
elif 'abab' in model:
env_key = os.environ.get('MiniMax_API_KEY', '')
if key is None:
key = env_key
else:
if use_azure:
env_key = os.environ.get('AZURE_OPENAI_API_KEY', None)
assert env_key is not None, 'Please set the environment variable AZURE_OPENAI_API_KEY. '
if key is None:
key = env_key
assert isinstance(key, str), (
'Please set the environment variable AZURE_OPENAI_API_KEY to your openai key. '
)
else:
env_key = os.environ.get('OPENAI_API_KEY', '')
if key is None:
key = env_key
assert isinstance(key, str) and key.startswith('sk-'), (
f'Illegal openai_key {key}. '
'Please set the environment variable OPENAI_API_KEY to your openai key. '
)
self.key = key
assert img_size > 0 or img_size == -1
self.img_size = img_size
assert img_detail in ['high', 'low']
self.img_detail = img_detail
self.timeout = timeout
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
if use_azure:
api_base_template = (
'{endpoint}openai/deployments/{deployment_name}/chat/completions?api-version={api_version}'
)
endpoint = os.getenv('AZURE_OPENAI_ENDPOINT', None)
assert endpoint is not None, 'Please set the environment variable AZURE_OPENAI_ENDPOINT. '
deployment_name = os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME', None)
assert deployment_name is not None, 'Please set the environment variable AZURE_OPENAI_DEPLOYMENT_NAME. '
api_version = os.getenv('OPENAI_API_VERSION', None)
assert api_version is not None, 'Please set the environment variable OPENAI_API_VERSION. '
self.api_base = api_base_template.format(
endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
deployment_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME'),
api_version=os.getenv('OPENAI_API_VERSION')
)
else:
if api_base is None:
if 'OPENAI_API_BASE' in os.environ and os.environ['OPENAI_API_BASE'] != '':
self.logger.info('Environment variable OPENAI_API_BASE is set. Will use it as api_base. ')
api_base = os.environ['OPENAI_API_BASE']
else:
api_base = 'OFFICIAL'
assert api_base is not None
if api_base in APIBASES:
self.api_base = APIBASES[api_base]
elif api_base.startswith('http'):
self.api_base = api_base
else:
self.logger.error('Unknown API Base. ')
raise NotImplementedError
self.logger.info(f'Using API Base: {self.api_base}; API Key: {self.key}')
# inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
# content can be a string or a list of image & text
def prepare_itlist(self, inputs):
assert np.all([isinstance(x, dict) for x in inputs])
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text':
content_list.append(dict(type='text', text=msg['value']))
elif msg['type'] == 'image':
from PIL import Image
img = Image.open(msg['value'])
b64 = encode_image_to_base64(img, target_size=self.img_size)
img_struct = dict(url=f'data:image/jpeg;base64,{b64}', detail=self.img_detail)
content_list.append(dict(type='image_url', image_url=img_struct))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
content_list = [dict(type='text', text=text)]
return content_list
def prepare_inputs(self, inputs):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(role='system', content=self.system_prompt))
assert isinstance(inputs, list) and isinstance(inputs[0], dict)
assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
if 'role' in inputs[0]:
assert inputs[-1]['role'] == 'user', inputs[-1]
for item in inputs:
input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
else:
input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
return input_msgs
def generate_inner(self, inputs, **kwargs) -> str:
input_msgs = self.prepare_inputs(inputs)
temperature = kwargs.pop('temperature', self.temperature)
max_tokens = kwargs.pop('max_tokens', self.max_tokens)
context_window = GPT_context_window(self.model)
new_max_tokens = min(max_tokens, context_window - self.get_token_len(inputs))
if 0 < new_max_tokens <= 100 and new_max_tokens < max_tokens:
self.logger.warning(
'Less than 100 tokens left, '
'may exceed the context window with some additional meta symbols. '
)
if new_max_tokens <= 0:
return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. '
max_tokens = new_max_tokens
        # Send a raw HTTP request when using Azure (the openai client is not used here).
if self.use_azure:
headers = {'Content-Type': 'application/json', 'api-key': self.key}
elif 'internvl2-pro' in self.model:
headers = {'Content-Type': 'application/json', 'Authorization': self.key}
else:
headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'}
payload = dict(
model=self.model,
messages=input_msgs,
max_tokens=max_tokens,
n=1,
temperature=temperature,
**kwargs)
response = requests.post(
self.api_base,
headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
try:
resp_struct = json.loads(response.text)
answer = resp_struct['choices'][0]['message']['content'].strip()
except Exception as err:
if self.verbose:
self.logger.error(f'{type(err)}: {err}')
self.logger.error(response.text if hasattr(response, 'text') else response)
return ret_code, answer, response
def get_image_token_len(self, img_path, detail='low'):
import math
if detail == 'low':
return 85
im = Image.open(img_path)
        width, height = im.size  # PIL's Image.size is (width, height)
if width > 1024 or height > 1024:
if width > height:
height = int(height * 1024 / width)
width = 1024
else:
width = int(width * 1024 / height)
height = 1024
h = math.ceil(height / 512)
w = math.ceil(width / 512)
total = 85 + 170 * h * w
return total
def get_token_len(self, inputs) -> int:
import tiktoken
try:
enc = tiktoken.encoding_for_model(self.model)
except Exception as err:
if 'gpt' in self.model.lower():
if self.verbose:
self.logger.warning(f'{type(err)}: {err}')
enc = tiktoken.encoding_for_model('gpt-4')
else:
return 0
assert isinstance(inputs, list)
tot = 0
for item in inputs:
if 'role' in item:
tot += self.get_token_len(item['content'])
elif item['type'] == 'text':
tot += len(enc.encode(item['value']))
elif item['type'] == 'image':
tot += self.get_image_token_len(item['value'], detail=self.img_detail)
return tot
class GPT4V(OpenAIWrapper):
def generate(self, message, dataset=None):
return super(GPT4V, self).generate(message)
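
# Usage sketch (illustrative): GPT4V / OpenAIWrapper reads OPENAI_API_KEY (or the
# AZURE_OPENAI_* variables when use_azure=True) and trims max_tokens against the
# model's context window before sending the request. Paths and prompts are placeholders.
if __name__ == '__main__':
    model = GPT4V(model='gpt-4-vision-preview', img_detail='low', verbose=True)
    print(model.generate([
        dict(type='image', value='/path/to/demo.jpg'),
        dict(type='text', value='How many people are in the picture?'),
    ]))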
import os
import sys
import os.path as osp
import torch
from ..smp import *
def get_gpu_num(model_name):
model_name = model_name.lower()
kws = {
8: ['65b', '70b'],
4: ['30b', '33b', '35b', '40b'],
2: ['13b', '14b', '20b'],
1: ['6b', '7b', 'moss'],
}
for k in [8, 4, 2, 1]:
for keyword in kws[k]:
if keyword in model_name:
return k
return 8
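# e.g. get_gpu_num('internlm/internlm-chat-20b') -> 2, get_gpu_num('lmsys/vicuna-7b-v1.5') -> 1;
# names without a recognized size keyword default to 8 GPUs.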
validated_llms = [
'internlm/internlm-chat-7b', 'internlm/internlm-chat-7b-8k', 'internlm/internlm-chat-20b',
'Qwen/Qwen-7B-Chat', 'Qwen/Qwen-14B-Chat',
'THUDM/chatglm2-6b', 'THUDM/chatglm2-6b-32k', 'THUDM/chatglm3-6b', 'THUDM/chatglm3-6b-32k',
'baichuan-inc/Baichuan2-7B-Chat', 'baichuan-inc/Baichuan2-13B-Chat',
'lmsys/vicuna-7b-v1.5', 'lmsys/vicuna-13b-v1.5',
'meta-llama/Llama-2-7b-chat-hf'
]
Auto_model = ['chatglm']
class HFChatModel:
def _get_context_length(self, model, model_path):
# By default, we use model.config.seq_length
model_path = model_path.lower()
if 'baichuan' in model_path:
context_window = model.config.model_max_length
elif 'internlm' in model_path or 'llama' in model_path:
context_window = model.config.max_position_embeddings
elif 'vicuna' in model_path:
context_window = model.generation_config.max_length
else:
# chatglm & qwen
context_window = model.config.seq_length
return context_window
def _get_context_length_robust(self, model, model_path):
try:
context_window = self._get_context_length(model, model_path)
return context_window
except Exception as err:
self.logger.critical(f'{type(err)}: {err}')
self.logger.critical(
'Failed to extract context_window information from config / generation_config. '
                'Please read the above code and check if the logic works for your model path. '
)
raise NotImplementedError
def __init__(self,
model_path,
system_prompt: str = None,
**kwargs):
self.logger = get_logger('HFChatModel')
if 'vicuna' in model_path.lower():
try:
from fastchat.model import get_conversation_template
except Exception as err:
self.logger.critical('Please install fastchat first to use vicuna. ')
raise err
self.explicit_device = kwargs.pop('device', None)
if self.explicit_device is None:
# If CUDA_VISIBLE_DEVICES is not properly set
if 'CUDA_VISIBLE_DEVICES' not in os.environ or os.environ['CUDA_VISIBLE_DEVICES'] == '0,1,2,3,4,5,6,7':
num_gpu = get_gpu_num(model_path)
gpu_offset = kwargs.pop('gpu_offset', 0)
cuda_visible_devices = ','.join([str(i) for i in range(gpu_offset, gpu_offset + num_gpu)])
os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
from transformers.generation import GenerationConfig
if model_path not in validated_llms:
self.logger.warning(f'{model_path} not in validated LLMs, may have inference troubles. ')
self.model_path = model_path
if listinstr(Auto_model, model_path):
LoadModel = AutoModel
else:
LoadModel = AutoModelForCausalLM
assert osp.exists(model_path) or len(model_path.split('/')) == 2
device = self.explicit_device if self.explicit_device else 'auto'
precision = {}
if 'internlm-chat-7b' in model_path:
precision = {'torch_dtype': torch.float16}
elif 'internlm-chat-20b' in model_path:
precision = {'torch_dtype': torch.bfloat16}
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = LoadModel.from_pretrained(model_path, trust_remote_code=True, device_map='cpu', **precision)
model = model.eval()
if device != 'cpu':
model = model.to(f'cuda:{device}' if isinstance(device, int) else 'cuda')
try:
model.generation_config = GenerationConfig.from_pretrained(
model_path, trust_remote_code=True, device_map=device)
except Exception as err:
self.logger.warning(f'{type(err)}: {err}')
torch.cuda.empty_cache()
self.model = model
self.context_length = self._get_context_length_robust(model=model, model_path=model_path)
self.answer_buffer = 192
self.system_prompt = system_prompt
for k, v in kwargs.items():
self.logger.info(f'Following args will be used for generation (If not set specifically), {k}: {v}. ')
self.kwargs = kwargs
def generate_str(self, input, **kwargs):
if 'baichuan' in self.model_path.lower():
messages = []
messages.append({'role': 'user', 'content': input})
resp = self.model.chat(self.tokenizer, messages, **kwargs)
elif 'vicuna' in self.model_path.lower():
from fastchat.model import get_conversation_template
conv = get_conversation_template('vicuna')
conv.append_message(conv.roles[0], input)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
inputs = self.tokenizer([prompt], return_tensors='pt')
if torch.cuda.is_available():
for k in inputs:
inputs[k] = inputs[k].cuda()
params = dict(do_sample=True, temperature=0.7, repetition_penalty=1.0, max_new_tokens=512)
params.update(self.kwargs)
params.update(kwargs)
outputs = self.model.generate(**inputs, **params)
resp = self.tokenizer.decode(
outputs[0][len(inputs['input_ids'][0]):],
skip_special_tokens=True,
spaces_between_special_tokens=False)
else:
params = self.kwargs
params.update(kwargs)
resp, _ = self.model.chat(self.tokenizer, input, history=[], **params)
return resp
def length_ok(self, inputs):
tot = len(self.tokenizer.encode(self.system_prompt)) if self.system_prompt is not None else 0
for s in inputs:
tot += len(self.tokenizer.encode(s))
return tot + self.answer_buffer < self.context_length
def generate_list(self, full_inputs, offset=0, **kwargs):
assert isinstance(full_inputs, list)
inputs = full_inputs[offset:]
if not self.length_ok(inputs):
            # Context too long: drop the earliest turn and retry.
            return self.generate_list(full_inputs, offset + 1, **kwargs)
model_path = self.model_path.lower()
if sum([x in model_path for x in ['baichuan']]):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(role='user', content=self.system_prompt))
if len(inputs):
assert isinstance(inputs, list) and isinstance(inputs[0], str)
roles = ['user', 'assistant'] if len(inputs) % 2 == 1 else ['assistant', 'user']
roles = roles * len(inputs)
for role, msg in zip(roles, inputs):
input_msgs.append(dict(role=role, content=msg))
response = self.model.chat(self.tokenizer, input_msgs)
elif sum([x in model_path for x in ['vicuna']]):
from fastchat.model import get_conversation_template
conv = get_conversation_template('vicuna')
assert isinstance(inputs, list) and isinstance(inputs[0], str)
if len(inputs) % 2 == 1:
if self.system_prompt is not None:
conv.append_message(conv.roles[0], self.system_prompt)
for i in range(len(inputs) // 2):
conv.append_message(conv.roles[0], inputs[2 * i])
conv.append_message(conv.roles[1], inputs[2 * i + 1])
else:
assert self.system_prompt is not None
conv.append_message(conv.roles[0], self.system_prompt)
conv.append_message(conv.roles[1], inputs[0])
for i in range(len(inputs) // 2 - 1):
conv.append_message(conv.roles[0], inputs[2 * i + 1])
conv.append_message(conv.roles[1], inputs[2 * i + 2])
conv.append_message(conv.roles[0], inputs[-1])
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
inputs = self.tokenizer([prompt], return_tensors='pt')
if torch.cuda.is_available():
for k in inputs:
inputs[k] = inputs[k].cuda()
params = dict(do_sample=True, temperature=0.7, repetition_penalty=1.0, max_new_tokens=512)
params.update(self.kwargs)
params.update(kwargs)
outputs = self.model.generate(**inputs, **params)
response = self.tokenizer.decode(
outputs[0][len(inputs['input_ids'][0]):],
skip_special_tokens=True,
spaces_between_special_tokens=False)
response = response.lstrip('\n')
else:
# The default option, support internlm, chatglm, qwen
history, msg = [], None
if len(inputs) % 2 == 1:
if self.system_prompt is not None:
history = [(self.system_prompt, '')]
for i in range(len(inputs) // 2):
history.append((inputs[2 * i], inputs[2 * i + 1]))
else:
assert self.system_prompt is not None
history = [(self.system_prompt, inputs[0])]
for i in range(len(inputs) // 2 - 1):
history.append((inputs[2 * i + 1], inputs[2 * i + 2]))
msg = inputs[-1]
params = self.kwargs
params.update(kwargs)
response, _ = self.model.chat(self.tokenizer, msg, history=history, **params)
return response, offset
def generate(self, inputs, **kwargs):
if isinstance(inputs, str):
return self.generate_str(inputs, **kwargs)
elif isinstance(inputs, list):
return self.generate_list(inputs, **kwargs)
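
# Usage sketch (illustrative): HFChatModel wraps a local or HuggingFace-hub causal LM
# for text-only chat. Loading downloads the checkpoint and the larger models need one
# or more GPUs; the model name below is one of the validated LLMs listed above.
if __name__ == '__main__':
    llm = HFChatModel('Qwen/Qwen-7B-Chat', system_prompt='You are a helpful assistant.')
    print(llm.generate('Give a one-sentence definition of a large language model.'))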
from vlmeval.smp import *
import os
import sys
from vlmeval.api.base import BaseAPI
class HunyuanWrapper(BaseAPI):
is_api: bool = True
_apiVersion = '2023-09-01'
_service = 'hunyuan'
def __init__(self,
model: str = 'hunyuan-vision',
retry: int = 5,
wait: int = 5,
secret_key: str = None,
secret_id: str = None,
verbose: bool = True,
system_prompt: str = None,
temperature: float = 0,
timeout: int = 60,
api_base: str = 'hunyuan.tencentcloudapi.com',
**kwargs):
self.model = model
self.cur_idx = 0
self.fail_msg = 'Failed to obtain answer via API. '
self.temperature = temperature
warnings.warn('You may need to set the env variable HUNYUAN_SECRET_ID & HUNYUAN_SECRET_KEY to use Hunyuan. ')
secret_key = os.environ.get('HUNYUAN_SECRET_KEY', secret_key)
assert secret_key is not None, 'Please set the environment variable HUNYUAN_SECRET_KEY. '
secret_id = os.environ.get('HUNYUAN_SECRET_ID', secret_id)
assert secret_id is not None, 'Please set the environment variable HUNYUAN_SECRET_ID. '
self.model = model
self.endpoint = api_base
self.secret_id = secret_id
self.secret_key = secret_key
self.timeout = timeout
try:
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.hunyuan.v20230901 import hunyuan_client
except ImportError as err:
self.logger.critical('Please install tencentcloud-sdk-python to use Hunyuan API. ')
raise err
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
cred = credential.Credential(self.secret_id, self.secret_key)
httpProfile = HttpProfile()
httpProfile.endpoint = self.endpoint
clientProfile = ClientProfile()
clientProfile.httpProfile = httpProfile
self.client = hunyuan_client.HunyuanClient(cred, 'ap-beijing', clientProfile)
self.logger.info(
f'Using Endpoint: {self.endpoint}; API Secret ID: {self.secret_id}; API Secret Key: {self.secret_key}'
)
# inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
# content can be a string or a list of image & text
def prepare_itlist(self, inputs):
assert np.all([isinstance(x, dict) for x in inputs])
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text':
content_list.append(dict(Type='text', Text=msg['value']))
elif msg['type'] == 'image':
from PIL import Image
img = Image.open(msg['value'])
b64 = encode_image_to_base64(img)
img_struct = dict(Url=f'data:image/jpeg;base64,{b64}')
content_list.append(dict(Type='image_url', ImageUrl=img_struct))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
content_list = [dict(Type='text', Text=text)]
return content_list
def prepare_inputs(self, inputs):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(Role='system', Content=self.system_prompt))
assert isinstance(inputs, list) and isinstance(inputs[0], dict)
assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
if 'role' in inputs[0]:
assert inputs[-1]['role'] == 'user', inputs[-1]
for item in inputs:
input_msgs.append(dict(Role=item['role'], Contents=self.prepare_itlist(item['content'])))
else:
input_msgs.append(dict(Role='user', Contents=self.prepare_itlist(inputs)))
return input_msgs
def generate_inner(self, inputs, **kwargs) -> str:
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.hunyuan.v20230901 import models
input_msgs = self.prepare_inputs(inputs)
temperature = kwargs.pop('temperature', self.temperature)
payload = dict(
Model=self.model,
Messages=input_msgs,
Temperature=temperature,
**kwargs)
retry_counter = 0
while retry_counter < 3:
try:
req = models.ChatCompletionsRequest()
req.from_json_string(json.dumps(payload))
resp = self.client.ChatCompletions(req)
resp = json.loads(resp.to_json_string())
answer = resp['Choices'][0]['Message']['Content']
return 0, answer, resp
except TencentCloudSDKException as e:
self.logger.error(f'Got error code: {e.get_code()}')
if e.get_code() == 'ClientNetworkError':
return -1, self.fail_msg + e.get_code(), None
elif e.get_code() in ['InternalError', 'ServerNetworkError']:
if retry_counter == 3:
return -1, self.fail_msg + e.get_code(), None
retry_counter += 1
continue
elif e.get_code() in ['LimitExceeded']:
time.sleep(5)
if retry_counter == 3:
return -1, self.fail_msg + e.get_code(), None
retry_counter += 1
continue
else:
return -1, self.fail_msg + str(e), None
return -1, self.fail_msg, None
class HunyuanVision(HunyuanWrapper):
def generate(self, message, dataset=None):
return super(HunyuanVision, self).generate(message)
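
# Usage sketch (illustrative): HunyuanVision calls the Tencent Cloud endpoint through
# tencentcloud-sdk-python and requires HUNYUAN_SECRET_ID / HUNYUAN_SECRET_KEY to be set.
if __name__ == '__main__':
    model = HunyuanVision(model='hunyuan-vision')
    print(model.generate([
        dict(type='image', value='/path/to/demo.jpg'),
        dict(type='text', value='Describe the image.'),
    ]))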
import pandas as pd
import requests
import json
import os
import base64
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from vlmeval.dataset import DATASET_TYPE
from vlmeval.dataset import img_root_map
API_ENDPOINT = 'https://jiutian.10086.cn/kunlun/ingress/api/h3t-eeceff/92390745235a40a484d850be19e1f8b4/ai-5d7ae47ec93f4280953273c4001aafee/service-7544ea5ee3e841ad9d01e7af44acef7c/v1/chat/completions' # noqa: E501
APP_CODE = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiI5ZGQwNmQ2ZjU4YTU0ZGY0OGEzNjRhMjQyNGMwODEyNSIsImlzcyI6ImFwaS1hdXRoLWtleSIsImV4cCI6NDg4MjkwNDA3OX0.k5t_T-955xWMndzBbx4WQQNAgm5DpMos9mHm7vkFipQ3yebCFMfyufpSxORSfEVpBaDS3Nly0dd8ygQYGnDgIQcC72vQ1xtkjCP49LNcqlceoET4rGc1zwRi76XLPSGFES4GcwvEmr7Ilth7XtqZNxcDF_Z7HyHyf1-zF0JIQETYSoxenqLU-gNteNfqRUnlyCgaKh03DscAbYvtoMUxEaFa2ZqyRSwekdHI_SPKCq9aC9G19yDPHTjeiwl1ubtyC5uMy5pERn_ClRsZS3Wyb-GmD5QQsFofrWvCiU_fVJuUiez39pYZvEP8awH0R9B7SkpQ4XOzj3fdytTPYy3g6g' # noqa: E501
class JTVLChatWrapper(BaseAPI):
is_api: bool = True
INTERLEAVE = False
def __init__(self,
model: str = 'jt-vl-chat',
retry: int = 5,
wait: int = 5,
api_base: str = API_ENDPOINT,
key: str = APP_CODE,
verbose: bool = True,
system_prompt: str = None,
temperature: float = 0.7,
max_tokens: int = 256,
proxy: str = None,
**kwargs):
self.model = model
self.temperature = temperature
self.max_tokens = max_tokens
self.api_base = api_base
if key is None:
key = os.environ.get('JTVLChat_API_KEY', None)
assert key is not None, (
'Please set the API Key (also called app_code, obtain it here: https://github.com/jiutiancv/JT-VL-Chat)'
)
self.key = key
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
def dump_image(self, line, dataset):
"""Dump the image(s) of the input line to the corresponding dataset folder.
Args:
line (line of pd.DataFrame): The raw input line.
dataset (str): The name of the dataset.
Returns:
str | list[str]: The paths of the dumped images.
"""
ROOT = LMUDataRoot()
assert isinstance(dataset, str)
img_root = os.path.join(ROOT, 'images', img_root_map(dataset) if dataset in img_root_map(dataset) else dataset)
os.makedirs(img_root, exist_ok=True)
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
def use_custom_prompt(self, dataset):
assert dataset is not None
        if listinstr(['MMMU_DEV_VAL', 'MMMU_TEST'], dataset):
return False
else:
return True
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += '\n请直接回答选项字母。' if cn_string(
prompt) else "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
return prompt
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if dataset is not None and listinstr(['MME'], dataset):
question = line['question']
prompt = question + ' Answer the question using a single word or phrase.'
elif dataset is not None and listinstr(['HallusionBench'], dataset):
question = line['question']
prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
if listinstr(['MathVista', 'MathVision'], dataset):
prompt = line['question']
elif listinstr(['LLaVABench'], dataset):
question = line['question']
prompt = question + '\nAnswer this question in detail.'
elif listinstr(['MMVet'], dataset):
prompt = line['question']
else:
question = line['question']
prompt = question + '\nAnswer the question using a single word or phrase.'
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def message_to_promptimg(self, message, dataset=None):
assert not self.INTERLEAVE
model_name = self.__class__.__name__
import warnings
warnings.warn(
f'Model {model_name} does not support interleaved input. '
'Will use the first image and aggregated texts as prompt. ')
num_images = len([x for x in message if x['type'] == 'image'])
if num_images == 0:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = None
else:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
if dataset == 'BLINK':
image = concat_images_vlmeval(
[x['value'] for x in message if x['type'] == 'image'],
target_size=512)
else:
image = [x['value'] for x in message if x['type'] == 'image'][0]
return prompt, image
    def get_send_data(self, prompt, image_path, temperature, max_tokens):
image = ''
with open(image_path, 'rb') as f:
image = str(base64.b64encode(f.read()), 'utf-8')
send_data = {
"messages": [
{
"role": "user",
"content": prompt
}
],
"image_base64": image,
"max_tokens": max_tokens,
"temperature": temperature
}
return send_data
    def get_send_data_no_image(self, prompt, temperature, max_tokens):
send_data = {
"messages": [
{
"role": "user",
"content": prompt
}
],
"max_tokens": max_tokens,
"temperature": temperature
}
return send_data
def generate_inner(self, inputs, **kwargs) -> str:
assert isinstance(inputs, str) or isinstance(inputs, list)
inputs = [inputs] if isinstance(inputs, str) else inputs
dataset = kwargs.get('dataset', None)
prompt, image_path = self.message_to_promptimg(message=inputs, dataset=dataset)
# print("prompt:",prompt)
if image_path:
send_data = self.get_send_data(
prompt=prompt,
image_path=image_path,
temperature=self.temperature,
max_tokens=self.max_tokens)
else:
send_data = self.get_send_data_no_image(
prompt=prompt,
temperature=self.temperature,
max_tokens=self.max_tokens)
json_data = json.dumps(send_data)
header_dict = {'Content-Type': 'application/json', 'Authorization': 'Bearer ' + self.key}
r = requests.post(self.api_base, headers=header_dict, data=json_data, timeout=3000)
try:
assert r.status_code == 200
r_json = r.json()
output = r_json['choices'][0]['message']['content']
if self.verbose:
self.logger.info(f'inputs: {inputs}\nanswer: {output}')
return 0, output, 'Succeeded! '
except Exception:
error_msg = f'Error! code {r.status_code} content: {r.content}'
error_con = r.content.decode('utf-8')
if self.verbose:
self.logger.error(error_msg)
self.logger.error(error_con)
self.logger.error(f'The input messages are {inputs}.')
return -1, error_msg, ''
class JTVLChatAPI(JTVLChatWrapper):
def generate(self, message, dataset=None):
return super(JTVLChatAPI, self).generate(message, dataset=dataset)
# from http import HTTPStatus
import os
import requests
from ..dataset import DATASET_TYPE, DATASET_MODALITY
from vlmeval.api.base import BaseAPI
from vlmeval.smp import *
class InternVL2_PromptUtil:
def __init__(self, use_mpo_prompt=False):
self.use_mpo_prompt = use_mpo_prompt
def dump_image(self, line, dataset):
return self.dump_image_func(line)
def use_custom_prompt(self, dataset):
assert dataset is not None
assert DATASET_MODALITY(dataset) != 'VIDEO', 'not supported'
if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
# For multi-turn datasets we don't have a custom prompt
return False
if DATASET_MODALITY(dataset) == 'VIDEO':
# For video benchmarks we don't have a custom prompt here
return False
else:
return True
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
from ..vlm.internvl.utils import (build_multi_choice_prompt,
build_mcq_cot_prompt,
build_qa_cot_prompt,
build_mpo_prompt,
reorganize_prompt)
tgt_path = self.dump_image(line, dataset)
max_num = self.get_max_num(dataset)
if dataset is not None and DATASET_TYPE(dataset) == 'Y/N':
question = line['question']
if listinstr(['MME'], dataset):
prompt = question + ' Answer the question using a single word or phrase.'
elif listinstr(['HallusionBench', 'AMBER'], dataset):
prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
else:
prompt = question
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
prompt = build_multi_choice_prompt(line, dataset)
if os.getenv('USE_COT') == '1':
prompt = build_mcq_cot_prompt(line, prompt)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
question = line['question']
if listinstr(['LLaVABench', 'WildVision'], dataset):
prompt = question + '\nAnswer this question in detail.'
elif listinstr(['OCRVQA', 'TextVQA', 'ChartQA', 'DocVQA', 'InfoVQA', 'OCRBench',
'DUDE', 'SLIDEVQA', 'GQA', 'MMLongBench_DOC'], dataset):
prompt = question + '\nAnswer the question using a single word or phrase.'
elif listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet', 'MathVerse',
'MMDU', 'CRPE', 'MIA-Bench', 'MM-Math', 'DynaMath', 'QSpatial'], dataset):
prompt = question
if os.getenv('USE_COT') == '1':
prompt = build_qa_cot_prompt(line, prompt)
else:
prompt = question + '\nAnswer the question using a single word or phrase.'
else:
# VQA_ex_prompt: OlympiadBench, VizWiz
prompt = line['question']
if os.getenv('USE_COT') == '1':
prompt = build_qa_cot_prompt(line, prompt)
message = [dict(type='text', value=prompt)]
image_num = len(tgt_path)
max_num = max(1, min(max_num, 64 // image_num))
# TODO: support upscale_flag
message.extend([dict(type='image', value=s, max_dynamic_patch=max_num) for s in tgt_path])
if self.use_mpo_prompt:
message = build_mpo_prompt(message, line, dataset)
# reorganize_prompt
prompt = reorganize_prompt(message, image_num, dataset=dataset)
prompt = prompt.replace('<image>', '<IMAGE_TOKEN>')
message[0] = dict(type='text', value=prompt)
return message
def get_max_num(self, dataset):
assert dataset is not None
res_1_datasets = ['MMBench-Video', 'Video-MME', 'MVBench', 'Video']
res_12_datasets = ['ChartQA_TEST', 'MMMU_DEV_VAL', 'MMMU_TEST', 'MME-RealWorld',
'VCR_EN', 'VCR_ZH', 'OCRVQA']
res_18_datasets = ['DocVQA_VAL', 'DocVQA_TEST', 'DUDE', 'MMLongBench_DOC', 'SLIDEVQA']
res_24_datasets = ['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench', 'HRBench4K', 'HRBench8K']
if listinstr(res_1_datasets, dataset):
return 1
elif listinstr(res_12_datasets, dataset):
return 12
elif listinstr(res_18_datasets, dataset):
return 18
elif listinstr(res_24_datasets, dataset):
return 24
else:
return 6
class CogVLM2_PromptUtil:
def dump_image(self, line, dataset):
return self.dump_image_func(line)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
option_candidate = string.ascii_uppercase
options = {
cand: line[cand]
for cand in option_candidate
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if not cn_string(prompt):
prompt = prompt + '\n' + "Answer with the option's letter from the given choices directly."
else:
prompt = prompt + '\n' + '请直接回答选项字母。'
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=p) for p in tgt_path])
return message
class LMDeployWrapper(BaseAPI):
is_api: bool = True
custom_prompt: str = None
prompt_map = {
'cogvlm2': CogVLM2_PromptUtil(),
'internvl2': InternVL2_PromptUtil(),
'internvl2-8b-mpo-cot': InternVL2_PromptUtil(use_mpo_prompt=True),
}
def __init__(self,
retry: int = 5,
wait: int = 5,
key: str = 'sk-123456',
verbose: bool = True,
temperature: float = 0.0,
timeout: int = 60,
api_base: str = None,
system_prompt: str = None,
max_tokens: int = 1024,
**kwargs):
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
self.timeout = timeout
key = os.environ.get('LMDEPLOY_API_KEY', key)
api_base = os.environ.get('LMDEPLOY_API_BASE', api_base)
assert key is not None, 'Please set the environment variable LMDEPLOY_API_KEY.'
assert api_base is not None, 'Please set the environment variable LMDEPLOY_API_BASE.'
self.key = key
self.api_base = api_base
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
model_url = ''.join([api_base.split('v1')[0], 'v1/models'])
resp = requests.get(model_url)
self.model = resp.json()['data'][0]['id']
self.logger.info(f'lmdeploy evaluate model: {self.model}')
self.set_prompt_pattern(self.model)
if self.custom_prompt is not None:
self.logger.info(f'using custom prompt {self.custom_prompt}')
def set_dump_image(self, dump_image_func):
if self.custom_prompt in self.prompt_map:
self.prompt_map[self.custom_prompt].dump_image_func = dump_image_func
self.dump_image_func = dump_image_func
def use_custom_prompt(self, dataset):
if self.custom_prompt in self.prompt_map:
return self.prompt_map[self.custom_prompt].use_custom_prompt(dataset)
return False
def build_prompt(self, line, dataset=None):
if self.custom_prompt in self.prompt_map:
return self.prompt_map[self.custom_prompt].build_prompt(line, dataset)
raise NotImplementedError
def set_prompt_pattern(self, model_name):
if 'Phi-3.5-Vision'.lower() in model_name.lower():
self.max_tokens = 1000
self.temperature = 0.0
if 'cogvlm2-llama3-chat-19B'.lower() in model_name.lower():
self.max_tokens = 2048
self.temperature = 0.0
self.custom_prompt = 'cogvlm2'
if 'InternVL2-'.lower() in model_name.lower():
self.max_tokens = 1024
self.temperature = 0.0
self.custom_prompt = 'internvl2'
if 'internvl2-8b-mpo-cot'.lower() in model_name.lower():
self.use_mpo_prompt = True
self.max_tokens = 1024
self.temperature = 0.0
self.custom_prompt = 'internvl2-8b-mpo-cot'
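# Sketch of how the substring checks above select a prompt util (model ids are illustrative):
#   'cogvlm2-llama3-chat-19B'  -> custom_prompt 'cogvlm2'
#   'OpenGVLab/InternVL2-8B'   -> custom_prompt 'internvl2'
#   'InternVL2-8B-MPO-CoT'     -> custom_prompt 'internvl2-8b-mpo-cot' (also sets use_mpo_prompt)
# Matching is a case-insensitive substring test, so later branches may override earlier ones.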
def prepare_itlist(self, inputs):
assert np.all([isinstance(x, dict) for x in inputs])
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text':
content_list.append(dict(type='text', text=msg['value']))
elif msg['type'] == 'image':
from PIL import Image
img = Image.open(msg['value'])
b64 = encode_image_to_base64(img)
extra_args = msg.copy()
extra_args.pop('type')
extra_args.pop('value')
img_struct = dict(url=f'data:image/jpeg;base64,{b64}', **extra_args)
content_list.append(dict(type='image_url', image_url=img_struct))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
content_list = [dict(type='text', text=text)]
return content_list
def prepare_inputs(self, inputs):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(role='system', content=self.system_prompt))
assert isinstance(inputs, list) and isinstance(inputs[0], dict)
assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
if 'role' in inputs[0]:
assert inputs[-1]['role'] == 'user', inputs[-1]
for item in inputs:
input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
else:
input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
return input_msgs
def generate_inner(self, inputs, **kwargs) -> str:
input_msgs = self.prepare_inputs(inputs)
temperature = kwargs.pop('temperature', self.temperature)
max_tokens = kwargs.pop('max_tokens', self.max_tokens)
headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'}
payload = dict(
model=self.model,
messages=input_msgs,
max_tokens=max_tokens,
n=1,
temperature=temperature,
**kwargs)
response = requests.post(
self.api_base,
headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
try:
resp_struct = json.loads(response.text)
answer = resp_struct['choices'][0]['message']['content'].strip()
# for internvl2-8b-mpo-cot
if getattr(self, 'use_mpo_prompt', False):
from ..vlm.internvl.utils import mpo_post_processing
answer = mpo_post_processing(answer, kwargs.get('dataset'))
except Exception:
pass
return ret_code, answer, response
class LMDeployAPI(LMDeployWrapper):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def generate(self, message, dataset=None):
return super(LMDeployAPI, self).generate(message, dataset=dataset)
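# Example usage (a sketch; assumes a reachable LMDeploy api_server and that BaseAPI.generate
# forwards the interleaved message list to generate_inner and returns the answer text;
# the URL, key, and image path below are illustrative only):
#   os.environ['LMDEPLOY_API_BASE'] = 'http://127.0.0.1:23333/v1/chat/completions'
#   os.environ['LMDEPLOY_API_KEY'] = 'sk-123456'
#   api = LMDeployAPI(temperature=0.0, max_tokens=512)
#   answer = api.generate([
#       dict(type='image', value='demo.jpg'),
#       dict(type='text', value='Describe the image.'),
#   ])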
from http import HTTPStatus
import os
from vlmeval.api.base import BaseAPI
from vlmeval.smp import *
# Note: This is a pure language model API.
class QwenAPI(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'qwen-max-1201',
retry: int = 5,
wait: int = 5,
verbose: bool = True,
seed: int = 2680,
temperature: float = 0.0,
system_prompt: str = None,
key: str = None,
max_tokens: int = 1024,
proxy: str = None,
**kwargs):
assert model in ['qwen-turbo', 'qwen-plus', 'qwen-max', 'qwen-max-1201', 'qwen-max-longcontext']
self.model = model
import dashscope
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
self.seed = seed
if key is None:
key = os.environ.get('DASHSCOPE_API_KEY', None)
assert key is not None, (
'Please set the API Key (obtain it here: '
'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
)
dashscope.api_key = key
if proxy is not None:
proxy_set(proxy)
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
@staticmethod
def build_msgs(msgs_raw, system_prompt=None):
msgs = cp.deepcopy(msgs_raw)
ret = []
if system_prompt is not None:
ret.append(dict(role='system', content=system_prompt))
for i, msg in enumerate(msgs):
role = 'user' if i % 2 == 0 else 'assistant'
ret.append(dict(role=role, content=msg))
return ret
def generate_inner(self, inputs, **kwargs) -> str:
from dashscope import MultiModalConversation
assert isinstance(inputs, str) or isinstance(inputs, list)
inputs = [inputs] if isinstance(inputs, str) else inputs
messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt)
import dashscope
response = dashscope.Generation.call(
model=self.model,
messages=messages,
seed=self.seed,
temperature=self.temperature,
max_tokens=self.max_tokens,
result_format='message', # set the result to be "message" format.
)
if response.status_code != HTTPStatus.OK:
return -1, 'Error: Bad Response Status Code. ', f'The response status code is {response.status_code}. '
try:
return 0, response['output']['choices'][0]['message']['content'].strip(), 'Succeeded! '
except Exception as err:
return -1, f'Error: Failed to parse the response. {err}', response
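# Example usage (a sketch; requires a valid DASHSCOPE_API_KEY and assumes BaseAPI.generate
# forwards the raw message list to generate_inner and returns the answer text):
#   api = QwenAPI(model='qwen-max-1201', temperature=0.0)
#   answer = api.generate(['What is the capital of France?'])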
from __future__ import annotations
import os
import warnings
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from vlmeval.vlm.qwen2_vl.prompt import Qwen2VLPromptMixin
def ensure_image_url(image: str) -> str:
prefixes = ['http://', 'https://', 'file://', 'data:image;']
if any(image.startswith(prefix) for prefix in prefixes):
return image
if os.path.exists(image):
return 'file://' + image
raise ValueError(f'Invalid image: {image}')
class Qwen2VLAPI(Qwen2VLPromptMixin, BaseAPI):
is_api: bool = True
def __init__(
self,
model: str = 'qwen-vl-max-0809',
key: str | None = None,
min_pixels: int | None = None,
max_pixels: int | None = None,
max_length=1024,
top_p=0.001,
top_k=1,
temperature=0.01,
repetition_penalty=1.0,
presence_penalty=0.0,
seed=3407,
use_custom_prompt: bool = True,
**kwargs,
):
import dashscope
self.model = model
self.min_pixels = min_pixels
self.max_pixels = max_pixels
self.generate_kwargs = dict(
max_length=max_length,
top_p=top_p,
top_k=top_k,
temperature=temperature,
repetition_penalty=repetition_penalty,
presence_penalty=presence_penalty,
seed=seed,
)
key = os.environ.get('DASHSCOPE_API_KEY', None) if key is None else key
assert key is not None, (
'Please set the API Key (obtain it here: '
'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
)
dashscope.api_key = key
super().__init__(use_custom_prompt=use_custom_prompt, **kwargs)
def _prepare_content(self, inputs: list[dict[str, str]], dataset: str | None = None) -> list[dict[str, str]]:
"""
inputs list[dict[str, str]], each dict has keys: ['type', 'value']
"""
content = []
for s in inputs:
if s['type'] == 'image':
item = {'type': 'image', 'image': ensure_image_url(s['value'])}
if dataset == 'OCRBench':
item['min_pixels'] = 10 * 10 * 28 * 28
warnings.warn(f"OCRBench dataset uses custom min_pixels={item['min_pixels']}")
if self.max_pixels is not None:
item['max_pixels'] = self.max_pixels
else:
if self.min_pixels is not None:
item['min_pixels'] = self.min_pixels
if self.max_pixels is not None:
item['max_pixels'] = self.max_pixels
elif s['type'] == 'text':
item = {'type': 'text', 'text': s['value']}
else:
raise ValueError(f"Invalid message type: {s['type']}, {s}")
content.append(item)
return content
def generate_inner(self, inputs, **kwargs) -> str:
import dashscope
messages = []
if self.system_prompt is not None:
messages.append({'role': 'system', 'content': self.system_prompt})
messages.append(
{'role': 'user', 'content': self._prepare_content(inputs, dataset=kwargs.get('dataset', None))}
)
if self.verbose:
print(f'\033[31m{messages}\033[0m')
# generate
generation_kwargs = self.generate_kwargs.copy()
kwargs.pop('dataset', None)
generation_kwargs.update(kwargs)
try:
response = dashscope.MultiModalConversation.call(
model=self.model,
messages=messages,
**generation_kwargs,
)
if self.verbose:
print(response)
answer = response.output.choices[0]['message']['content'][0]['text']
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(f'{type(err)}: {err}')
self.logger.error(f'The input messages are {inputs}.')
return -1, '', ''
class QwenVLWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'qwen-vl-plus',
retry: int = 5,
wait: int = 5,
key: str = None,
verbose: bool = True,
temperature: float = 0.0,
system_prompt: str = None,
max_tokens: int = 1024,
proxy: str = None,
**kwargs):
assert model in ['qwen-vl-plus', 'qwen-vl-max']
self.model = model
import dashscope
self.fail_msg = 'Failed to obtain answer via API. '
self.max_tokens = max_tokens
self.temperature = temperature
if key is None:
key = os.environ.get('DASHSCOPE_API_KEY', None)
assert key is not None, (
'Please set the API Key (obtain it here: '
'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
)
dashscope.api_key = key
if proxy is not None:
proxy_set(proxy)
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
# inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
# content can be a string or a list of image & text
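# e.g. a flat single-turn input (illustrative values):
#   [dict(type='image', value='/path/to/img.jpg'), dict(type='text', value='What is in the image?')]
# or a role-tagged multi-turn input, where the last item must have role 'user':
#   [dict(role='user', content=[...]), dict(role='assistant', content=[...]), dict(role='user', content=[...])]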
def prepare_itlist(self, inputs):
assert np.all([isinstance(x, dict) for x in inputs])
has_images = np.sum([x['type'] == 'image' for x in inputs])
if has_images:
content_list = []
for msg in inputs:
if msg['type'] == 'text':
content_list.append(dict(text=msg['value']))
elif msg['type'] == 'image':
content_list.append(dict(image='file://' + msg['value']))
else:
assert all([x['type'] == 'text' for x in inputs])
text = '\n'.join([x['value'] for x in inputs])
content_list = [dict(text=text)]
return content_list
def prepare_inputs(self, inputs):
input_msgs = []
if self.system_prompt is not None:
input_msgs.append(dict(role='system', content=self.system_prompt))
assert isinstance(inputs, list) and isinstance(inputs[0], dict)
assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
if 'role' in inputs[0]:
assert inputs[-1]['role'] == 'user', inputs[-1]
for item in inputs:
input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
else:
input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
return input_msgs
def generate_inner(self, inputs, **kwargs) -> str:
from dashscope import MultiModalConversation
assert isinstance(inputs, str) or isinstance(inputs, list)
if 'type' in inputs[0]:
pure_text = np.all([x['type'] == 'text' for x in inputs])
else:
pure_text = True
for inp in inputs:
if not np.all([x['type'] == 'text' for x in inp['content']]):
pure_text = False
break
assert not pure_text
messages = self.prepare_inputs(inputs)
gen_config = dict(max_output_tokens=self.max_tokens, temperature=self.temperature)
gen_config.update(kwargs)
try:
response = MultiModalConversation.call(model=self.model, messages=messages)
if self.verbose:
print(response)
answer = response.output.choices[0]['message']['content'][0]['text']
return 0, answer, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error(f'{type(err)}: {err}')
self.logger.error(f'The input messages are {inputs}.')
return -1, '', ''
class QwenVLAPI(QwenVLWrapper):
def generate(self, message, dataset=None):
return super(QwenVLAPI, self).generate(message)
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from time import sleep
import mimetypes
class Reka_Wrapper(BaseAPI):
is_api: bool = True
INTERLEAVE: bool = False
def __init__(self,
model: str = 'reka-flash-20240226',
key: str = None,
retry: int = 10,
wait: int = 3,
system_prompt: str = None,
verbose: bool = True,
temperature: float = 0,
max_tokens: int = 1024,
**kwargs):
try:
import reka
except ImportError:
raise ImportError('Please install reka by running "pip install reka-api"')
self.model = model
default_kwargs = dict(temperature=temperature, request_output_len=max_tokens)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
if key is not None:
self.key = key
else:
self.key = os.environ.get('REKA_API_KEY', '')
super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)
def generate_inner(self, inputs, **kwargs) -> str:
import reka
reka.API_KEY = self.key
dataset = kwargs.pop('dataset', None)
prompt, image_path = self.message_to_promptimg(inputs, dataset=dataset)
image_b64 = encode_image_file_to_base64(image_path)
response = reka.chat(
model_name=self.model,
human=prompt,
media_url=f'data:image/jpeg;base64,{image_b64}',
**self.kwargs)
try:
return 0, response['text'], response
except Exception as err:
return -1, self.fail_msg + str(err), response
class Reka(Reka_Wrapper):
def generate(self, message, dataset=None):
return super(Reka_Wrapper, self).generate(message)
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from vlmeval.dataset import img_root_map
from vlmeval.dataset import DATASET_TYPE
class SenseChatVisionWrapper(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'SenseChat-5-Vision',
retry: int = 5,
wait: int = 5,
ak: str = None,
sk: str = None,
verbose: bool = True,
system_prompt: str = None,
max_tokens: int = 1024,
proxy: str = None,
**kwargs):
self.model = model
self.fail_msg = 'Failed to obtain answer via API. '
self.ak = os.environ.get('SENSECHAT_AK', None) if ak is None else ak
self.sk = os.environ.get('SENSECHAT_SK', None) if sk is None else sk
assert self.ak is not None and self.sk is not None
self.max_new_tokens = max_tokens
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
def dump_image(self, line, dataset):
"""Dump the image(s) of the input line to the corresponding dataset folder.
Args:
line (line of pd.DataFrame): The raw input line.
dataset (str): The name of the dataset.
Returns:
str | list[str]: The paths of the dumped images.
"""
ROOT = LMUDataRoot()
assert isinstance(dataset, str)
img_root = osp.join(ROOT, 'images', img_root_map(dataset))
os.makedirs(img_root, exist_ok=True)
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = osp.join(img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
def image_to_base64(self, image_path):
import base64
with open(image_path, 'rb') as image_file:
encoded_string = base64.b64encode(image_file.read())
return encoded_string.decode('utf-8')
def encode_jwt_token(self, ak, sk):
import jwt
headers = {'alg': 'HS256', 'typ': 'JWT'}
payload = {
'iss': ak,
'exp': int(time.time())
+ 1800,  # set to your desired expiration time; this example uses the current time + 30 minutes
'nbf': int(time.time()) - 5,  # set to your desired not-before time; this example uses the current time - 5 seconds
}
token = jwt.encode(payload, sk, headers=headers)
return token
def use_custom_prompt(self, dataset):
return True
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += '\n请直接回答选项字母。' if cn_string(
prompt) else "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
return prompt
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if dataset is not None and listinstr(['MME'], dataset):
question = line['question']
prompt = question + ' Answer the question using a single word or phrase.'
elif dataset is not None and listinstr(['HallusionBench'], dataset):
question = line['question']
prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ' and 'MMMU' not in dataset:
prompt = self.build_multi_choice_prompt(line, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
if 'MathVista' in dataset:
prompt = line['question']
elif listinstr(['LLaVABench'], dataset):
question = line['question']
prompt = question + '\nAnswer this question in detail.'
elif listinstr(['MMVet'], dataset):
prompt = line['question']
else:
question = line['question']
prompt = question + '\nAnswer the question using a single word or phrase.'
elif dataset is not None and 'MMMU' in dataset:
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = {
'multiple-choice': 'Answer with carefully thought step by step. Apply the thinking process recursively at both macro and micro levels. Verify consistency of reasoning and look for potential flaws or gaps during thinking. When realize mistakes, explain why the previous thinking was incorrect, fix it and then continue thinking.\n\n', # noqa
'open': 'Answer with carefully thought step by step. Apply the thinking process recursively at both macro and micro levels. Verify consistency of reasoning and look for potential flaws or gaps during thinking. When realize mistakes, explain why the previous thinking was incorrect, fix it and then continue thinking.\n\n' # noqa
}
subject = '_'.join(line['id'].split('_')[1:-1])
prompt = prompt[line['question_type']].format(subject, subject) + '\n' + question
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def message_to_promptimg(self, message, dataset=None):
if dataset is None or listinstr(['MMMU', 'BLINK'], dataset):
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = [[x['value'] for x in message if x['type'] == 'image'][0]]
else:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = [x['value'] for x in message if x['type'] == 'image']
return prompt, image
def generate_inner(self, inputs, **kwargs) -> str:
assert isinstance(inputs, str) or isinstance(inputs, list)
inputs = [inputs] if isinstance(inputs, str) else inputs
dataset = kwargs.get('dataset', None)
if dataset is not None and listinstr(['ChartQA_TEST', 'MathVista_MINI'], dataset):
self.max_num = 12
elif dataset is not None and listinstr(['DocVQA_VAL', 'DocVQA_TEST'], dataset):
self.max_num = 18
elif dataset is not None and listinstr(['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench'], dataset):
self.max_num = 24
else:
self.max_num = 6
if dataset is None:
pass
elif listinstr(['AI2D_TEST'], dataset):
self.max_new_tokens = 10
elif 'MMMU' in dataset:
self.max_new_tokens = 4096 # 1024
elif 'MMBench' in dataset:
self.max_new_tokens = 100
elif 'MathVista_MINI' in dataset:
self.max_new_tokens = 4096
prompt, image = self.message_to_promptimg(message=inputs, dataset=dataset)
url = 'https://api.sensenova.cn/v1/llm/chat-completions'
api_secret_key = self.encode_jwt_token(self.ak, self.sk)
content = [{
'image_base64': self.image_to_base64(item),
'image_file_id': '',
'image_url': '',
'text': '',
'type': 'image_base64'
} for item in image]
content.append({
'image_base64': '',
'image_file_id': '',
'image_url': '',
'text': prompt,
'type': 'text'
})
message = [{'content': content, 'role': 'user'}]
data = {
'messages': message,
'max_new_tokens': self.max_new_tokens, # 1024
'temperature': 0,
"top_k": 0,
"top_p": 0.99,
'repetition_penalty': 1.05,
'model': self.model,
'stream': False,
}
headers = {
'Content-type': 'application/json',
'Authorization': 'Bearer ' + api_secret_key
}
response = requests.post(
url,
headers=headers,
json=data,
)
request_id = response.headers['x-request-id']
time.sleep(1)
try:
assert response.status_code == 200
response = response.json()['data']['choices'][0]['message'].strip()
if self.verbose:
self.logger.info(f'inputs: {inputs}\nanswer: {response}')
return 0, response, 'Succeeded! '
except Exception as err:
if self.verbose:
self.logger.error('---------------------------ERROR---------------------------')
self.logger.error(response.json())
self.logger.error(err)
self.logger.error('---------------------------request_id---------------------------' + request_id)
self.logger.error(
'api error' + response.json()['error']['message']
+ str([input['value'] if input['type'] == 'image' else None for input in inputs])
)
self.logger.error(f'The input messages are {inputs}.')
return -1, response.json()['error']['message'], ''
class SenseChatVisionAPI(SenseChatVisionWrapper):
def generate(self, message, dataset=None):
return super(SenseChatVisionAPI, self).generate(message, dataset=dataset)
import math
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from vlmeval.dataset import img_root_map
API_BASE = "https://api.siliconflow.cn/v1/chat/completions"
def resize_image(image: Image.Image, max_height: int, max_width: int) -> Image.Image:
width, height = image.size
if min(width, height) < 50:
scale = 50 / min(width, height)
image = image.resize((int(width * scale), int(height * scale)))
width, height = image.size  # refresh dimensions so the pixel budget below uses the upscaled size
current_pixels = width * height
if current_pixels <= max_height * max_width:
return image
scale = math.sqrt(max_height * max_width / current_pixels)
new_width = int(width * scale)
new_height = int(height * scale)
return image.resize((new_width, new_height), Image.Resampling.LANCZOS)
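# Worked example of the pixel-budget scaling above (illustrative numbers): a 4000x3000 image with
# max_height = max_width = 1024 gives scale = sqrt(1024 * 1024 / 12_000_000) ~= 0.2956, so the image
# is resized to roughly 1182x886 (~1.05M pixels), just under the 1024 * 1024 budget.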
def encode_image(path: str, max_height: int = 1024, max_width: int = 1024) -> str:
image = Image.open(path).convert("RGB")
image = resize_image(image, max_height, max_width)
width, height = image.size  # Image.size returns (width, height)
if min(height, width) < 50:
scale = 50 / min(width, height)
image = image.resize((int(width * scale), int(height * scale)))
buffered = io.BytesIO()
image.save(buffered, format="PNG")
img_bytes = buffered.getvalue()
img_base64 = base64.b64encode(img_bytes).decode("utf-8")
return img_base64
class SiliconFlowAPI(BaseAPI):
is_api: bool = True
def __init__(
self,
model: str = "deepseek-ai/DeepSeek-V2.5",
retry: int = 5,
wait: int = 5,
key: str = None,
api_base: str = API_BASE,
verbose: bool = True,
system_prompt: str = None,
timeout: int = 60,
**kwargs,
):
self.model = model
self.api_base = api_base
default_kwargs = {
"stream": False,
"temperature": 0,
"n": 1,
"max_tokens": 1280,
}
for k, v in default_kwargs.items():
if k not in kwargs:
kwargs[k] = default_kwargs[k]
if key is not None:
self.key = key
else:
self.key = os.environ.get("SiliconFlow_API_KEY", "")
headers = {"Authorization": "Bearer {}", "Content-Type": "application/json"}
headers["Authorization"] = headers["Authorization"].format(self.key)
self.headers = headers
super().__init__(
wait=wait,
retry=retry,
system_prompt=system_prompt,
verbose=verbose,
**kwargs,
)
@staticmethod
def build_msgs(msgs_raw):
messages = []
message = {"role": "user", "content": []}
image_b64 = None
for msg in msgs_raw:
if msg["type"] == "image" and not image_b64:
image_b64 = encode_image(msg["value"])
message["content"].append(
{"image_url": {"url": image_b64}, "type": "image_url"}
)
elif msg["type"] == "text":
message["content"].append({"text": msg["value"], "type": "text"})
messages.append(message)
return messages
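# Sketch of the OpenAI-style message produced by build_msgs for one image plus one text item
# (values are illustrative):
#   [{'role': 'user', 'content': [
#       {'image_url': {'url': '<base64-encoded PNG>'}, 'type': 'image_url'},
#       {'text': 'Describe the image.', 'type': 'text'}]}]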
def generate_inner(self, inputs, **kwargs) -> str:
default_kwargs = self.default_kwargs.copy()  # copy so per-call kwargs don't mutate the shared defaults
default_kwargs.update(kwargs)
payload = dict(
model=self.model,
messages=self.build_msgs(msgs_raw=inputs),
**default_kwargs,
)
response = requests.post(
self.api_base, headers=self.headers, data=json.dumps(payload)
)
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
try:
resp_struct = json.loads(response.text)
answer = resp_struct["choices"][0]["message"]["content"].strip()
except Exception:
pass
return ret_code, answer, response
class TeleMMAPI(SiliconFlowAPI):
is_api: bool = True
def __init__(
self,
model: str = "TeleAI/TeleMM",
key: str = None,
max_height: int = 1280,
max_width: int = 784,
**kwargs,
):
super().__init__(model=model, key=key, **kwargs)
self.max_height = max_height
self.max_width = max_width
def dump_image(self, line, dataset):
"""Dump the image(s) of the input line to the corresponding dataset folder.
Args:
line (line of pd.DataFrame): The raw input line.
dataset (str): The name of the dataset.
Returns:
str | list[str]: The paths of the dumped images.
"""
ROOT = LMUDataRoot()
assert isinstance(dataset, str)
# img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset)
img_root = osp.join(ROOT, "images", img_root_map(dataset))
os.makedirs(img_root, exist_ok=True)
if "image" in line:
if isinstance(line["image"], list):
tgt_path = []
assert "image_path" in line
for img, im_name in zip(line["image"], line["image_path"]):
path = osp.join(img_root, im_name)
if not read_ok(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = osp.join(img_root, f"{line['index']}.jpg")
if not read_ok(tgt_path):
decode_base64_to_image_file(line["image"], tgt_path)
tgt_path = [tgt_path]
else:
assert "image_path" in line
tgt_path = toliststr(line["image_path"])
return tgt_path
def _prepare_content(
self, inputs: list[dict[str, str]], dataset: str = None
) -> list[dict[str, str]]:
"""
inputs list[dict[str, str]], each dict has keys: ['type', 'value']
"""
content = []
has_image = False
for s in inputs:
if s["type"] == "image":
if not has_image:
item = {
"type": "image_url",
"image_url": {
"url": encode_image(
s["value"],
max_height=self.max_height,
max_width=self.max_width,
)
},
}
has_image = True
else:
continue
elif s["type"] == "text":
prompt = s["value"]
if len(prompt) == 0:
continue
if dataset == "HallusionBench":
prompt += " Please answer yes or no directly, without any unnecessary explanation."
elif dataset == "OCRBench":
prompt = (
prompt + "\nExtract the text from the image intactly and "
+ "answer the question concisely and clearly if possible."
)
elif (
dataset == "AI2D_TEST"
or dataset == "MMStar"
or dataset == "MMBench_TEST_EN_V11"
or dataset == "MMVet"
):
prompt = prompt.replace(
"Please select the correct answer from the options above. \n",
"Please select the correct option from the above choices based on the "
+ "input image and question. The final output should only be one option, such as 'A'",
)
elif dataset == "MMBench_TEST_CN_V11":
prompt = prompt.replace(
"Please select the correct answer from the options above. \n",
"请根据输入图像和问题从上述选项中选择正确选项,最终的输出只有一个选项,例如'A'",
)
item = {"type": "text", "text": prompt}
else:
raise ValueError(f"Invalid message type: {s['type']}, {s}")
content.append(item)
return content
def generate_inner(self, inputs, **kwargs) -> str:
default_kwargs = self.default_kwargs.copy()  # copy so per-call kwargs don't mutate the shared defaults
default_kwargs.update(kwargs)
messages = []
messages.append(
{
"role": "user",
"content": self._prepare_content(
inputs, dataset=kwargs.get("dataset", None)
),
}
)
payload = dict(model=self.model, messages=messages, **default_kwargs)
response = requests.post(
self.api_base, headers=self.headers, data=json.dumps(payload)
)
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
try:
resp_struct = json.loads(response.text)
answer = resp_struct["choices"][0]["message"]["content"].strip()
return ret_code, answer, response
except Exception as err:
import traceback
traceback.print_exc()
if self.verbose:
self.logger.error(f"{type(err)}: {err}")
self.logger.error(f"The input messages are {inputs}.")
return -1, "", ""
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
url = 'https://api.stepfun.com/v1/chat/completions'
headers = {
'Content-Type': 'application/json',
'Authorization': 'Bearer {}',
}
class StepAPI_INT(BaseAPI):
is_api: bool = True
def __init__(self,
model: str = 'step-1v-8k',
retry: int = 10,
wait: int = 3,
key: str = None,
temperature: float = 0,
max_tokens: int = 300,
verbose: bool = True,
system_prompt: str = None,
**kwargs):
self.model = model
self.fail_msg = 'Fail to obtain answer via API.'
self.headers = headers
self.temperature = temperature
self.max_tokens = max_tokens
self.system_prompt = system_prompt
if key is not None:
self.key = key
else:
self.key = os.environ.get('STEPAI_API_KEY', '')
headers['Authorization'] = headers['Authorization'].format(self.key)
super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)
@staticmethod
def build_msgs(msgs_raw):
messages = []
message = {'role': 'user', 'content': []}
for msg in msgs_raw:
if msg['type'] == 'image':
image_b64 = encode_image_file_to_base64(msg['value'])
message['content'].append({
'image_url': {'url': 'data:image/webp;base64,%s' % (image_b64)},
'type': 'image_url'
})
elif msg['type'] == 'text':
message['content'].append({
'text': msg['value'],
'type': 'text'
})
messages.append(message)
return messages
def generate_inner(self, inputs, **kwargs) -> str:
print(inputs, '\n')
payload = dict(
model=self.model,
max_tokens=self.max_tokens,
temperature=self.temperature,
messages=self.build_msgs(msgs_raw=inputs),
**kwargs)
response = requests.post(url, headers=headers, data=json.dumps(payload))
ret_code = response.status_code
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
answer = self.fail_msg
try:
resp_struct = json.loads(response.text)
answer = resp_struct['choices'][0]['message']['content'].strip()
except Exception as err:
if self.verbose:
self.logger.error(f'{type(err)}: {err}')
self.logger.error(response.text if hasattr(response, 'text') else response)
return ret_code, answer, response
class Step1V_INT(StepAPI_INT):
def generate(self, message, dataset=None):
return super(StepAPI_INT, self).generate(message)