Commit bc5ebf0f authored by luopl

Initial commit

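vlmeval/__init__.py (the package root, inferred from the relative imports below):
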
try:
    import torch
except ImportError:
    pass

from .smp import *
from .api import *
from .dataset import *
from .utils import *
from .vlm import *
from .config import *
from .tools import cli

load_env()

__version__ = '0.2rc1'
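
vlmeval/api/__init__.py (inferred: these are the wrappers re-exported above via `from .api import *`):
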
from .gpt import OpenAIWrapper, GPT4V
from .hf_chat_model import HFChatModel
from .gemini import GeminiWrapper, GeminiProVision
from .qwen_vl_api import QwenVLWrapper, QwenVLAPI, Qwen2VLAPI
from .qwen_api import QwenAPI
from .claude import Claude_Wrapper, Claude3V
from .reka import Reka
from .glm_vision import GLMVisionAPI
from .cloudwalk import CWWrapper
from .sensechat_vision import SenseChatVisionAPI
from .siliconflow import SiliconFlowAPI, TeleMMAPI
from .hunyuan import HunyuanVision
from .bailingmm import bailingMMAPI
from .bluelm_v_api import BlueLMWrapper, BlueLM_V_API
from .jt_vl_chat import JTVLChatAPI
from .taiyi import TaiyiAPI
from .lmdeploy import LMDeployAPI
from .taichu import TaichuVLAPI

__all__ = [
    'OpenAIWrapper', 'HFChatModel', 'GeminiWrapper', 'GPT4V',
    'GeminiProVision', 'QwenVLWrapper', 'QwenVLAPI', 'QwenAPI',
    'Claude3V', 'Claude_Wrapper', 'Reka', 'GLMVisionAPI',
    'CWWrapper', 'SenseChatVisionAPI', 'HunyuanVision', 'Qwen2VLAPI',
    'BlueLMWrapper', 'BlueLM_V_API', 'JTVLChatAPI', 'bailingMMAPI',
    'TaiyiAPI', 'TeleMMAPI', 'SiliconFlowAPI', 'LMDeployAPI',
    'TaichuVLAPI'
]
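
vlmeval/api/bailingmm.py (named by the `from .bailingmm import bailingMMAPI` line above):
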
import base64
import time

from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from vlmeval.dataset import DATASET_TYPE
from vlmeval.smp.vlm import encode_image_file_to_base64


class bailingMMWrapper(BaseAPI):

    is_api: bool = True

    def __init__(self,
                 model: str,
                 retry: int = 5,
                 wait: int = 5,
                 key: str = None,
                 verbose: bool = True,
                 system_prompt: str = None,
                 max_tokens: int = 1024,
                 proxy: str = None,
                 **kwargs):
        self.model = model
        self.fail_msg = 'Failed to obtain answer via bailingMM API.'
        # Fall back to the environment variable when no key is passed in.
        if key is None:
            key = os.environ.get('BAILINGMM_API_KEY', None)
        assert key is not None, 'Please set the API Key for bailingMM.'
        self.key = key
        self.headers = {"Content-Type": "application/json"}
        super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)

    def image_to_base64(self, image_path):
        with open(image_path, 'rb') as image_file:
            encoded_string = str(base64.b64encode(image_file.read()), 'utf-8')
        return encoded_string

    def prepare_inputs(self, inputs):
        msgs = cp.deepcopy(inputs)
        content = []
        for msg in msgs:
            # Text entries pass through unchanged; image entries have their
            # file path replaced by the base64-encoded image bytes.
            if msg['type'] != 'text':
                try:
                    image_data = self.image_to_base64(msg['value'])
                except Exception as e:
                    if self.verbose:
                        self.logger.error(e)
                    image_data = ''
                msg['value'] = image_data
            content.append(msg)
        return content

    def generate_inner(self, inputs, **kwargs):
        assert isinstance(inputs, str) or isinstance(inputs, list)
        start = time.time()
        inputs = [inputs] if isinstance(inputs, str) else inputs
        messages = self.prepare_inputs(inputs)
        service_url = "https://bailingchat.alipay.com/api/proxy/eval/antgmm/completions"
        payload = {
            "structInput": messages,
            "sk": self.key,
            "timeout": 180000
        }
        response = requests.post(service_url, headers=self.headers, json=payload)
        if self.verbose:
            self.logger.info(f'Time for requesting is: {time.time() - start:.2f}s')
        try:
            assert response.status_code == 200
            output = json.loads(response.text)
            answer = output['preds']['pred']
            if self.verbose:
                self.logger.info(f'inputs: {inputs}\nanswer: {answer}')
            return 0, answer, 'Succeeded! '
        except Exception as e:
            if self.verbose:
                self.logger.error(e)
                self.logger.error(f'The input messages are {inputs}.')
            return -1, self.fail_msg, ''


class bailingMMAPI(bailingMMWrapper):

    def generate(self, message, dataset=None):
        return super(bailingMMAPI, self).generate(message, dataset=dataset)
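
A minimal usage sketch for the wrapper above, assuming BAILINGMM_API_KEY is set and the Alipay endpoint is reachable; the model name and image path are placeholders, and BaseAPI is assumed to retry generate_inner and hand back the answer string:

# Hypothetical usage; requires BAILINGMM_API_KEY and network access.
from vlmeval.api import bailingMMAPI

model = bailingMMAPI(model='bailingMM')  # placeholder model name
message = [
    {'type': 'image', 'value': '/path/to/demo.jpg'},
    {'type': 'text', 'value': 'What is shown in the image?'},
]
# generate() is defined above; BaseAPI presumably retries generate_inner()
# until it returns status 0.
answer = model.generate(message)
print(answer)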
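
vlmeval/api/bluelm_v_api.py (named by the `from .bluelm_v_api import BlueLMWrapper, BlueLM_V_API` line above):
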
import base64
import json
import os

import requests

from vlmeval.smp import *
from vlmeval.api.base import BaseAPI


def multimodal(images, text, url, key, temperature=0, max_tokens=1024, history=None):
    # Note: `history` is currently unused by the request payload.
    if images:
        pics = []
        for image in images:
            with open(image, 'rb') as f:
                pic = base64.b64encode(f.read()).decode('utf-8')
            pics.append(pic)
        data = {'images': pics, 'text': text, 'key': key, 'temperature': temperature, 'max_new_tokens': max_tokens}
    else:
        data = {'text': text, 'key': key, 'temperature': temperature, 'max_new_tokens': max_tokens}
    response = requests.post(url, json=data, headers={'Content-Type': 'application/json'})
    response = json.loads(response.text)
    return response
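
The endpoint is presumably expected to reply with JSON carrying the answer under a `result` key; `generate_inner` below reads `response['result']` accordingly.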


class BlueLMWrapper(BaseAPI):

    is_api: bool = True

    def __init__(self,
                 model: str = 'BlueLM-V-v3.0',
                 retry: int = 5,
                 wait: int = 5,
                 verbose: bool = True,
                 temperature: float = 0.0,
                 system_prompt: str = None,
                 max_tokens: int = 1024,
                 key: str = None,
                 url: str = 'http://api-ai.vivo.com.cn/multimodal',
                 **kwargs):
        self.model = model
        self.fail_msg = 'Failed to obtain answer via BlueLM-V API. '
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.url = url
        self.key = key
        if self.key is None:
            self.key = os.environ.get('BLUELM_V_API_KEY', None)
        assert self.key is not None, (
            'Please set the API Key for BlueLM-V '
            '(to obtain one, contact shuai.ren@vivo.com by email).'
        )
        super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)

    def message_to_promptimg(self, message, dataset=None):
        # Flatten the interleaved message into one text prompt plus the image
        # path(s); multi-image messages keep '<image>' placeholders in the text.
        num_images = len([x for x in message if x['type'] == 'image'])
        if num_images == 0:
            prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
            image = None
        elif num_images == 1:
            prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
            image = [x['value'] for x in message if x['type'] == 'image']
        else:
            prompt = '\n'.join([x['value'] if x['type'] == 'text' else '<image>' for x in message])
            if dataset == 'BLINK':
                image = concat_images_vlmeval(
                    [x['value'] for x in message if x['type'] == 'image'],
                    target_size=512)
            else:
                image = [x['value'] for x in message if x['type'] == 'image']

        # Dataset-specific prompt rewrites.
        if dataset in ['MMBench_DEV_EN_V11', 'MMBench_DEV_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11',
                       'AI2D_TEST', 'AI2D_TEST_TO_MASK', 'MMMU_DEV_VAL']:
            prompt = prompt.replace('Please select the correct answer from the options above.',
                                    "Answer with the option's letter from the given choices directly.")
        elif dataset in ['ChartQA_TEST']:
            prompt = prompt.replace('Answer the question using a single word or phrase.',
                                    'Answer the question using a single number or phrase.')
        elif dataset in ['DocVQA_VAL', 'DocVQA_TEST']:
            prompt = prompt.replace('Answer the question using a single word or phrase.',
                                    'Give the short answer directly.')
        elif dataset in ['TextVQA_VAL']:
            prompt = prompt.replace('Answer the question using a single word or phrase.',
                                    "When the provided information is insufficient, respond with 'Unanswerable'. "
                                    'Answer the question using a single word or phrase.')
        elif dataset in ['MTVQA_TEST']:
            prompt = prompt.replace('\nAnswer the question using a word or phrase in the language of the question.', '')
        elif dataset in ['MathVista_MINI']:
            if 'Choices:' in prompt:
                prompt = prompt.replace('Choices:', 'Options:').replace('Hint:', 'Context:')
                for i in range(1, 7):  # rewrite option markers (A) ~ (F) as A. ~ F.
                    prompt = prompt.replace(f'({chr(64 + i)})', f'{chr(64 + i)}.')
                prompt += "\nAnswer with the option's letter from the given choices directly."
            else:
                prompt += '\nAnswer the question using a single word or phrase.'
        return prompt, image

    def generate_inner(self, inputs, **kwargs):
        assert isinstance(inputs, str) or isinstance(inputs, list)
        pure_text = np.all([x['type'] == 'text' for x in inputs])
        assert not pure_text, 'BlueLM-V expects at least one image in the message.'
        prompt, image_path = self.message_to_promptimg(inputs, kwargs['dataset'])
        try:
            response = multimodal(image_path, prompt, self.url, self.key, self.temperature, self.max_tokens)
            answer = response['result']
            return 0, answer, 'Succeeded! '
        except Exception as err:
            if self.verbose:
                self.logger.error(f'{type(err)}: {err}')
                self.logger.error(f'The input messages are {inputs}.')
            return -1, self.fail_msg, ''


class BlueLM_V_API(BlueLMWrapper):

    def generate(self, message, dataset=None):
        return super(BlueLM_V_API, self).generate(message, dataset=dataset)
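
A small sketch of how `message_to_promptimg` flattens a multi-image message; the dummy key only satisfies the constructor's assertion (BaseAPI is assumed to construct without network access), and no request is sent:

# Illustrative only; 'dummy-key' is a placeholder and no API call is made.
wrapper = BlueLMWrapper(key='dummy-key')
prompt, image = wrapper.message_to_promptimg([
    {'type': 'image', 'value': 'img1.jpg'},
    {'type': 'image', 'value': 'img2.jpg'},
    {'type': 'text', 'value': 'Which image is brighter?'},
])
# With multiple images and no dataset-specific rewrite, the text keeps
# '<image>' placeholders:
# prompt == '<image>\n<image>\nWhich image is brighter?'
# image  == ['img1.jpg', 'img2.jpg']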

(The remaining file diffs in this commit are collapsed and not shown.)