Commit bc5ebf0f authored by luopl

Initial commit
import json
import pickle
import pandas as pd
import os
import csv
import hashlib
import os.path as osp
import time
import numpy as np
import validators
import mimetypes
import multiprocessing as mp
from .misc import toliststr
from .vlm import decode_base64_to_image_file
def decode_img_omni(tup):
root, im, p = tup
images = toliststr(im)
paths = toliststr(p)
if len(images) > 1 and len(paths) == 1:
paths = [osp.splitext(p)[0] + f'_{i}' + osp.splitext(p)[1] for i in range(len(images))]
assert len(images) == len(paths)
paths = [osp.join(root, p) for p in paths]
for p, im in zip(paths, images):
if osp.exists(p):
continue
if isinstance(im, str) and len(im) > 64:
decode_base64_to_image_file(im, p)
return paths
def localize_df(data, dname, nproc=32):
assert 'image' in data
indices = list(data['index'])
indices_str = [str(x) for x in indices]
images = list(data['image'])
image_map = {x: y for x, y in zip(indices_str, images)}
root = LMUDataRoot()
root = osp.join(root, 'images', dname)
os.makedirs(root, exist_ok=True)
if 'image_path' in data:
img_paths = list(data['image_path'])
else:
img_paths = []
for i in indices_str:
if len(image_map[i]) <= 64:
idx = image_map[i]
assert idx in image_map and len(image_map[idx]) > 64
img_paths.append(f'{idx}.jpg')
else:
img_paths.append(f'{i}.jpg')
tups = [(root, im, p) for p, im in zip(img_paths, images)]
pool = mp.Pool(nproc)
ret = pool.map(decode_img_omni, tups)
pool.close()
data.pop('image')
if 'image_path' not in data:
data['image_path'] = [x[0] if len(x) == 1 else x for x in ret]
return data
def LMUDataRoot():
if 'LMUData' in os.environ and osp.exists(os.environ['LMUData']):
return os.environ['LMUData']
home = osp.expanduser('~')
root = osp.join(home, 'LMUData')
os.makedirs(root, exist_ok=True)
return root
def HFCacheRoot():
cache_list = ['HUGGINGFACE_HUB_CACHE', 'HF_HOME']
for cache_name in cache_list:
if cache_name in os.environ and osp.exists(os.environ[cache_name]):
if os.environ[cache_name].split('/')[-1] == 'hub':
return os.environ[cache_name]
else:
return osp.join(os.environ[cache_name], 'hub')
home = osp.expanduser('~')
root = osp.join(home, '.cache', 'huggingface', 'hub')
os.makedirs(root, exist_ok=True)
return root
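# Usage sketch for the two root helpers above (paths shown are illustrative and
# depend on the local environment; 'LMUData' is only honored when the path exists):
#
#   >>> os.environ['LMUData'] = '/data/LMUData'
#   >>> LMUDataRoot()
#   '/data/LMUData'
#   >>> HFCacheRoot()
#   '/home/user/.cache/huggingface/hub'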
def MMBenchOfficialServer(dataset_name):
root = LMUDataRoot()
if dataset_name in ['MMBench', 'MMBench_V11', 'MMBench_CN', 'MMBench_CN_V11']:
ans_file = f'{root}/{dataset_name}.tsv'
if osp.exists(ans_file):
data = load(ans_file)
if 'answer' in data and sum([pd.isna(x) for x in data['answer']]) == 0:
return True
if dataset_name in ['MMBench_TEST_EN', 'MMBench_TEST_CN', 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11']:
ans_file1 = f'{root}/{dataset_name}.tsv'
mapp = {
'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_CN': 'MMBench_CN',
'MMBench_TEST_EN_V11': 'MMBench_V11', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11',
}
ans_file2 = f'{root}/{mapp[dataset_name]}.tsv'
for f in [ans_file1, ans_file2]:
if osp.exists(f):
data = load(f)
if 'answer' in data and sum([pd.isna(x) for x in data['answer']]) == 0:
return True
return False
class NumpyEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
np.int16, np.int32, np.int64, np.uint8,
np.uint16, np.uint32, np.uint64)):
return int(obj)
elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
return float(obj)
elif isinstance(obj, (np.complex_, np.complex64, np.complex128)):
return {'real': obj.real, 'imag': obj.imag}
elif isinstance(obj, (np.ndarray,)):
return obj.tolist()
elif isinstance(obj, (np.bool_)):
return bool(obj)
elif isinstance(obj, (np.void)):
return None
return json.JSONEncoder.default(self, obj)
# LOAD & DUMP
def dump(data, f, **kwargs):
def dump_pkl(data, pth, **kwargs):
pickle.dump(data, open(pth, 'wb'))
def dump_json(data, pth, **kwargs):
json.dump(data, open(pth, 'w'), indent=4, ensure_ascii=False, cls=NumpyEncoder)
def dump_jsonl(data, f, **kwargs):
lines = [json.dumps(x, ensure_ascii=False, cls=NumpyEncoder) for x in data]
with open(f, 'w', encoding='utf8') as fout:
fout.write('\n'.join(lines))
def dump_xlsx(data, f, **kwargs):
data.to_excel(f, index=False, engine='xlsxwriter')
def dump_csv(data, f, quoting=csv.QUOTE_ALL):
data.to_csv(f, index=False, encoding='utf-8', quoting=quoting)
def dump_tsv(data, f, quoting=csv.QUOTE_ALL):
data.to_csv(f, sep='\t', index=False, encoding='utf-8', quoting=quoting)
handlers = dict(pkl=dump_pkl, json=dump_json, jsonl=dump_jsonl, xlsx=dump_xlsx, csv=dump_csv, tsv=dump_tsv)
suffix = f.split('.')[-1]
return handlers[suffix](data, f, **kwargs)
def load(f, fmt=None):
def load_pkl(pth):
return pickle.load(open(pth, 'rb'))
def load_json(pth):
return json.load(open(pth, 'r', encoding='utf-8'))
def load_jsonl(f):
lines = open(f, encoding='utf-8').readlines()
lines = [x.strip() for x in lines]
if lines[-1] == '':
lines = lines[:-1]
data = [json.loads(x) for x in lines]
return data
def load_xlsx(f):
return pd.read_excel(f)
def load_csv(f):
return pd.read_csv(f)
def load_tsv(f):
return pd.read_csv(f, sep='\t')
handlers = dict(pkl=load_pkl, json=load_json, jsonl=load_jsonl, xlsx=load_xlsx, csv=load_csv, tsv=load_tsv)
if fmt is not None:
return handlers[fmt](f)
suffix = f.split('.')[-1]
return handlers[suffix](f)
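# Usage sketch for `dump` / `load`: the handler is chosen from the file suffix, so
# the same calls cover pkl / json / jsonl / xlsx / csv / tsv (file names below are
# hypothetical):
#
#   >>> dump({'acc': 0.5}, 'result.json')
#   >>> load('result.json')
#   {'acc': 0.5}
#   >>> dump(pd.DataFrame({'index': [0], 'answer': ['A']}), 'data.tsv')
#   >>> list(load('data.tsv')['answer'])
#   ['A']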
def download_file(url, filename=None):
import urllib.request
from tqdm import tqdm
class DownloadProgressBar(tqdm):
def update_to(self, b=1, bsize=1, tsize=None):
if tsize is not None:
self.total = tsize
self.update(b * bsize - self.n)
if filename is None:
filename = url.split('/')[-1]
try:
with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:
urllib.request.urlretrieve(url, filename=filename, reporthook=t.update_to)
except Exception as e:
import logging
logging.warning(f'{type(e)}: {e}')
# Handle Failed Downloads from huggingface.co
if 'huggingface.co' in url:
url_new = url.replace('huggingface.co', 'hf-mirror.com')
try:
download_file(url_new, filename)
return filename
except Exception as e:
logging.warning(f'{type(e)}: {e}')
raise Exception(f'Failed to download {url}')
else:
raise Exception(f'Failed to download {url}')
return filename
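# Usage sketch: `download_file` saves to the URL basename unless `filename` is given,
# and retries huggingface.co URLs through hf-mirror.com on failure (the URL below is
# hypothetical):
#
#   >>> download_file('https://example.com/assets/demo.jpg', '/tmp/demo.jpg')
#   '/tmp/demo.jpg'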
def ls(dirname='.', match=[], mode='all', level=1):
if isinstance(level, str):
assert '+' in level
level = int(level[:-1])
res = []
for i in range(1, level + 1):
res.extend(ls(dirname, match=match, mode='file', level=i))
return res
if dirname == '.':
ans = os.listdir(dirname)
else:
ans = [osp.join(dirname, x) for x in os.listdir(dirname)]
assert mode in ['all', 'dir', 'file']
assert level >= 1 and isinstance(level, int)
if level == 1:
if isinstance(match, str):
match = [match]
for m in match:
if len(m) == 0:
continue
if m[0] != '!':
ans = [x for x in ans if m in x]
else:
ans = [x for x in ans if m[1:] not in x]
if mode == 'dir':
ans = [x for x in ans if osp.isdir(x)]
elif mode == 'file':
ans = [x for x in ans if not osp.isdir(x)]
return ans
else:
dirs = [x for x in ans if osp.isdir(x)]
res = []
for d in dirs:
res.extend(ls(d, match=match, mode=mode, level=level - 1))
return res
def mrlines(fname, sp='\n'):
f = open(fname).read().split(sp)
while f != [] and f[-1] == '':
f = f[:-1]
return f
def mwlines(lines, fname):
with open(fname, 'w') as fout:
fout.write('\n'.join(lines))
def md5(s):
hash = hashlib.new('md5')
if osp.exists(s):
with open(s, 'rb') as f:
for chunk in iter(lambda: f.read(2**20), b''):
hash.update(chunk)
else:
hash.update(s.encode('utf-8'))
return str(hash.hexdigest())
def last_modified(pth):
stamp = osp.getmtime(pth)
m_ti = time.ctime(stamp)
t_obj = time.strptime(m_ti)
t = time.strftime('%Y%m%d%H%M%S', t_obj)[2:]
return t
def parse_file(s):
if osp.exists(s) and s != '.':
assert osp.isfile(s)
suffix = osp.splitext(s)[1].lower()
mime = mimetypes.types_map.get(suffix, 'unknown')
return (mime, s)
elif s.startswith('data:image/'):
# To be compatible with OPENAI base64 format
content = s[11:]
mime = content.split(';')[0]
content = ';'.join(content.split(';')[1:])
dname = osp.join(LMUDataRoot(), 'files')
assert content.startswith('base64,')
b64 = content[7:]
os.makedirs(dname, exist_ok=True)
tgt = osp.join(dname, md5(b64) + '.png')
decode_base64_to_image_file(b64, tgt)
return parse_file(tgt)
elif validators.url(s):
suffix = osp.splitext(s)[1].lower()
if suffix in mimetypes.types_map:
mime = mimetypes.types_map[suffix]
dname = osp.join(LMUDataRoot(), 'files')
os.makedirs(dname, exist_ok=True)
tgt = osp.join(dname, md5(s) + suffix)
download_file(s, tgt)
return (mime, tgt)
else:
return ('url', s)
else:
return (None, s)
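# Usage sketch: `parse_file` classifies an input string into a (mime, value) tuple,
# downloading URLs and materializing base64 payloads under LMUDataRoot()/files
# (the values below are illustrative):
#
#   >>> parse_file('/tmp/demo.jpg')                  # existing local file
#   ('image/jpeg', '/tmp/demo.jpg')
#   >>> parse_file('What is in this image?')         # plain text
#   (None, 'What is in this image?')
#   >>> parse_file('https://example.com/a.png')[0]   # downloaded, then typed by suffix
#   'image/png'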
def file_size(f, unit='GB'):
stats = os.stat(f)
div_map = {
'GB': 2 ** 30,
'MB': 2 ** 20,
'KB': 2 ** 10,
}
return stats.st_size / div_map[unit]
def parquet_to_tsv(file_path):
data = pd.read_parquet(file_path)
pth = '/'.join(file_path.split('/')[:-1])
data_name = file_path.split('/')[-1].split('.')[0]
data.to_csv(osp.join(pth, f'{data_name}.tsv'), sep='\t', index=False)
import logging
logging.basicConfig(
format='[%(asctime)s] %(levelname)s - %(filename)s: %(funcName)s - %(lineno)d: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S')
logger_initialized = {}
def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'):
logger = logging.getLogger(name)
if name in logger_initialized:
return logger
for logger_name in logger_initialized:
if name.startswith(logger_name):
return logger
stream_handler = logging.StreamHandler()
handlers = [stream_handler]
try:
import torch.distributed as dist
if dist.is_available() and dist.is_initialized():
rank = dist.get_rank()
else:
rank = 0
except ImportError:
rank = 0
if rank == 0 and log_file is not None:
file_handler = logging.FileHandler(log_file, file_mode)
handlers.append(file_handler)
formatter = logging.Formatter(
'[%(asctime)s] %(levelname)s - %(name)s - %(filename)s: %(funcName)s - %(lineno)d: %(message)s')
for handler in handlers:
handler.setFormatter(formatter)
handler.setLevel(log_level)
logger.addHandler(handler)
if rank == 0:
logger.setLevel(log_level)
else:
logger.setLevel(logging.ERROR)
logger_initialized[name] = True
return logger
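# Usage sketch: loggers are cached by name, and only rank-0 (or non-distributed)
# processes attach the optional file handler (the log path is hypothetical):
#
#   >>> logger = get_logger('MyEval', log_file='/tmp/eval.log')
#   >>> logger.info('evaluation started')
#   >>> get_logger('MyEval') is logger
#   True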
# flake8: noqa: F401, F403
import abc
import argparse
import csv
import multiprocessing as mp
import os
import os.path as osp
from pathlib import Path
import copy as cp
import random as rd
import requests
import shutil
import subprocess
import warnings
import pandas as pd
from collections import OrderedDict, defaultdict
from multiprocessing import Pool, current_process
from tqdm import tqdm
import datetime
import matplotlib.pyplot as plt
from tabulate import tabulate
from json import JSONDecoder
from huggingface_hub import scan_cache_dir
from huggingface_hub.utils._cache_manager import _scan_cached_repo
from sty import fg, bg, ef, rs
def modelscope_flag_set():
return os.environ.get('VLMEVALKIT_USE_MODELSCOPE', None) in ['1', 'True']
def process_punctuation(inText):
import re
outText = inText
punct = [
';', r'/', '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-',
'>', '<', '@', '`', ',', '?', '!'
]
commaStrip = re.compile('(\d)(,)(\d)') # noqa: W605
periodStrip = re.compile('(?!<=\d)(\.)(?!\d)') # noqa: W605
for p in punct:
if (p + ' ' in inText or ' ' + p in inText) or (re.search(
commaStrip, inText) is not None):
outText = outText.replace(p, '')
else:
outText = outText.replace(p, ' ')
outText = periodStrip.sub('', outText, re.UNICODE)
return outText
def h2r(value):
if value[0] == '#':
value = value[1:]
assert len(value) == 6
return tuple(int(value[i:i + 2], 16) for i in range(0, 6, 2))
def r2h(rgb):
return '#%02x%02x%02x' % rgb
def colored(s, color):
if isinstance(color, str):
if hasattr(fg, color):
return getattr(fg, color) + s + fg.rs
color = h2r(color)
return fg(*color) + s + fg.rs
def istype(s, type):
if isinstance(s, type):
return True
try:
return isinstance(eval(s), type)
except Exception as _:
return False
def bincount(lst):
bins = defaultdict(lambda: 0)
for item in lst:
bins[item] += 1
return bins
def get_cache_path(repo_id, branch='main', repo_type='datasets'):
try:
if modelscope_flag_set():
from modelscope.hub.file_download import create_temporary_directory_and_cache
if repo_type == 'datasets':
repo_type = 'dataset'
_, cache = create_temporary_directory_and_cache(model_id=repo_id, repo_type=repo_type)
cache_path = cache.get_root_location()
return cache_path
else:
from .file import HFCacheRoot
cache_path = HFCacheRoot()
org, repo_name = repo_id.split('/')
repo_path = Path(osp.join(cache_path, f'{repo_type}--{org}--{repo_name}/'))
hf_cache_info = _scan_cached_repo(repo_path=repo_path)
revs = {r.refs: r for r in hf_cache_info.revisions}
if branch is not None:
revs = {refs: r for refs, r in revs.items() if branch in refs}
rev2keep = max(revs.values(), key=lambda r: r.last_modified)
return str(rev2keep.snapshot_path)
except Exception as e:
import logging
logging.warning(f'{type(e)}: {e}')
return None
def proxy_set(s):
import os
for key in ['http_proxy', 'HTTP_PROXY', 'https_proxy', 'HTTPS_PROXY']:
os.environ[key] = s
def get_rank_and_world_size():
rank = int(os.environ.get('RANK', 0))
world_size = int(os.environ.get('WORLD_SIZE', 1))
return rank, world_size
def splitlen(s, sym='/'):
return len(s.split(sym))
def listinstr(lst, s):
assert isinstance(lst, list)
for item in lst:
if item in s:
return True
return False
def d2df(D):
return pd.DataFrame({x: [D[x]] for x in D})
def cn_string(s):
import re
if re.search(u'[\u4e00-\u9fff]', s):
return True
return False
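# Quick examples for the small helpers above (illustrative values):
#
#   >>> splitlen('a/b/c')
#   3
#   >>> listinstr(['MMBench', 'MME'], 'MMBench_DEV_EN')
#   True
#   >>> d2df({'model': 'demo', 'acc': 0.5}).shape
#   (1, 2)
#   >>> cn_string('你好')
#   True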
try:
import decord
except ImportError:
pass
def timestr(granularity='second'):
s = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
assert granularity in ['second', 'minute', 'hour', 'day']
if granularity == 'second':
return s
elif granularity == 'minute':
return s[:-2]
elif granularity == 'hour':
return s[:-4]
elif granularity == 'day':
return s[:-6]
def _minimal_ext_cmd(cmd, cwd=None):
env = {}
for k in ['SYSTEMROOT', 'PATH', 'HOME']:
v = os.environ.get(k)
if v is not None:
env[k] = v
env['LANGUAGE'] = 'C'
env['LANG'] = 'C'
env['LC_ALL'] = 'C'
out = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env, cwd=cwd).communicate()[0]
return out
def githash(fallback='unknown', digits=8):
if digits is not None and not isinstance(digits, int):
raise TypeError('digits must be None or an integer')
try:
import vlmeval
except ImportError as e:
import logging
logging.error(f'ImportError: {str(e)}')
return fallback
try:
out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'], cwd=vlmeval.__path__[0])
sha = out.strip().decode('ascii')
if digits is not None:
sha = sha[:digits]
except OSError:
sha = fallback
return sha
def dict_merge(dct, merge_dct):
for k, _ in merge_dct.items():
if (k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], dict)): #noqa
dict_merge(dct[k], merge_dct[k])
else:
dct[k] = merge_dct[k]
def youtube_dl(idx):
cmd = f'youtube-dl -f best -f mp4 "{idx}" -o {idx}.mp4'
os.system(cmd)
def run_command(cmd):
if isinstance(cmd, str):
cmd = cmd.split()
return subprocess.check_output(cmd).decode()
def load_env():
import logging
logging.basicConfig(
format='[%(asctime)s] %(levelname)s - %(filename)s: %(funcName)s - %(lineno)d: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S')
try:
import vlmeval
except ImportError:
logging.error('VLMEval is not installed. Failed to import environment variables from .env file. ')
return
pth = osp.realpath(vlmeval.__path__[0])
pth = osp.join(pth, '../.env')
pth = osp.realpath(pth)
if not osp.exists(pth):
logging.error(f'Did not detect the .env file at {pth}, failed to load. ')
return
from dotenv import dotenv_values
values = dotenv_values(pth)
for k, v in values.items():
if v is not None and len(v):
os.environ[k] = v
logging.info(f'API Keys successfully loaded from {pth}')
def pip_install_robust(package):
import sys
retry = 3
while retry > 0:
try:
package_base = package.split('=')[0]
module = __import__(package_base)  # import by the bare name, without any version pin
return True
except ImportError:
subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
retry -= 1
return False
def version_cmp(v1, v2, op='eq'):
from packaging import version
import operator
op_func = getattr(operator, op)
return op_func(version.parse(v1), version.parse(v2))
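# Usage sketch: `op` is mapped onto the `operator` module, so any of
# eq / ne / lt / le / gt / ge works:
#
#   >>> version_cmp('4.37.0', '4.33.0', 'ge')
#   True
#   >>> version_cmp('0.9.1', '1.0.0', 'lt')
#   True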
def toliststr(s):
if isinstance(s, str) and (s[0] == '[') and (s[-1] == ']'):
return [str(x) for x in eval(s)]
elif isinstance(s, str):
return [s]
elif isinstance(s, list):
return [str(x) for x in s]
raise NotImplementedError
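# Usage sketch: `toliststr` normalizes image / image_path fields into a list of strings:
#
#   >>> toliststr("['a.jpg', 'b.jpg']")   # stringified list
#   ['a.jpg', 'b.jpg']
#   >>> toliststr('a.jpg')
#   ['a.jpg']
#   >>> toliststr(['a.jpg', 1])
#   ['a.jpg', '1']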
def extract_json_objects(text, decoder=JSONDecoder()):
pos = 0
while True:
match = text.find('{', pos)
if match == -1: break
try:
result, index = decoder.raw_decode(text[match:])
yield result
pos = match + index
except ValueError:
pos = match + 1
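# Usage sketch: `extract_json_objects` is a generator that scans free-form text
# (e.g. a model response) for embedded JSON objects:
#
#   >>> list(extract_json_objects('The answer is {"option": "A"}, as required.'))
#   [{'option': 'A'}]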
def get_gpu_memory():
import subprocess
try:
command = "rocm-smi --showmeminfo vram"
output = subprocess.check_output(command.split(), stderr=subprocess.STDOUT)
memory_info = output.decode('ascii').split('\n')
memory_free_values = []
for line in memory_info:
if "vram Total Memory" in line:
total_memory = int(line.split(":")[-1].strip().split()[0])  # total VRAM reported by rocm-smi
elif "vram Total Used Memory" in line:
used_memory = int(line.split(":")[-1].strip().split()[0])  # VRAM currently in use
free_memory = total_memory - used_memory  # remaining free VRAM
memory_free_values.append(free_memory)
return memory_free_values
#memory_free_info = subprocess.check_output(command.split()).decode('ascii').split('\n')[:-1][1:]
#memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
#return memory_free_values
except Exception as e:
print(f'{type(e)}: {str(e)}')
return []
def auto_split_flag():
flag = os.environ.get('AUTO_SPLIT', '0')
return flag == '1'
import os
import io
import pandas as pd
import numpy as np
import string
from uuid import uuid4
import os.path as osp
import base64
from PIL import Image
import sys
Image.MAX_IMAGE_PIXELS = 1e9
def rescale_img(img, tgt=None):
assert isinstance(tgt, tuple) and -1 in tgt
w, h = img.size
if tgt[0] != -1:
new_w, new_h = tgt[0], int(tgt[0] / w * h)
elif tgt[1] != -1:
new_w, new_h = int(tgt[1] / h * w), tgt[1]
img = img.resize((new_w, new_h))
return img
def concat_images_vlmeval(images, target_size=-1, mode='h', return_image=False):
from .file import md5
ims = [Image.open(im) for im in images]
if target_size != -1:
ims = [
rescale_img(im, (-1, target_size) if mode == 'h' else (target_size, -1))
for im in ims
]
ws, hs = [x.width for x in ims], [x.height for x in ims]
if mode == 'h':
new_w, new_h = sum(ws), max(hs)
dst = Image.new('RGB', (new_w, new_h))
for i, im in enumerate(ims):
dst.paste(im, (sum(ws[:i]), 0))
elif mode == 'v':
new_w, new_h = max(ws), sum(hs)
dst = Image.new('RGB', (new_w, new_h))
for i, im in enumerate(ims):
dst.paste(im, (0, sum(hs[:i])))  # offset along the vertical axis
if return_image:
return dst
else:
_str = '\n'.join(images)
str_md5 = md5(_str)
tgt = osp.join('/tmp', str_md5 + '.jpg')
dst.save(tgt)
return tgt
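# Usage sketch: stitch images horizontally (mode='h') or vertically (mode='v');
# by default the result is written to /tmp/<md5>.jpg and the path is returned
# (file names below are hypothetical):
#
#   >>> pth = concat_images_vlmeval(['a.jpg', 'b.jpg'], target_size=512, mode='h')
#   >>> Image.open(pth).height   # every image is rescaled to the target height first
#   512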
def mmqa_display(question, target_size=512):
question = {k.lower(): v for k, v in question.items()}
keys = list(question.keys())
keys = [k for k in keys if k not in ['index', 'image']]
images = question['image']
if isinstance(images, str):
images = [images]
idx = question.pop('index', 'XXX')
print(f'INDEX: {idx}')
for im in images:
image = decode_base64_to_image(im, target_size=target_size)
display(image) # noqa: F821
for k in keys:
try:
if not pd.isna(question[k]):
print(f'{k.upper()}. {question[k]}')
except ValueError:
if False in pd.isna(question[k]):
print(f'{k.upper()}. {question[k]}')
def encode_image_to_base64(img, target_size=-1, fmt='JPEG'):
# if target_size == -1, will not do resizing
# else, will set the max size to (target_size, target_size)
if img.mode in ('RGBA', 'P'):
img = img.convert('RGB')
if target_size > 0:
img.thumbnail((target_size, target_size))
img_buffer = io.BytesIO()
img.save(img_buffer, format=fmt)
image_data = img_buffer.getvalue()
ret = base64.b64encode(image_data).decode('utf-8')
return ret
def encode_image_file_to_base64(image_path, target_size=-1):
image = Image.open(image_path)
return encode_image_to_base64(image, target_size=target_size)
def decode_base64_to_image(base64_string, target_size=-1):
image_data = base64.b64decode(base64_string)
image = Image.open(io.BytesIO(image_data))
if image.mode in ('RGBA', 'P'):
image = image.convert('RGB')
if target_size > 0:
image.thumbnail((target_size, target_size))
return image
def decode_base64_to_image_file(base64_string, image_path, target_size=-1):
image = decode_base64_to_image(base64_string, target_size=target_size)
image.save(image_path)
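# Usage sketch: round-trip an image through base64 (paths are hypothetical);
# `target_size` caps the longer side via PIL's thumbnail:
#
#   >>> b64 = encode_image_file_to_base64('demo.jpg', target_size=512)
#   >>> decode_base64_to_image_file(b64, 'demo_copy.jpg')
#   >>> max(Image.open('demo_copy.jpg').size) <= 512
#   True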
def build_option_str(option_dict):
s = 'There are several options: \n'
for c, content in option_dict.items():
if not pd.isna(content):
s += f'{c}. {content}\n'
return s
def isimg(s):
return osp.exists(s) or s.startswith('http')
def read_ok(img_path):
if not osp.exists(img_path):
return False
try:
im = Image.open(img_path)
assert im.size[0] > 0 and im.size[1] > 0
return True
except:
return False
def gpt_key_set():
openai_key = os.environ.get('OPENAI_API_KEY', None)
return isinstance(openai_key, str) and openai_key.startswith('sk-')
def apiok(wrapper):
s = wrapper.generate('Hello!')
return wrapper.fail_msg not in s
def circular_pred(df, extract_func=None):
if extract_func is None:
extract_func = lambda x: x # noqa: E731
df = df.sort_values('index')
from vlmeval.utils import can_infer_option
shift = int(1e6)
choices = [extract_func(x) for x in df['prediction']]
pred_map = {i: c for i, c in zip(df['index'], choices)}
flag_map = {i: True for i in pred_map if i < 1e6}
valid_map = {i: True for i in pred_map if i < 1e6}
for i in df['index']:
if i >= shift and pred_map[i] and pred_map[i - shift]:
if pred_map[i] not in list(
string.ascii_uppercase
) or pred_map[ # noqa: W504
i - shift
] not in list(
string.ascii_uppercase
):
valid_map[i % shift] = False
continue
if (ord(pred_map[i]) - ord(pred_map[i - shift])) % 4 == 1:
continue
else:
flag_map[i % shift] = False
flag_map = {k: v for k, v in flag_map.items() if valid_map[k]}
flags = list(flag_map.values())
return np.mean(flags)
import sys
from vlmeval.dataset import SUPPORTED_DATASETS
from vlmeval.config import *
from vlmeval.smp import *
# Define valid modes
MODES = ('dlist', 'mlist', 'missing', 'circular', 'localize', 'check', 'run', 'eval')
CLI_HELP_MSG = \
f"""
Arguments received: {str(['vlmutil'] + sys.argv[1:])}. vlmutil commands use the following syntax:
vlmutil MODE MODE_ARGS
Where MODE (required) is one of {MODES}
MODE_ARG (optional) is the argument for specific mode
Some example usages of vlmutil commands: (See more by using -h for a specific command!)
1. List all the datasets at a given level: l1, l2, l3, etc.:
vlmutil dlist [l1/l2/l3/...]
2. List all the models in a given category: 4.33.0, 4.37.0, api, etc.:
vlmutil mlist 4.33.0 [all/small/large]
3. Report missing results:
vlmutil missing [l1/l2/l3/...]
4. Create circular questions (only for multiple-choice questions with no more than 4 choices):
vlmutil circular input.tsv
5. Create a localized version of the dataset (for very large tsv files):
vlmutil localize input.tsv
6. Check the validity of a model:
vlmutil check [model_name/model_series]
7. Run evaluation for missing results:
vlmutil run l2 hf
8. Evaluate data file:
vlmutil eval [dataset_name] [prediction_file]
GitHub: https://github.com/open-compass/VLMEvalKit
""" # noqa: E501
dataset_levels = {
'l1': [
('MMVet', 'gpt-4-turbo_score.csv'), ('MMMU_DEV_VAL', 'acc.csv'),
('MathVista_MINI', 'gpt-4-turbo_score.csv'), ('HallusionBench', 'score.csv'),
('OCRBench', 'score.json'), ('AI2D_TEST', 'acc.csv'), ('MMStar', 'acc.csv'),
('MMBench_V11', 'acc.csv'), ('MMBench_CN_V11', 'acc.csv')
],
'l2': [
('MME', 'score.csv'), ('LLaVABench', 'score.csv'), ('RealWorldQA', 'acc.csv'),
('MMBench', 'acc.csv'), ('MMBench_CN', 'acc.csv'), ('CCBench', 'acc.csv'),
('SEEDBench_IMG', 'acc.csv'), ('COCO_VAL', 'score.json'), ('POPE', 'score.csv'),
('ScienceQA_VAL', 'acc.csv'), ('ScienceQA_TEST', 'acc.csv'), ('MMT-Bench_VAL', 'acc.csv'),
('SEEDBench2_Plus', 'acc.csv'), ('BLINK', 'acc.csv'), ('MTVQA_TEST', 'acc.json'),
('Q-Bench1_VAL', 'acc.csv'), ('A-Bench_VAL', 'acc.csv'), ('R-Bench-Dis', 'acc.csv'),
('MathVision', 'score.csv'), ('MathVerse_MINI_Vision_Only', 'score.csv'), ('DynaMath', 'score.csv'),
],
'l3': [
('OCRVQA_TESTCORE', 'acc.csv'), ('TextVQA_VAL', 'acc.csv'),
('ChartQA_TEST', 'acc.csv'), ('DocVQA_VAL', 'acc.csv'), ('InfoVQA_VAL', 'acc.csv'),
('SEEDBench2', 'acc.csv')
]
}
dataset_levels['l12'] = dataset_levels['l1'] + dataset_levels['l2']
dataset_levels['l23'] = dataset_levels['l2'] + dataset_levels['l3']
dataset_levels['l123'] = dataset_levels['l12'] + dataset_levels['l3']
models = {
'4.33.0': list(qwen_series) + list(xcomposer_series) + [
'mPLUG-Owl2', 'flamingov2', 'VisualGLM_6b', 'MMAlaya', 'PandaGPT_13B', 'VXVERSE'
] + list(idefics_series) + list(minigpt4_series) + list(instructblip_series),
'4.37.0': [x for x in llava_series if 'next' not in x] + list(internvl_series) + [
'TransCore_M', 'emu2_chat', 'MiniCPM-V', 'MiniCPM-V-2', 'OmniLMM_12B',
'cogvlm-grounding-generalist', 'cogvlm-chat', 'cogvlm2-llama3-chat-19B',
'mPLUG-Owl3'
] + list(xtuner_series) + list(yivl_series) + list(deepseekvl_series) + list(janus_series) + list(cambrian_series),
'4.36.2': ['Moondream1'],
'4.40.0': [
'idefics2_8b', 'Bunny-llama3-8B', 'MiniCPM-Llama3-V-2_5', '360VL-70B', 'Phi-3-Vision',
] + list(wemm_series),
'4.44.0': ['Moondream2'],
'4.45.0': ['Aria'],
'latest': ['paligemma-3b-mix-448', 'MiniCPM-V-2_6', 'glm-4v-9b'] + [x for x in llava_series if 'next' in x]
+ list(chameleon_series) + list(ovis_series) + list(mantis_series),
'api': list(api_models)
}
# SKIP_MODELS will be skipped in report_missing and run APIs
SKIP_MODELS = [
'MGM_7B', 'GPT4V_HIGH', 'GPT4V', 'flamingov2', 'PandaGPT_13B',
'GeminiProVision', 'Step1V-0701', 'SenseChat-5-Vision',
'llava_v1_7b', 'sharegpt4v_7b', 'sharegpt4v_13b',
'llava-v1.5-7b-xtuner', 'llava-v1.5-13b-xtuner',
'cogvlm-grounding-generalist', 'InternVL-Chat-V1-1',
'InternVL-Chat-V1-2', 'InternVL-Chat-V1-2-Plus', 'RekaCore',
'llava_next_72b', 'llava_next_110b', 'MiniCPM-V', 'sharecaptioner', 'XComposer',
'VisualGLM_6b', 'idefics_9b_instruct', 'idefics_80b_instruct',
'mPLUG-Owl2', 'MMAlaya', 'OmniLMM_12B', 'emu2_chat', 'VXVERSE'
] + list(minigpt4_series) + list(instructblip_series) + list(xtuner_series) + list(chameleon_series) + list(vila_series)
LARGE_MODELS = [
'idefics_80b_instruct', '360VL-70B', 'emu2_chat', 'InternVL2-76B',
]
def completed(m, d, suf):
score_file = f'outputs/{m}/{m}_{d}_{suf}'
if osp.exists(score_file):
return True
if d == 'MMBench':
s1, s2 = f'outputs/{m}/{m}_MMBench_DEV_EN_{suf}', f'outputs/{m}/{m}_MMBench_TEST_EN_{suf}'
return osp.exists(s1) and osp.exists(s2)
elif d == 'MMBench_CN':
s1, s2 = f'outputs/{m}/{m}_MMBench_DEV_CN_{suf}', f'outputs/{m}/{m}_MMBench_TEST_CN_{suf}'
return osp.exists(s1) and osp.exists(s2)
return False
def DLIST(lvl):
if lvl in dataset_levels.keys():
return [x[0] for x in dataset_levels[lvl]]
else:
from vlmeval.dataset import SUPPORTED_DATASETS
return SUPPORTED_DATASETS
def MLIST(lvl, size='all'):
if lvl == 'all':
from vlmeval.config import supported_VLM
return [x for x in supported_VLM]
model_list = models[lvl]
if size == 'small':
model_list = [m for m in model_list if m not in LARGE_MODELS]
elif size == 'large':
model_list = [m for m in model_list if m in LARGE_MODELS]
return model_list
def MISSING(lvl):
from vlmeval.config import supported_VLM
models = list(supported_VLM)
models = [m for m in models if m not in SKIP_MODELS and osp.exists(osp.join('outputs', m))]
if lvl in dataset_levels.keys():
data_list = dataset_levels[lvl]
else:
data_list = [(D, suff) for (D, suff) in dataset_levels['l123'] if D == lvl]
missing_list = []
for f in models:
for D, suff in data_list:
if not completed(f, D, suff):
missing_list.append((f, D))
return missing_list
def CIRCULAR(inp):
assert inp.endswith('.tsv')
data = load(inp)
OFFSET = 1e6
while max(data['index']) >= OFFSET:
OFFSET *= 10
assert 'E' not in data, 'Currently build_circular only works for up to 4-choice questions'
data_2c = data[pd.isna(data['C'])]
data_3c = data[~pd.isna(data['C']) & pd.isna(data['D'])]
data_4c = data[~pd.isna(data['D'])]
map_2c = [('AB', 'BA')]
map_3c = [('ABC', 'BCA'), ('ABC', 'CAB')]
map_4c = [('ABCD', 'BCDA'), ('ABCD', 'CDAB'), ('ABCD', 'DABC')]
def okn(o, n=4):
ostr = o.replace(',', ' ')
osplits = ostr.split()
if sum([c in osplits for c in string.ascii_uppercase[:n - 1]]) == n - 1:
return False
olower = o.lower()
olower = olower.replace(',', ' ')
olower_splits = olower.split()
if 'all' in olower_splits or 'none' in olower_splits:
return False
return True
yay4, nay4 = [], []
lt4 = len(data_4c)
for i in range(lt4):
if okn(data_4c.iloc[i]['D'], 4):
yay4.append(i)
else:
nay4.append(i)
data_4c_y = data_4c.iloc[yay4]
data_4c_n = data_4c.iloc[nay4]
data_3c = pd.concat([data_4c_n, data_3c])
yay3, nay3 = [], []
lt3 = len(data_3c)
for i in range(lt3):
if okn(data_3c.iloc[i]['C'], 3):
yay3.append(i)
else:
nay3.append(i)
data_3c_y = data_3c.iloc[yay3]
data_3c_n = data_3c.iloc[nay3]
data_2c = pd.concat([data_3c_n, data_2c])
def remap(data_in, tup, off):
off = int(off)
data = data_in.copy()
char_map = {k: v for k, v in zip(*tup)}
idx = data.pop('index')
answer = data.pop('answer')
answer_new = [char_map[x] if x in char_map else x for x in answer]
data['answer'] = answer_new
options = {}
for c in char_map:
options[char_map[c]] = data.pop(c)
for c in options:
data[c] = options[c]
data.pop('image')
data['image'] = idx
idx = [x + off for x in idx]
data['index'] = idx
return data
data_all = pd.concat([
data_2c,
data_3c_y,
data_4c_y,
remap(data_2c, map_2c[0], OFFSET),
remap(data_3c_y, map_3c[0], OFFSET),
remap(data_4c_y, map_4c[0], OFFSET),
remap(data_3c_y, map_3c[1], OFFSET * 2),
remap(data_4c_y, map_4c[1], OFFSET * 2),
remap(data_4c_y, map_4c[2], OFFSET * 3),
])
tgt_file = inp.replace('.tsv', '_CIRC.tsv')
dump(data_all, tgt_file)
print(f'The circularized data is saved to {tgt_file}')
assert osp.exists(tgt_file)
print(f'The MD5 for the circularized data is {md5(tgt_file)}')
PTH = osp.realpath(__file__)
IMAGE_PTH = osp.join(osp.dirname(PTH), '../assets/apple.jpg')
msg1 = [
IMAGE_PTH,
'What is in this image?'
]
msg2 = [
dict(type='image', value=IMAGE_PTH),
dict(type='text', value='What is in this image?')
]
msg3 = [
IMAGE_PTH,
IMAGE_PTH,
'How many apples are there in these images?'
]
msg4 = [
dict(type='image', value=IMAGE_PTH),
dict(type='image', value=IMAGE_PTH),
dict(type='text', value='How many apples are there in these images?')
]
def CHECK(val):
if val in supported_VLM:
model = supported_VLM[val]()
print(f'Model: {val}')
for i, msg in enumerate([msg1, msg2, msg3, msg4]):
if i > 1 and not model.INTERLEAVE:
continue
res = model.generate(msg)
print(f'Test {i + 1}: {res}')
elif val in models:
model_list = models[val]
for m in model_list:
CHECK(m)
def LOCALIZE(fname, new_fname=None):
if new_fname is None:
new_fname = fname.replace('.tsv', '_local.tsv')
base_name = osp.basename(fname)
dname = osp.splitext(base_name)[0]
data = load(fname)
data_new = localize_df(data, dname)
dump(data_new, new_fname)
print(f'The localized version of the data file is {new_fname}')
return new_fname
def RUN(lvl, model):
import torch
NGPU = torch.cuda.device_count()
SCRIPT = osp.join(osp.dirname(__file__), '../run.py')
logger = get_logger('Run Missing')
def get_env(name):
assert name in ['433', '437', '440', 'latest']
load_env()
env_key = f'ENV_{name}'
return os.environ.get(env_key, None)
missing = MISSING(lvl)
if model == 'all':
pass
elif model == 'api':
missing = [x for x in missing if x[0] in models['api']]
elif model == 'hf':
missing = [x for x in missing if x[0] not in models['api']]
elif model in models:
missing = [x for x in missing if x[0] in models[model]]
elif model in supported_VLM:
missing = [x for x in missing if x[0] == model]
else:
warnings.warn(f'Invalid model {model}.')
missing.sort(key=lambda x: x[0])
groups = defaultdict(list)
for m, D in missing:
groups[m].append(D)
for m in groups:
if m in SKIP_MODELS:
continue
for dataset in groups[m]:
logger.info(f'Running {m} on {dataset}')
exe = 'python' if m in LARGE_MODELS or m in models['api'] else 'torchrun'
if m not in models['api']:
env = None
env = 'latest' if m in models['latest'] else env
env = '433' if m in models['4.33.0'] else env
env = '437' if m in models['4.37.0'] else env
env = '440' if m in models['4.40.0'] else env
if env is None:
# Not found, default to latest
env = 'latest'
logger.warning(
f"Model {m} does not have a specific environment configuration. Defaulting to 'latest'.")
pth = get_env(env)
if pth is not None:
exe = osp.join(pth, 'bin', exe)
else:
logger.warning(f'Cannot find the env path {env} for model {m}')
if exe.endswith('torchrun'):
cmd = f'{exe} --nproc-per-node={NGPU} {SCRIPT} --model {m} --data {dataset}'
elif exe.endswith('python'):
cmd = f'{exe} {SCRIPT} --model {m} --data {dataset}'
os.system(cmd)
def EVAL(dataset_name, data_file, **kwargs):
from vlmeval.dataset import build_dataset
logger = get_logger('VLMEvalKit Tool-Eval')
dataset = build_dataset(dataset_name)
# Set the judge kwargs first before evaluation or dumping
judge_kwargs = {'nproc': 4, 'verbose': True}
if 'model' not in kwargs:
if dataset.TYPE in ['MCQ', 'Y/N']:
judge_kwargs['model'] = 'chatgpt-0125'
elif listinstr(['MMVet', 'LLaVABench', 'MMBench-Video'], dataset_name):
judge_kwargs['model'] = 'gpt-4-turbo'
elif listinstr(['MMLongBench', 'MMDU'], dataset_name):
judge_kwargs['model'] = 'gpt-4o'
elif listinstr(['DynaMath', 'MathVerse', 'MathVista', 'MathVision'], dataset_name):
judge_kwargs['model'] = 'gpt-4o-mini'
else:
judge_kwargs['model'] = kwargs['model']
judge_kwargs['nproc'] = kwargs.get('nproc', 4)
eval_results = dataset.evaluate(data_file, **judge_kwargs)
if eval_results is not None:
assert isinstance(eval_results, dict) or isinstance(eval_results, pd.DataFrame)
logger.info('Evaluation Results:')
if isinstance(eval_results, dict):
logger.info('\n' + json.dumps(eval_results, indent=4))
elif isinstance(eval_results, pd.DataFrame):
logger.info('\n')
logger.info(tabulate(eval_results.T) if len(eval_results) < len(eval_results.columns) else eval_results)
return eval_results
def parse_args_eval():
parser = argparse.ArgumentParser()
# Essential Args, Setting the Names of Datasets and Models
parser.add_argument('cmd', type=str)
parser.add_argument('data_file', type=str)
parser.add_argument('--judge', type=str, default=None)
parser.add_argument('--nproc', type=int, default=4)
parser.add_argument('--retry', type=int, default=None)
args = parser.parse_args()
return args
def cli():
logger = get_logger('VLMEvalKit Tools')
args = sys.argv[1:]
if not args: # no arguments passed
logger.info(CLI_HELP_MSG)
return
if args[0].lower() in MODES:
if args[0].lower() == 'dlist':
assert len(args) >= 2
lst = DLIST(args[1])
print(' '.join(lst))
elif args[0].lower() == 'mlist':
assert len(args) >= 2
size = 'all'
if len(args) > 2:
size = args[2].lower()
lst = MLIST(args[1], size)
print('\n'.join(lst))
elif args[0].lower() == 'missing':
assert len(args) >= 2
missing_list = MISSING(args[1])
logger = get_logger('Find Missing')
logger.info(colored(f'Level {args[1]} Missing Results: ', 'red'))
lines = []
for m, D in missing_list:
line = f'Model {m}, Dataset {D}'
logger.info(colored(line, 'red'))
lines.append(line)
mwlines(lines, f'{args[1]}_missing.txt')
elif args[0].lower() == 'circular':
assert len(args) >= 2
CIRCULAR(args[1])
elif args[0].lower() == 'localize':
assert len(args) >= 2
LOCALIZE(args[1])
elif args[0].lower() == 'check':
assert len(args) >= 2
model_list = args[1:]
for m in model_list:
CHECK(m)
elif args[0].lower() == 'run':
assert len(args) >= 2
lvl = args[1]
if len(args) == 2:
model = 'all'
RUN(lvl, model)
else:
for model in args[2:]:
RUN(lvl, model)
elif args[0].lower() == 'eval':
args = parse_args_eval()
data_file = args.data_file
def extract_dataset(file_name):
fname = osp.splitext(file_name)[0].split('/')[-1]
parts = fname.split('_')
for i in range(len(parts)):
if '_'.join(parts[i:]) in SUPPORTED_DATASETS:
return '_'.join(parts[i:])
return None
dataset = extract_dataset(data_file)
assert dataset is not None, f'Cannot infer dataset name from {data_file}'
kwargs = {'nproc': args.nproc}
if args.judge is not None:
kwargs['model'] = args.judge
if args.retry is not None:
kwargs['retry'] = args.retry
EVAL(dataset_name=dataset, data_file=data_file, **kwargs)
else:
logger.error('WARNING: command error!')
logger.info(CLI_HELP_MSG)
return
from .matching_util import can_infer, can_infer_option, can_infer_text
from .mp_util import track_progress_rich
__all__ = [
'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich',
]
import string
import copy as cp
import os
from ..smp import *
def can_infer_option(answer, choices):
verbose = os.environ.get('VERBOSE', 0)
# Choices is a dictionary
if 'Failed to obtain answer via API' in answer:
return False
reject_to_answer = [
"Sorry, I can't help with images of people yet.",
"I can't process this file.",
"I'm sorry, but without the image provided",
'Cannot determine the answer'
]
for err in reject_to_answer:
if err in answer:
return 'Z'
def count_choice(splits, choices, prefix='', suffix=''):
cnt = 0
for c in choices:
if prefix + c + suffix in splits:
cnt += 1
return cnt
answer_mod = cp.copy(answer)
chars = '.()[],:;!*#{}'
for c in chars:
answer_mod = answer_mod.replace(c, ' ')
splits = [x.strip() for x in answer_mod.split()]
count = count_choice(splits, choices)
if count == 1:
for ch in choices:
if 'A' in splits and len(splits) > 3 and verbose:
logger = get_logger('Evaluation')
logger.info(f'A might be a quantifier in the string: {answer}.')
return False
if ch in splits:
return ch
elif count == 0 and count_choice(splits, {'Z', ''}) == 1:
return 'Z'
return False
def can_infer_text(answer, choices):
answer = answer.lower()
assert isinstance(choices, dict)
for k in choices:
assert k in string.ascii_uppercase
choices[k] = str(choices[k]).lower()
cands = []
for k in choices:
if choices[k] in answer:
cands.append(k)
if len(cands) == 1:
return cands[0]
return False
def can_infer(answer, choices):
answer = str(answer)
copt = can_infer_option(answer, choices)
return copt if copt else can_infer_text(answer, choices)
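# Usage sketch: `can_infer` first tries option-letter matching, then falls back to
# matching the option text; it returns the inferred letter, or False when nothing
# can be inferred:
#
#   >>> choices = {'A': 'cat', 'B': 'dog', 'C': 'bird'}
#   >>> can_infer('The answer is B.', choices)
#   'B'
#   >>> can_infer('It is a dog.', choices)
#   'B'
#   >>> can_infer('Not sure.', choices)
#   False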
from multiprocessing import Pool
import os
from typing import Callable, Iterable, Sized
from rich.progress import (BarColumn, MofNCompleteColumn, Progress, Task,
TaskProgressColumn, TextColumn, TimeRemainingColumn)
from rich.text import Text
import os.path as osp
import time
import portalocker
from ..smp import load, dump
def track_progress_rich(
func: Callable,
tasks: Iterable = tuple(),
nproc: int = 1,
save=None,
keys=None,
**kwargs) -> list:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
if save is not None:
assert osp.exists(osp.dirname(save)) or osp.dirname(save) == ''
if not osp.exists(save):
dump({}, save)
if keys is not None:
assert len(keys) == len(tasks)
if not callable(func):
raise TypeError('func must be a callable object')
if not isinstance(tasks, Iterable):
raise TypeError(
f'tasks must be an iterable object, but got {type(tasks)}')
assert nproc > 0, 'nproc must be a positive number'
res = load(save) if save is not None else {}
results = [None for _ in range(len(tasks))]
with ThreadPoolExecutor(max_workers=nproc) as executor:
futures = []
for inputs in tasks:
if not isinstance(inputs, (tuple, list, dict)):
inputs = (inputs, )
if isinstance(inputs, dict):
future = executor.submit(func, **inputs)
else:
future = executor.submit(func, *inputs)
futures.append(future)
unfinished = set(range(len(tasks)))
pbar = tqdm(total=len(unfinished))
while len(unfinished):
new_finished = set()
for idx in unfinished:
if futures[idx].done():
results[idx] = futures[idx].result()
new_finished.add(idx)
if keys is not None:
res[keys[idx]] = results[idx]
if len(new_finished):
if save is not None:
dump(res, save)
pbar.update(len(new_finished))
for k in new_finished:
unfinished.remove(k)
time.sleep(0.1)
pbar.close()
if save is not None:
dump(res, save)
return results
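# Usage sketch: run `func` over a list of argument tuples with a thread pool,
# optionally checkpointing results to a pickle file keyed by `keys`
# (the save path is hypothetical):
#
#   >>> def add(a, b):
#   ...     return a + b
#   >>> track_progress_rich(add, [(1, 2), (3, 4)], nproc=2,
#   ...                     save='/tmp/partial.pkl', keys=['x', 'y'])
#   [3, 7]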
from ..smp import *
from ..dataset.utils.judge_util import build_judge
from ..dataset.utils.multiple_choice import extract_answer_from_item
from .matching_util import can_infer
from .mp_util import track_progress_rich
def MMMU_result_transfer(result_path):
res = {}
result_data = load(result_path)
mcq = result_data['A'].notna()
lt = len(result_data)
for i in range(lt):
line = result_data.iloc[i]
if mcq[i]:
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
prediction = line['prediction']
infer_prediction = can_infer(prediction, options)
res[line['id']] = infer_prediction
else:
res[line['id']] = line['prediction']
result_json = result_path.replace('.xlsx', '.json')
dump(res, result_json)
return result_json
def MMTBench_result_transfer(eval_file, dataset='default', **judge_kwargs):
logger = get_logger('Evaluation')
nproc = judge_kwargs.pop('nproc', 4)
rd.seed(2680)
suffix = eval_file.split('.')[-1]
model = judge_kwargs['model']
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {
'chatgpt-0125': 'openai',
'gpt-4-0125': 'gpt4'
}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
logger.error('The OPENAI API is not working properly, will use exact matching for evaluation')
model = None
else:
logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
logger.info(f'Evaluating {eval_file}')
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_option.pkl')
result = {}
if osp.exists(result_file):
result = load(result_file)
data = load(eval_file)
assert 'index' in data, 'Essential columns missing in the eval_file.'
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
idx2lines = {data.iloc[i]['index']: data.iloc[i] for i in range(len(data))}
idx2lines = {k: v for k, v in idx2lines.items() if k not in result}
indices = list(idx2lines.keys())
lines = [idx2lines[i] for i in indices]
tups = [(model, line) for line in lines]
res = track_progress_rich(
extract_answer_from_item,
tups,
nproc=nproc,
chunksize=nproc,
save=result_file,
keys=indices)
for i, r in zip(indices, res):
if i in result:
assert result[i]['opt'] == r['opt'] and result[i]['log'] == r['log']
else:
result[i] = r
indices = list(data['index'])
data['opt'] = [result[i]['opt'] for i in data['index']]
data['log'] = [result[i]['log'] for i in data['index']]
# load split
output_path = eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv')
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv'))
return output_path
import torch
torch.set_grad_enabled(False)
torch.manual_seed(1234)
from .aria import Aria
from .base import BaseModel
from .cogvlm import CogVlm, GLM4v
from .emu import Emu
from .eagle_x import Eagle
from .idefics import IDEFICS, IDEFICS2
from .instructblip import InstructBLIP
from .kosmos import Kosmos2
from .llava import LLaVA, LLaVA_Next, LLaVA_XTuner, LLaVA_Next2, LLaVA_OneVision, LLaVA_OneVision_HF
from .minicpm_v import MiniCPM_V, MiniCPM_Llama3_V, MiniCPM_V_2_6
from .minigpt4 import MiniGPT4
from .mmalaya import MMAlaya, MMAlaya2
from .monkey import Monkey, MonkeyChat
from .moondream import Moondream1, Moondream2
from .minimonkey import MiniMonkey
from .mplug_owl2 import mPLUG_Owl2
from .omnilmm import OmniLMM12B
from .open_flamingo import OpenFlamingo
from .pandagpt import PandaGPT
from .qwen_vl import QwenVL, QwenVLChat
from .qwen2_vl import Qwen2VLChat
from .transcore_m import TransCoreM
from .visualglm import VisualGLM
from .xcomposer import ShareCaptioner, XComposer, XComposer2, XComposer2_4KHD, XComposer2d5
from .yi_vl import Yi_VL
from .internvl import InternVLChat
from .deepseek_vl import DeepSeekVL
from .janus import Janus
from .mgm import Mini_Gemini
from .bunnyllama3 import BunnyLLama3
from .vxverse import VXVERSE
from .paligemma import PaliGemma
from .qh_360vl import QH_360VL
from .phi3_vision import Phi3Vision, Phi3_5Vision
from .wemm import WeMM
from .cambrian import Cambrian
from .chameleon import Chameleon
from .video_llm import VideoLLaVA, VideoLLaVA_HF, Chatunivi, VideoChatGPT, LLaMAVID, VideoChat2_HD, PLLaVA
from .vila import VILA
from .ovis import Ovis, Ovis1_6, Ovis1_6_Plus
from .mantis import Mantis
from .mixsense import LLama3Mixsense
from .parrot import Parrot
from .omchat import OmChat
from .rbdash import RBDash
from .xgen_mm import XGenMM
from .slime import SliME
from .mplug_owl3 import mPLUG_Owl3
from .pixtral import Pixtral
from .llama_vision import llama_vision
from .molmo import molmo
from .points import POINTS, POINTSV15
from .nvlm import NVLM
from .vintern_chat import VinternChat
from .h2ovl_mississippi import H2OVLChat
from .falcon_vlm import Falcon2VLM
from .smolvlm import SmolVLM
from .sail_vl import SailVL
from .valley import ValleyEagleChat
import torch
import warnings
import copy as cp
from PIL import Image
import pandas as pd
import string
import re
from .base import BaseModel
from ..smp import isimg, listinstr, cn_string
from ..dataset import DATASET_TYPE, DATASET_MODALITY
class Aria(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='rhymes-ai/Aria', **kwargs):
from transformers import AutoModelForCausalLM, AutoProcessor
assert model_path is not None
self.model_path = model_path
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
tokenizer = processor.tokenizer
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.unk_token_id
self.processor = processor
self.tokenizer = tokenizer
self.model = AutoModelForCausalLM.from_pretrained(
model_path,
device_map='cuda',
torch_dtype=torch.bfloat16,
trust_remote_code=True
).eval()
default_kwargs = dict(
do_sample=False,
num_beams=1,
max_new_tokens=512,
min_new_tokens=1,
num_return_sequences=1,
use_cache=True,
output_hidden_states=True,
pad_token_id=tokenizer.unk_token_id,
stop_strings=["<|im_end|>"],
tokenizer=processor.tokenizer,
)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
torch.cuda.empty_cache()
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
# For multi-turn datasets, we do not use a custom prompt
return False
if DATASET_MODALITY(dataset) == 'VIDEO':
# For video benchmarks, we do not use a custom prompt here
return False
else:
return True
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += (
"\nAnswer with the option's letter from the given choices directly."
)
else:
if listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet', 'MathVerse'], dataset):
prompt = prompt
elif listinstr(['LLaVABench', 'MMBench-Video'], dataset):
prompt += '\nAnswer this question in detail.'
elif listinstr(['DocVQA'], dataset):
prompt += '\nAnswer briefly and directly.'
else:
prompt += '\nAnswer the question using a single word or phrase.'
message = [dict(type='image', value=s) for s in tgt_path]
message.append(dict(type='text', value=prompt))
return message
def build_video_prompt(self, prompt, dataset=None):
if listinstr(['MMBench-Video'], dataset):
prompt = prompt.replace('\nAnswer:', '')
prompt = prompt.replace(
'Question: ',
'Please carefully check the video and then answer the following question with details:'
)
elif listinstr(['Video-MME'], dataset):
prompt = prompt.replace('\nAnswer:', '')
prompt += "\nAnswer with the option's letter from the given choices directly."
elif listinstr(['MVBench'], dataset):
prompt = prompt.replace('Best option:(', '')
system_prompt = 'Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.\n' # noqa: E501
prompt = prompt.replace(system_prompt, '')
return prompt
def adjust_kwargs(self, dataset):
kwargs = cp.deepcopy(self.kwargs)
kwargs["temperature"] = 0.0
kwargs["do_sample"] = False
if DATASET_MODALITY(dataset) == "VIDEO":
kwargs["max_image_size"] = 490
else:
kwargs["max_image_size"] = 980
kwargs["split_image"] = False
if listinstr(['MMMU', 'MMStar', 'Math'], dataset):
# These datasets may elicit CoT-like behaviour from the model,
# so allow longer outputs.
kwargs['max_new_tokens'] = 512
return kwargs
if DATASET_TYPE(dataset) in ['MCQ', 'Y/N']:
kwargs['max_new_tokens'] = 64
elif DATASET_TYPE(dataset) == 'Caption' and 'COCO' in dataset:
kwargs['max_new_tokens'] = 64
elif DATASET_TYPE(dataset) == 'VQA':
if listinstr(['OCRVQA', 'ChartQA', 'DocVQA'], dataset):
kwargs['max_new_tokens'] = 128
elif listinstr(['TextVQA'], dataset):
kwargs['max_new_tokens'] = 32
if listinstr(['OCR', 'ChartQA', 'DocVQA', 'InfoVQA', 'TextVQA'], dataset):
# OCR-related datasets that need image splitting
kwargs["split_image"] = True
return kwargs
def generate_inner(self, message, dataset=None):
if dataset is not None:
kwargs = self.adjust_kwargs(dataset)
else:
kwargs = self.kwargs
max_image_size = kwargs.pop("max_image_size")
split_image = kwargs.pop("split_image")
prompt = '<|im_start|>user\n'
images = []
last_message_modality = "text"
if listinstr(['MLVU', 'TempCompass', 'MVBench'], dataset): # re-arrange the data
new_message = []
for s in message:
if s['type'] == 'image':
new_message.append(s)
for s in message:
if s['type'] == 'text':
new_message.append(s)
message = new_message
for s in message:
if s['type'] == 'image':
prompt += '<fim_prefix><|img|><fim_suffix>'
images.append(s['value'])
last_message_modality = "image"
elif s['type'] == 'text':
text = re.sub(r"<image \d+>", "", s["value"])
if last_message_modality == "image":
prompt += "\n"
last_message_modality = "text"
prompt += text
if DATASET_MODALITY(dataset) == 'VIDEO':
prompt = self.build_video_prompt(prompt, dataset)
prompt += '<|im_end|>\n<|im_start|>assistant\n'
if images:
images = [Image.open(s).convert('RGB') for s in images]
encoded = self.processor(
text=prompt,
images=images,
return_tensors='pt',
padding='longest',
max_image_size=max_image_size,
split_image=split_image,
)
else:
encoded = self.processor(text=prompt, return_tensors='pt', padding='longest')
encoded["pixel_values"] = encoded["pixel_values"].to(self.model.dtype)
encoded = {k: v.to(self.model.device) for k, v in encoded.items()}
pred = self.model.generate(**encoded, **kwargs)
answer = self.tokenizer.decode(pred[0][encoded['input_ids'].size(1):].cpu(), skip_special_tokens=True).strip()
answer = answer.replace('<|im_end|>', '')
return answer
from ..smp import *
from ..dataset import img_root_map, DATASET_TYPE
from abc import abstractmethod
class BaseModel:
INTERLEAVE = False
allowed_types = ['text', 'image', 'video']
def __init__(self):
self.dump_image_func = None
def use_custom_prompt(self, dataset):
"""Whether to use custom prompt for the given dataset.
Args:
dataset (str): The name of the dataset.
Returns:
bool: Whether to use custom prompt. If True, will call `build_prompt` of the VLM to build the prompt.
Default to False.
"""
return False
@abstractmethod
def build_prompt(self, line, dataset):
"""Build custom prompts for a specific dataset. Called only if `use_custom_prompt` returns True.
Args:
line (line of pd.DataFrame): The raw input line.
dataset (str): The name of the dataset.
Returns:
str: The built message.
"""
raise NotImplementedError
def set_dump_image(self, dump_image_func):
self.dump_image_func = dump_image_func
def dump_image(self, line, dataset):
return self.dump_image_func(line)
@abstractmethod
def generate_inner(self, message, dataset=None):
raise NotImplementedError
def check_content(self, msgs):
"""Check the content type of the input. Four types are allowed: str, dict, liststr, listdict.
"""
if isinstance(msgs, str):
return 'str'
if isinstance(msgs, dict):
return 'dict'
if isinstance(msgs, list):
types = [self.check_content(m) for m in msgs]
if all(t == 'str' for t in types):
return 'liststr'
if all(t == 'dict' for t in types):
return 'listdict'
return 'unknown'
def preproc_content(self, inputs):
"""Convert the raw input messages to a list of dicts.
Args:
inputs: raw input messages.
Returns:
list(dict): The preprocessed input messages. Will return None if failed to preprocess the input.
"""
if self.check_content(inputs) == 'str':
return [dict(type='text', value=inputs)]
elif self.check_content(inputs) == 'dict':
assert 'type' in inputs and 'value' in inputs
return [inputs]
elif self.check_content(inputs) == 'liststr':
res = []
for s in inputs:
mime, pth = parse_file(s)
if mime is None or mime == 'unknown':
res.append(dict(type='text', value=s))
else:
res.append(dict(type=mime.split('/')[0], value=pth))
return res
elif self.check_content(inputs) == 'listdict':
for item in inputs:
assert 'type' in item and 'value' in item
mime, s = parse_file(item['value'])
if mime is None:
assert item['type'] == 'text'
else:
assert mime.split('/')[0] == item['type']
item['value'] = s
return inputs
else:
return None
def generate(self, message, dataset=None):
"""Generate the output message.
Args:
message (list[dict]): The input message.
dataset (str, optional): The name of the dataset. Defaults to None.
Returns:
str: The generated message.
"""
assert self.check_content(message) in ['str', 'dict', 'liststr', 'listdict'], f'Invalid input type: {message}'
message = self.preproc_content(message)
assert message is not None and self.check_content(message) == 'listdict'
for item in message:
assert item['type'] in self.allowed_types, f'Invalid input type: {item["type"]}'
return self.generate_inner(message, dataset)
def chat(self, messages, dataset=None):
"""The main function for multi-turn chatting. Will call `chat_inner` with the preprocessed input messages."""
assert hasattr(self, 'chat_inner'), 'The API model should have the `chat_inner` method. '
for msg in messages:
assert isinstance(msg, dict) and 'role' in msg and 'content' in msg, msg
assert self.check_content(msg['content']) in ['str', 'dict', 'liststr', 'listdict'], msg
msg['content'] = self.preproc_content(msg['content'])
while len(messages):
try:
return self.chat_inner(messages, dataset=dataset)
except Exception as e:
logging.info(f'{type(e)}: {e}')
messages = messages[1:]
while len(messages) and messages[0]['role'] != 'user':
messages = messages[1:]
continue
return 'Chat Mode: Failed with all possible conversation turns.'
def message_to_promptimg(self, message, dataset=None):
assert not self.INTERLEAVE
model_name = self.__class__.__name__
warnings.warn(
f'Model {model_name} does not support interleaved input. '
'Will use the first image and aggregated texts as prompt. ')
num_images = len([x for x in message if x['type'] == 'image'])
if num_images == 0:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = None
else:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
images = [x['value'] for x in message if x['type'] == 'image']
if 'BLINK' == dataset:
image = concat_images_vlmeval(images, target_size=512)
else:
image = images[0]
return prompt, image
def message_to_promptvideo(self, message):
if self.VIDEO_LLM:
num_videos = len([x for x in message if x['type'] == 'video'])
if num_videos == 0:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
video = None
else:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
video = [x['value'] for x in message if x['type'] == 'video'][0]
return prompt, video
else:
logging.critical('Model does not support video input.')
raise NotImplementedError
def message_to_promptvideo_withrole(self, message, dataset=None):
if self.VIDEO_LLM:
system, user, assistant, video_list = '', '', '', []
for msg in message:
if msg['type'] == 'text':
if 'role' in msg and msg['role'] == 'system':
system += msg['value']
elif 'role' in msg and msg['role'] == 'assistant':
assistant += msg['value']
else:
user += msg['value']
elif msg['type'] == 'video':
video_list.append(msg['value'])
question = {
'system': system,
'user': user,
'assistant': assistant
}
if assistant == '':
if listinstr(['MCQ'], DATASET_TYPE(dataset)):
question['assistant'] = 'Best Option: ('
else:
del question['assistant']
if len(video_list) > 1:
print('VLMEvalKit only supports a single video as input; using the first video.')
video = video_list[0]
return question, video
else:
logging.critical('Model does not support video input.')
raise NotImplementedError
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings
import re
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
class BunnyLLama3(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='BAAI/Bunny-v1_1-Llama-3-8B-V', **kwargs):
assert model_path is not None
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings('ignore')
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='auto', trust_remote_code=True)
self.kwargs = kwargs
def use_custom_prompt(self, dataset):
if listinstr(['MCQ', 'Y/N'], DATASET_TYPE(dataset)) or listinstr(['mathvista'], dataset.lower()):
return True
else:
return False
def build_prompt(self, line, dataset):
if dataset is None:
dataset = self.dataset
if isinstance(line, int):
line = self.data.iloc[line]
tgt_path = self.dump_image(line, dataset)
prompt = line['question']
if DATASET_TYPE(dataset) == 'MCQ':
if listinstr(['mmmu'], dataset.lower()):
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
assert hint is None
question = line['question']
question = re.sub(r'<image (\d+)>', lambda x: x.group(0)[1:-1], question)
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = '\n'
for key, item in options.items():
options_prompt += f'({key}) {item}\n'
prompt = question
if len(options):
prompt += options_prompt
prompt += "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\nAnswer the question using a single word or phrase.'
else:
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'{hint}\n'
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = '\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
prompt += question + options_prompt
if listinstr(['cn', 'ccbench'], dataset.lower()):
prompt += '请直接回答选项字母。'
else:
prompt += "Answer with the option's letter from the given choices directly."
elif DATASET_TYPE(dataset) == 'Y/N':
if listinstr(['mme'], dataset.lower()):
if not listinstr(
['code_reasoning', 'commonsense_reasoning', 'numerical_calculation', 'text_translation'],
line['category']):
prompt = prompt.replace(' Please answer yes or no.',
'\nAnswer the question using a single word or phrase.')
elif listinstr(['pope'], dataset.lower()):
prompt = prompt.replace(' Please answer yes or no.',
'\nAnswer the question using a single word or phrase.')
elif listinstr(['mathvista'], dataset.lower()):
match = re.search(r'Hint: (.*?)\nQuestion: (.*?)\n(Choices:\n(.*))?', prompt + '\n', re.DOTALL)
prompt = match.group(2)
if match.group(4) is not None:
prompt += '\n' + match.group(4).rstrip('\n')
prompt += '\n' + match.group(1)
else:
raise ValueError(
f"Bunny does not implement a custom prompt for {dataset}; the default prompt should have been used instead.")
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
text = (f'A chat between a curious user and an artificial intelligence assistant. '
f"The assistant gives helpful, detailed, and polite answers to the user's questions. "
f'USER: <image>\n{prompt} ASSISTANT:')
text_chunks = [self.tokenizer(chunk).input_ids for chunk in text.split('<image>')]
input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0)
image = Image.open(image_path).convert('RGB')
image_tensor = self.model.process_images([image], self.model.config).to(dtype=self.model.dtype)
output_ids = self.model.generate(input_ids, images=image_tensor, max_new_tokens=128, use_cache=True)[0]
response = self.tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True)
return response
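# Minimal sketch (tokenizer behavior assumed to match a standard HF Llama
# tokenizer) of the image-token splicing in `generate_inner` above: the prompt
# is split on '<image>', both halves are tokenized, and the placeholder id -200
# is spliced in between so the model can substitute the projected image features.
#
# chunks = 'USER: <image>\nWhat is this? ASSISTANT:'.split('<image>')
# ids = tokenizer(chunks[0]).input_ids + [-200] + tokenizer(chunks[1]).input_ids[1:]
# # `[1:]` drops the leading special token the tokenizer prepends to the second chunk.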
import torch
from PIL import Image
from .base import BaseModel
from ..smp import *
import warnings
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = '<image>'
DEFAULT_IM_START_TOKEN = '<im_start>'
DEFAULT_IM_END_TOKEN = '<im_end>'
class Cambrian(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, model_path='nyu-visionx/cambrian-8b', **kwargs):
assert model_path is not None
try:
from cambrian.conversation import conv_templates, SeparatorStyle
from cambrian.model.builder import load_pretrained_model
from cambrian.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
except Exception as e:
logging.critical('Please install cambrian from https://github.com/cambrian-mllm/cambrian.')
raise e
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(
model_path,
None,
model_name,
device_map=None
)
if '8b' in model_path:
self.conv_mode = 'llama_3'
elif '13b' in model_path:
self.conv_mode = 'vicuna_v1'
else:
self.conv_mode = 'chatml_direct'
self.model_config = model.config
self.conv_templates = conv_templates
self.tokenizer_image_token = tokenizer_image_token
self.process_images = process_images
self.tokenizer = tokenizer
self.image_processor = image_processor
self.model = model.to('cuda')
def process(self, image, question):
if self.model_config.mm_use_im_start_end:
question = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + question
else:
question = DEFAULT_IMAGE_TOKEN + '\n' + question
conv = self.conv_templates[self.conv_mode].copy()
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
image_size = [image.size]
image_tensor = self.process_images([image], self.image_processor, self.model_config)
input_ids = self.tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
input_ids = input_ids.unsqueeze(0).cuda()
return input_ids, image_tensor, image_size, prompt
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
input_ids, image_tensor, image_sizes, prompt = self.process(image, prompt)
input_ids = input_ids.to(device='cuda', non_blocking=True)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
images=image_tensor,
image_sizes=image_sizes,
do_sample=False,
temperature=0,
num_beams=1,
max_new_tokens=512,
use_cache=True
)
outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
return outputs
import os.path as osp
import warnings
from .base import BaseModel
from ..smp import *
from PIL import Image
import torch
class Chameleon(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='facebook/chameleon-7b', **kwargs):
try:
from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
except Exception as e:
logging.critical('Please install the latest transformers.')
raise e
processor = ChameleonProcessor.from_pretrained(model_path)
model = ChameleonForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.bfloat16)
self.model = model.cuda().eval()
self.processor = processor
def generate_inner(self, message, dataset=None):
content, images = '', []
for x in message:
if x['type'] == 'text':
content += x['value']
elif x['type'] == 'image':
content += '<image>\n'
images.append(Image.open(x['value']))
inputs = self.processor(
text=[content],
images=images,
padding=True,
return_tensors='pt'
).to(device='cuda', dtype=torch.bfloat16)
generate_ids = self.model.generate(**inputs, max_new_tokens=512)
input_token_len = inputs.input_ids.shape[1]
text = self.processor.batch_decode(
generate_ids[:, input_token_len:],
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)[0]
return text
import torch
from PIL import Image
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer
class GLM4v(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='THUDM/glm-4v-9b', **kwargs):
assert model_path is not None
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
trust_remote_code=True
).to('cuda').eval()
gen_kwargs = {'max_length': 2048, 'do_sample': False}
gen_kwargs.update(kwargs)
self.kwargs = gen_kwargs
self.end_text_token = '<|endoftext|>'
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
if dataset is not None and DATASET_TYPE(dataset) in ['MCQ', 'Y/N']:
prompt += '\nShort Answer.'
inputs = self.tokenizer.apply_chat_template(
[{'role': 'user', 'image': image, 'content': prompt}],
add_generation_prompt=True, tokenize=True, return_tensors='pt', return_dict=True
)
inputs = inputs.to('cuda')
with torch.no_grad():
outputs = self.model.generate(**inputs, **self.kwargs)
outputs = outputs[:, inputs['input_ids'].shape[1]:]
response = self.tokenizer.decode(outputs[0])
return response.split(self.end_text_token)[0]
class CogVlm(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='THUDM/cogvlm2-llama3-chat-19B', tokenizer_name=None, **kwargs):
assert model_path is not None
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
trust_remote_code=True,
).to('cuda').eval()
self.kwargs = kwargs
if tokenizer_name:
tokenizer = LlamaTokenizer.from_pretrained(tokenizer_name)
gen_kwargs = {'max_length': 2048, 'do_sample': False}
self.end_text_token = '</s>'
else:
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
gen_kwargs = {'max_new_tokens': 2048, 'pad_token_id': 128002}
self.end_text_token = '<|end_of_text|>'
self.kwargs.update(gen_kwargs)
self.tokenizer = tokenizer
self.model = model
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
option_candidate = string.ascii_uppercase
options = {
cand: line[cand]
for cand in option_candidate
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if not cn_string(prompt):
prompt = prompt + '\n' + "Answer with the option's letter from the given choices directly."
else:
prompt = prompt + '\n' + '请直接回答选项字母。'
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=p) for p in tgt_path])
return message
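# Illustrative sketch of the text portion of the MCQ prompt assembled by
# `build_prompt` above (the row contents are assumptions):
#
# line = {'question': 'What colour is the sky?', 'A': 'Blue', 'B': 'Green'}
# # resulting prompt:
# # What colour is the sky?
# # A. Blue
# # B. Green
# # Answer with the option's letter from the given choices directly.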
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if dataset is not None and DATASET_TYPE(dataset) in ['MCQ', 'Y/N']:
prompt += '\nShort Answer.'
image = Image.open(image_path).convert('RGB')
inputs = self.model.build_conversation_input_ids(
self.tokenizer, query=prompt, history=[], images=[image]) # chat mode
inputs = {
'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'),
'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'),
'attention_mask': inputs['attention_mask'].unsqueeze(0).to('cuda'),
'images': [[inputs['images'][0].to('cuda').to(torch.bfloat16)]],
}
with torch.no_grad():
outputs = self.model.generate(**inputs, **self.kwargs)
outputs = outputs[:, inputs['input_ids'].shape[1]:]
response = self.tokenizer.decode(outputs[0])
response = response.split(self.end_text_token)[0].strip()
return response
import sys
import torch
from transformers import AutoModelForCausalLM
import warnings
from .base import BaseModel
from ..smp import *
class DeepSeekVL(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def check_install(self):
try:
import deepseek_vl
except Exception as e:
logging.critical(
'Please install deepseek_vl from source: https://github.com/deepseek-ai/DeepSeek-VL')
raise e
def __init__(self, model_path='deepseek-ai/deepseek-vl-1.3b-chat', **kwargs):
self.check_install()
assert model_path is not None
self.model_path = model_path
from deepseek_vl.models import VLChatProcessor
self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
self.tokenizer = self.vl_chat_processor.tokenizer
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
self.model = model.to(torch.bfloat16).cuda().eval()
torch.cuda.empty_cache()
default_kwargs = dict(max_new_tokens=512, do_sample=False, use_cache=True)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
def prepare_inputs(self, message):
def prepare_itlist(msgs):
content, images = '', []
for s in msgs:
if s['type'] == 'image':
images.append(s['value'])
content += '<image_placeholder>'
elif s['type'] == 'text':
content += s['value']
return content, images
conversation = []
if 'role' not in message[0]:
content, images = prepare_itlist(message)
conversation.append(dict(role='User', content=content, images=images))
else:
role_map = {'user': 'User', 'assistant': 'Assistant'}
for msgs in message:
role = role_map[msgs['role']]
content, images = prepare_itlist(msgs['content'])
conversation.append(dict(role=role, content=content, images=images))
conversation.append(dict(role='Assistant', content=''))
return conversation
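# Illustrative sketch of the conversation structure produced by `prepare_inputs`
# for a single-turn interleaved message (the file name is an assumption):
#
# message = [
#     dict(type='image', value='chart.png'),
#     dict(type='text', value='Summarize this chart.'),
# ]
# conversation = model.prepare_inputs(message)
# # [
# #   {'role': 'User', 'content': '<image_placeholder>Summarize this chart.', 'images': ['chart.png']},
# #   {'role': 'Assistant', 'content': ''},
# # ]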
def generate_inner(self, message, dataset=None):
conversation = self.prepare_inputs(message)
from deepseek_vl.utils.io import load_pil_images
pil_images = load_pil_images(conversation)
prepare_inputs = self.vl_chat_processor(conversations=conversation, images=pil_images, force_batchify=True)
prepare_inputs = prepare_inputs.to(self.model.device)
inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs)
outputs = self.model.language_model.generate(
inputs_embeds=inputs_embeds,
attention_mask=prepare_inputs.attention_mask,
pad_token_id=self.tokenizer.eos_token_id,
bos_token_id=self.tokenizer.bos_token_id,
eos_token_id=self.tokenizer.eos_token_id,
**self.kwargs)
answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
return answer
def chat_inner(self, message, dataset=None):
return self.generate_inner(message, dataset=dataset)
import torch
from PIL import Image
from abc import abstractproperty
import sys
import os.path as osp
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import copy
# This function is used to split Eagle-X5-34B
def split_model(model_name):
import math
device_map = {}
num_gpus = torch.cuda.device_count()
rank, world_size = get_rank_and_world_size()
num_gpus = num_gpus // world_size
num_layers_map = {
'Eagle-X5-34B-Chat': 60,
'Eagle-X5-34B-Plus': 60
}
if model_name not in num_layers_map:
return 'cuda'
num_layers = num_layers_map[model_name] + 8
# Since the first GPU will be used for ViT, treat it as 0.5 GPU.
num_layers_per_gpu = math.ceil(num_layers / num_gpus)
num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
num_layers_per_gpu[-1] = num_layers - sum(num_layers_per_gpu[:-1])
num_layers_per_gpu[0] -= 4
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'model.layers.{layer_cnt}'] = rank + world_size * i
layer_cnt += 1
device_map['model.vision_tower'] = rank
device_map['model.embed_tokens'] = rank
device_map['model.norm'] = rank
device_map['model.rotary_emb'] = rank
device_map['model.mm_projector'] = rank
device_map['lm_head'] = rank
device_map[f'model.layers.{num_layers - 1}'] = rank
logging.warning("Remove L157-L158 in https://github.com/NVlabs/EAGLE/blob/fef95f103b5e9899acbbe2c237e5b99147ab7e8e/eagle/model/builder.py to make it work properly.") # noqa: E501
return device_map
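# Illustrative notes (not part of the original file) on what `split_model`
# returns; the model names are the ones handled above.
#
# split_model('Eagle-X5-7B')        # -> 'cuda' (not in the 34B layer map)
# split_model('Eagle-X5-34B-Chat')  # -> dict mapping submodules to GPU ids
# # For the 34B variants the vision tower, embeddings, norm, mm_projector and
# # lm_head stay on the local rank, while decoder layers are spread across the
# # GPUs visible to this rank; the first GPU gets fewer layers since it also
# # hosts the ViT.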
class Eagle(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def __init__(self,
model_path='NVEagle/Eagle-X5-7B',
**kwargs):
try:
from eagle.model.builder import load_pretrained_model
from eagle.utils import disable_torch_init
from eagle.mm_utils import get_model_name_from_path
except Exception as e:
logging.critical('Please install eagle from "https://github.com/NVlabs/EAGLE.git" before using the Eagle model.')
raise e
warnings.warn('Please install the latest version of eagle from GitHub before evaluating the Eagle model.')
assert osp.exists(model_path) or splitlen(model_path) == 2
model_name = get_model_name_from_path(model_path)
rank, world_size = get_rank_and_world_size()
device_map = split_model(model_path.split('/')[-1])
self.tokenizer, self.model, self.image_processor, self.context_len = (
load_pretrained_model(model_path, None, model_name, False, False, device_map=device_map)
)
self.model.eval()
self.conv_mode = 'vicuna_v1'
default_kwargs = dict(
do_sample=True,
temperature=0.2,
top_p=0.5,
num_beams=1,
max_new_tokens=512,
use_cache=True
)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
torch.cuda.empty_cache()
def generate_inner(self, message, dataset=None):
try:
from eagle import conversation as conversation_lib
from eagle.constants import (IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN,
DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN)
from eagle.conversation import conv_templates, SeparatorStyle
from eagle.mm_utils import tokenizer_image_token, process_images, KeywordsStoppingCriteria
except Exception as e:
logging.critical('Please install eagle from "https://github.com/NVlabs/EAGLE.git" before using the Eagle model.')
raise e
kwargs = self.kwargs
images = []
prompt = ''
for s in message:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
prompt += s['value']
DEFAULT_IMAGE_TOKEN = DEFAULT_IMAGE_TOKEN * len(images)
if self.model.config.mm_use_im_start_end:
prompt = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt
else:
prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt
conv = conv_templates[self.conv_mode].copy()
conv.append_message(conv.roles[0], prompt)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
images = [Image.open(s).convert('RGB') for s in images]
image_tensor = process_images(images, self.image_processor, self.model.config)
input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
input_ids = input_ids.to(device='cuda', non_blocking=True)
image_tensor = image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids.unsqueeze(0),
images=image_tensor,
image_sizes=[img.size for img in images],
**kwargs
)
outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
return outputs
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMMU'], dataset):
return False
if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
if dataset == 'MMVet':
prompt = question + '\nAnswer the question directly. '
elif DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = f'Hint: {hint}\n' if hint is not None else ''
prompt += f'{question}\n'
prompt += (
f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
if len(options) else 'Answer the question directly. '
)
else:
raise NotImplementedError
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
import os
import torch
from PIL import Image
import os.path as osp
from .base import BaseModel
from ..smp import *
class Emu(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self,
model_path='BAAI/Emu2-Chat',
**kwargs):
self.model_path = model_path
assert osp.exists(model_path) or splitlen(model_path) == 2
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model
local_rank = int(os.environ.get('LOCAL_RANK', 0))  # environment variables are strings; cast before the arithmetic below
device_num = torch.cuda.device_count()
assert local_rank * 2 <= device_num, 'The number of devices does not match the world size'
assert device_num >= 2, 'You need at least 2 GPUs to use EMU'
device_1 = local_rank
device_2 = local_rank + device_num // 2
torch.cuda.set_device(device_1)
torch.cuda.set_device(device_2)
tokenizer = AutoTokenizer.from_pretrained(model_path) # "BAAI/Emu2-Chat"
self.tokenizer = tokenizer
with init_empty_weights():
model = AutoModelForCausalLM.from_pretrained(
model_path, # "BAAI/Emu2-Chat"
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
trust_remote_code=True)
device_map = infer_auto_device_map(
model,
max_memory={
device_1: '38GiB',
device_2: '38GiB'
},
no_split_module_classes=['Block', 'LlamaDecoderLayer'])
# input and output logits should be on the same device
device_map['model.decoder.lm.lm_head'] = device_1
model = dispatch_model(
model,
device_map=device_map).eval()
self.model = model
kwargs_default = dict(max_new_tokens=512, length_penalty=-1)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
def generate_inner(self, message, dataset=None):
query, images = '', []
for item in message:
if item['type'] == 'image':
images.append(Image.open(item['value']).convert('RGB'))
query += '[<IMG_PLH>]'
elif item['type'] == 'text':
query += item['value']
inputs = self.model.build_input_ids(
text=[query],
tokenizer=self.tokenizer,
image=images
)
with torch.no_grad():
outputs = self.model.generate(
input_ids=inputs['input_ids'],
attention_mask=inputs['attention_mask'],
image=inputs['image'].to(torch.bfloat16),
**self.kwargs)
output_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
return output_text[0]
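# Illustrative sketch (file name is an assumption) of how a message becomes the
# Emu2 query string in `generate_inner` above: each image contributes a
# '[<IMG_PLH>]' placeholder at its position in the interleaved text.
#
# message = [
#     dict(type='text', value='Which animal appears in '),
#     dict(type='image', value='photo.jpg'),
#     dict(type='text', value='? Answer briefly.'),
# ]
# # query == 'Which animal appears in [<IMG_PLH>]? Answer briefly.'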
from PIL import Image
import requests
from .base import BaseModel
class Falcon2VLM(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='tiiuae/falcon-11B-vlm', **kwargs):
import torch
from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor
self.model_path = model_path
self.processor = LlavaNextProcessor.from_pretrained(model_path, tokenizer_class='PreTrainedTokenizerFast')
self.model = LlavaNextForConditionalGeneration.from_pretrained(
model_path, torch_dtype=torch.bfloat16, device_map='cuda').eval()
default_kwargs = {'max_new_tokens': 512}
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
prompt = f'User:<image>\n{prompt} Falcon:'
inputs = self.processor(text=prompt, images=image, return_tensors='pt').to('cuda')
output = self.model.generate(**inputs, **self.kwargs)
prompt_length = inputs['input_ids'].shape[1]
model_response = self.processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
return model_response