Commit bc5ebf0f authored by luopl

Initial commit
import json
import pickle
import pandas as pd
import os
import csv
import hashlib
import os.path as osp
import time
import numpy as np
import validators
import mimetypes
import multiprocessing as mp
from .misc import toliststr
from .vlm import decode_base64_to_image_file
def decode_img_omni(tup):
root, im, p = tup
images = toliststr(im)
paths = toliststr(p)
if len(images) > 1 and len(paths) == 1:
paths = [osp.splitext(p)[0] + f'_{i}' + osp.splitext(p)[1] for i in range(len(images))]
assert len(images) == len(paths)
paths = [osp.join(root, p) for p in paths]
for p, im in zip(paths, images):
if osp.exists(p):
continue
if isinstance(im, str) and len(im) > 64:
decode_base64_to_image_file(im, p)
return paths
def localize_df(data, dname, nproc=32):
assert 'image' in data
indices = list(data['index'])
indices_str = [str(x) for x in indices]
images = list(data['image'])
image_map = {x: y for x, y in zip(indices_str, images)}
root = LMUDataRoot()
root = osp.join(root, 'images', dname)
os.makedirs(root, exist_ok=True)
if 'image_path' in data:
img_paths = list(data['image_path'])
else:
img_paths = []
for i in indices_str:
if len(image_map[i]) <= 64:
idx = image_map[i]
assert idx in image_map and len(image_map[idx]) > 64
img_paths.append(f'{idx}.jpg')
else:
img_paths.append(f'{i}.jpg')
tups = [(root, im, p) for p, im in zip(img_paths, images)]
pool = mp.Pool(nproc)
ret = pool.map(decode_img_omni, tups)
pool.close()
data.pop('image')
if 'image_path' not in data:
data['image_path'] = [x[0] if len(x) == 1 else x for x in ret]
return data
def LMUDataRoot():
if 'LMUData' in os.environ and osp.exists(os.environ['LMUData']):
return os.environ['LMUData']
home = osp.expanduser('~')
root = osp.join(home, 'LMUData')
os.makedirs(root, exist_ok=True)
return root
def HFCacheRoot():
cache_list = ['HUGGINGFACE_HUB_CACHE', 'HF_HOME']
for cache_name in cache_list:
if cache_name in os.environ and osp.exists(os.environ[cache_name]):
if os.environ[cache_name].split('/')[-1] == 'hub':
return os.environ[cache_name]
else:
return osp.join(os.environ[cache_name], 'hub')
home = osp.expanduser('~')
root = osp.join(home, '.cache', 'huggingface', 'hub')
os.makedirs(root, exist_ok=True)
return root
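# Usage sketch for the two root helpers above (paths shown are illustrative and
# depend on the local environment; 'LMUData' is only honored when the path exists):
#
#   >>> os.environ['LMUData'] = '/data/LMUData'
#   >>> LMUDataRoot()
#   '/data/LMUData'
#   >>> HFCacheRoot()
#   '/home/user/.cache/huggingface/hub'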
def MMBenchOfficialServer(dataset_name):
root = LMUDataRoot()
if dataset_name in ['MMBench', 'MMBench_V11', 'MMBench_CN', 'MMBench_CN_V11']:
ans_file = f'{root}/{dataset_name}.tsv'
if osp.exists(ans_file):
data = load(ans_file)
if 'answer' in data and sum([pd.isna(x) for x in data['answer']]) == 0:
return True
if dataset_name in ['MMBench_TEST_EN', 'MMBench_TEST_CN', 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11']:
ans_file1 = f'{root}/{dataset_name}.tsv'
mapp = {
'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_CN': 'MMBench_CN',
'MMBench_TEST_EN_V11': 'MMBench_V11', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11',
}
ans_file2 = f'{root}/{mapp[dataset_name]}.tsv'
for f in [ans_file1, ans_file2]:
if osp.exists(f):
data = load(f)
if 'answer' in data and sum([pd.isna(x) for x in data['answer']]) == 0:
return True
return False
class NumpyEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
np.int16, np.int32, np.int64, np.uint8,
np.uint16, np.uint32, np.uint64)):
return int(obj)
elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
return float(obj)
elif isinstance(obj, (np.complex_, np.complex64, np.complex128)):
return {'real': obj.real, 'imag': obj.imag}
elif isinstance(obj, (np.ndarray,)):
return obj.tolist()
elif isinstance(obj, (np.bool_)):
return bool(obj)
elif isinstance(obj, (np.void)):
return None
return json.JSONEncoder.default(self, obj)
# LOAD & DUMP
def dump(data, f, **kwargs):
def dump_pkl(data, pth, **kwargs):
pickle.dump(data, open(pth, 'wb'))
def dump_json(data, pth, **kwargs):
json.dump(data, open(pth, 'w'), indent=4, ensure_ascii=False, cls=NumpyEncoder)
def dump_jsonl(data, f, **kwargs):
lines = [json.dumps(x, ensure_ascii=False, cls=NumpyEncoder) for x in data]
with open(f, 'w', encoding='utf8') as fout:
fout.write('\n'.join(lines))
def dump_xlsx(data, f, **kwargs):
data.to_excel(f, index=False, engine='xlsxwriter')
def dump_csv(data, f, quoting=csv.QUOTE_ALL):
data.to_csv(f, index=False, encoding='utf-8', quoting=quoting)
def dump_tsv(data, f, quoting=csv.QUOTE_ALL):
data.to_csv(f, sep='\t', index=False, encoding='utf-8', quoting=quoting)
handlers = dict(pkl=dump_pkl, json=dump_json, jsonl=dump_jsonl, xlsx=dump_xlsx, csv=dump_csv, tsv=dump_tsv)
suffix = f.split('.')[-1]
return handlers[suffix](data, f, **kwargs)
def load(f, fmt=None):
def load_pkl(pth):
return pickle.load(open(pth, 'rb'))
def load_json(pth):
return json.load(open(pth, 'r', encoding='utf-8'))
def load_jsonl(f):
lines = open(f, encoding='utf-8').readlines()
lines = [x.strip() for x in lines]
if lines[-1] == '':
lines = lines[:-1]
data = [json.loads(x) for x in lines]
return data
def load_xlsx(f):
return pd.read_excel(f)
def load_csv(f):
return pd.read_csv(f)
def load_tsv(f):
return pd.read_csv(f, sep='\t')
handlers = dict(pkl=load_pkl, json=load_json, jsonl=load_jsonl, xlsx=load_xlsx, csv=load_csv, tsv=load_tsv)
if fmt is not None:
return handlers[fmt](f)
suffix = f.split('.')[-1]
return handlers[suffix](f)
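# Usage sketch for `dump` / `load`: the handler is chosen from the file suffix, so
# the same calls cover pkl / json / jsonl / xlsx / csv / tsv (file names below are
# hypothetical):
#
#   >>> dump({'acc': 0.5}, 'result.json')
#   >>> load('result.json')
#   {'acc': 0.5}
#   >>> dump(pd.DataFrame({'index': [0], 'answer': ['A']}), 'data.tsv')
#   >>> list(load('data.tsv')['answer'])
#   ['A']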
def download_file(url, filename=None):
import urllib.request
from tqdm import tqdm
class DownloadProgressBar(tqdm):
def update_to(self, b=1, bsize=1, tsize=None):
if tsize is not None:
self.total = tsize
self.update(b * bsize - self.n)
if filename is None:
filename = url.split('/')[-1]
try:
with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:
urllib.request.urlretrieve(url, filename=filename, reporthook=t.update_to)
except Exception as e:
import logging
logging.warning(f'{type(e)}: {e}')
# Handle Failed Downloads from huggingface.co
if 'huggingface.co' in url:
url_new = url.replace('huggingface.co', 'hf-mirror.com')
try:
download_file(url_new, filename)
return filename
except Exception as e:
logging.warning(f'{type(e)}: {e}')
raise Exception(f'Failed to download {url}')
else:
raise Exception(f'Failed to download {url}')
return filename
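# Usage sketch: `download_file` saves to the URL basename unless `filename` is given,
# and retries huggingface.co URLs through hf-mirror.com on failure (the URL below is
# hypothetical):
#
#   >>> download_file('https://example.com/assets/demo.jpg', '/tmp/demo.jpg')
#   '/tmp/demo.jpg'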
def ls(dirname='.', match=[], mode='all', level=1):
if isinstance(level, str):
assert '+' in level
level = int(level[:-1])
res = []
for i in range(1, level + 1):
res.extend(ls(dirname, match=match, mode='file', level=i))
return res
if dirname == '.':
ans = os.listdir(dirname)
else:
ans = [osp.join(dirname, x) for x in os.listdir(dirname)]
assert mode in ['all', 'dir', 'file']
assert level >= 1 and isinstance(level, int)
if level == 1:
if isinstance(match, str):
match = [match]
for m in match:
if len(m) == 0:
continue
if m[0] != '!':
ans = [x for x in ans if m in x]
else:
ans = [x for x in ans if m[1:] not in x]
if mode == 'dir':
ans = [x for x in ans if osp.isdir(x)]
elif mode == 'file':
ans = [x for x in ans if not osp.isdir(x)]
return ans
else:
dirs = [x for x in ans if osp.isdir(x)]
res = []
for d in dirs:
res.extend(ls(d, match=match, mode=mode, level=level - 1))
return res
def mrlines(fname, sp='\n'):
f = open(fname).read().split(sp)
while f != [] and f[-1] == '':
f = f[:-1]
return f
def mwlines(lines, fname):
with open(fname, 'w') as fout:
fout.write('\n'.join(lines))
def md5(s):
hash = hashlib.new('md5')
if osp.exists(s):
with open(s, 'rb') as f:
for chunk in iter(lambda: f.read(2**20), b''):
hash.update(chunk)
else:
hash.update(s.encode('utf-8'))
return str(hash.hexdigest())
def last_modified(pth):
stamp = osp.getmtime(pth)
m_ti = time.ctime(stamp)
t_obj = time.strptime(m_ti)
t = time.strftime('%Y%m%d%H%M%S', t_obj)[2:]
return t
def parse_file(s):
if osp.exists(s) and s != '.':
assert osp.isfile(s)
suffix = osp.splitext(s)[1].lower()
mime = mimetypes.types_map.get(suffix, 'unknown')
return (mime, s)
elif s.startswith('data:image/'):
# To be compatible with OPENAI base64 format
content = s[11:]
mime = content.split(';')[0]
content = ';'.join(content.split(';')[1:])
dname = osp.join(LMUDataRoot(), 'files')
assert content.startswith('base64,')
b64 = content[7:]
os.makedirs(dname, exist_ok=True)
tgt = osp.join(dname, md5(b64) + '.png')
decode_base64_to_image_file(b64, tgt)
return parse_file(tgt)
elif validators.url(s):
suffix = osp.splitext(s)[1].lower()
if suffix in mimetypes.types_map:
mime = mimetypes.types_map[suffix]
dname = osp.join(LMUDataRoot(), 'files')
os.makedirs(dname, exist_ok=True)
tgt = osp.join(dname, md5(s) + suffix)
download_file(s, tgt)
return (mime, tgt)
else:
return ('url', s)
else:
return (None, s)
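# Usage sketch: `parse_file` classifies an input string into a (mime, value) tuple,
# downloading URLs and materializing base64 payloads under LMUDataRoot()/files
# (the values below are illustrative):
#
#   >>> parse_file('/tmp/demo.jpg')                  # existing local file
#   ('image/jpeg', '/tmp/demo.jpg')
#   >>> parse_file('What is in this image?')         # plain text
#   (None, 'What is in this image?')
#   >>> parse_file('https://example.com/a.png')[0]   # downloaded, then typed by suffix
#   'image/png'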
def file_size(f, unit='GB'):
stats = os.stat(f)
div_map = {
'GB': 2 ** 30,
'MB': 2 ** 20,
'KB': 2 ** 10,
}
return stats.st_size / div_map[unit]
def parquet_to_tsv(file_path):
data = pd.read_parquet(file_path)
pth = '/'.join(file_path.split('/')[:-1])
data_name = file_path.split('/')[-1].split('.')[0]
data.to_csv(osp.join(pth, f'{data_name}.tsv'), sep='\t', index=False)
import logging
logging.basicConfig(
format='[%(asctime)s] %(levelname)s - %(filename)s: %(funcName)s - %(lineno)d: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S')
logger_initialized = {}
def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'):
logger = logging.getLogger(name)
if name in logger_initialized:
return logger
for logger_name in logger_initialized:
if name.startswith(logger_name):
return logger
stream_handler = logging.StreamHandler()
handlers = [stream_handler]
try:
import torch.distributed as dist
if dist.is_available() and dist.is_initialized():
rank = dist.get_rank()
else:
rank = 0
except ImportError:
rank = 0
if rank == 0 and log_file is not None:
file_handler = logging.FileHandler(log_file, file_mode)
handlers.append(file_handler)
formatter = logging.Formatter(
'[%(asctime)s] %(levelname)s - %(name)s - %(filename)s: %(funcName)s - %(lineno)d: %(message)s')
for handler in handlers:
handler.setFormatter(formatter)
handler.setLevel(log_level)
logger.addHandler(handler)
if rank == 0:
logger.setLevel(log_level)
else:
logger.setLevel(logging.ERROR)
logger_initialized[name] = True
return logger
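# Usage sketch: loggers are cached by name, and only rank-0 (or non-distributed)
# processes attach the optional file handler (the log path is hypothetical):
#
#   >>> logger = get_logger('MyEval', log_file='/tmp/eval.log')
#   >>> logger.info('evaluation started')
#   >>> get_logger('MyEval') is logger
#   True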
# flake8: noqa: F401, F403
import abc
import argparse
import csv
import multiprocessing as mp
import os
import os.path as osp
from pathlib import Path
import copy as cp
import random as rd
import requests
import shutil
import subprocess
import warnings
import pandas as pd
from collections import OrderedDict, defaultdict
from multiprocessing import Pool, current_process
from tqdm import tqdm
import datetime
import matplotlib.pyplot as plt
from tabulate import tabulate
from json import JSONDecoder
from huggingface_hub import scan_cache_dir
from huggingface_hub.utils._cache_manager import _scan_cached_repo
from sty import fg, bg, ef, rs
def modelscope_flag_set():
return os.environ.get('VLMEVALKIT_USE_MODELSCOPE', None) in ['1', 'True']
def process_punctuation(inText):
import re
outText = inText
punct = [
';', r'/', '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-',
'>', '<', '@', '`', ',', '?', '!'
]
commaStrip = re.compile('(\d)(,)(\d)') # noqa: W605
periodStrip = re.compile('(?!<=\d)(\.)(?!\d)') # noqa: W605
for p in punct:
if (p + ' ' in inText or ' ' + p in inText) or (re.search(
commaStrip, inText) is not None):
outText = outText.replace(p, '')
else:
outText = outText.replace(p, ' ')
outText = periodStrip.sub('', outText, re.UNICODE)
return outText
def h2r(value):
if value[0] == '#':
value = value[1:]
assert len(value) == 6
return tuple(int(value[i:i + 2], 16) for i in range(0, 6, 2))
def r2h(rgb):
return '#%02x%02x%02x' % rgb
def colored(s, color):
if isinstance(color, str):
if hasattr(fg, color):
return getattr(fg, color) + s + fg.rs
color = h2r(color)
return fg(*color) + s + fg.rs
def istype(s, type):
if isinstance(s, type):
return True
try:
return isinstance(eval(s), type)
except Exception as _:
return False
def bincount(lst):
bins = defaultdict(lambda: 0)
for item in lst:
bins[item] += 1
return bins
def get_cache_path(repo_id, branch='main', repo_type='datasets'):
try:
if modelscope_flag_set():
from modelscope.hub.file_download import create_temporary_directory_and_cache
if repo_type == 'datasets':
repo_type = 'dataset'
_, cache = create_temporary_directory_and_cache(model_id=repo_id, repo_type=repo_type)
cache_path = cache.get_root_location()
return cache_path
else:
from .file import HFCacheRoot
cache_path = HFCacheRoot()
org, repo_name = repo_id.split('/')
repo_path = Path(osp.join(cache_path, f'{repo_type}--{org}--{repo_name}/'))
hf_cache_info = _scan_cached_repo(repo_path=repo_path)
revs = {r.refs: r for r in hf_cache_info.revisions}
if branch is not None:
revs = {refs: r for refs, r in revs.items() if branch in refs}
rev2keep = max(revs.values(), key=lambda r: r.last_modified)
return str(rev2keep.snapshot_path)
except Exception as e:
import logging
logging.warning(f'{type(e)}: {e}')
return None
def proxy_set(s):
import os
for key in ['http_proxy', 'HTTP_PROXY', 'https_proxy', 'HTTPS_PROXY']:
os.environ[key] = s
def get_rank_and_world_size():
rank = int(os.environ.get('RANK', 0))
world_size = int(os.environ.get('WORLD_SIZE', 1))
return rank, world_size
def splitlen(s, sym='/'):
return len(s.split(sym))
def listinstr(lst, s):
assert isinstance(lst, list)
for item in lst:
if item in s:
return True
return False
def d2df(D):
return pd.DataFrame({x: [D[x]] for x in D})
def cn_string(s):
import re
if re.search(u'[\u4e00-\u9fff]', s):
return True
return False
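# Quick examples for the small helpers above (illustrative values):
#
#   >>> splitlen('a/b/c')
#   3
#   >>> listinstr(['MMBench', 'MME'], 'MMBench_DEV_EN')
#   True
#   >>> d2df({'model': 'demo', 'acc': 0.5}).shape
#   (1, 2)
#   >>> cn_string('你好')
#   True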
try:
import decord
except ImportError:
pass
def timestr(granularity='second'):
s = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
assert granularity in ['second', 'minute', 'hour', 'day']
if granularity == 'second':
return s
elif granularity == 'minute':
return s[:-2]
elif granularity == 'hour':
return s[:-4]
elif granularity == 'day':
return s[:-6]
def _minimal_ext_cmd(cmd, cwd=None):
env = {}
for k in ['SYSTEMROOT', 'PATH', 'HOME']:
v = os.environ.get(k)
if v is not None:
env[k] = v
env['LANGUAGE'] = 'C'
env['LANG'] = 'C'
env['LC_ALL'] = 'C'
out = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env, cwd=cwd).communicate()[0]
return out
def githash(fallback='unknown', digits=8):
if digits is not None and not isinstance(digits, int):
raise TypeError('digits must be None or an integer')
try:
import vlmeval
except ImportError as e:
import logging
logging.error(f'ImportError: {str(e)}')
return fallback
try:
out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'], cwd=vlmeval.__path__[0])
sha = out.strip().decode('ascii')
if digits is not None:
sha = sha[:digits]
except OSError:
sha = fallback
return sha
def dict_merge(dct, merge_dct):
for k, _ in merge_dct.items():
if (k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], dict)): #noqa
dict_merge(dct[k], merge_dct[k])
else:
dct[k] = merge_dct[k]
def youtube_dl(idx):
cmd = f'youtube-dl -f best -f mp4 "{idx}" -o {idx}.mp4'
os.system(cmd)
def run_command(cmd):
if isinstance(cmd, str):
cmd = cmd.split()
return subprocess.check_output(cmd).decode()
def load_env():
import logging
logging.basicConfig(
format='[%(asctime)s] %(levelname)s - %(filename)s: %(funcName)s - %(lineno)d: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S')
try:
import vlmeval
except ImportError:
logging.error('VLMEval is not installed. Failed to import environment variables from .env file. ')
return
pth = osp.realpath(vlmeval.__path__[0])
pth = osp.join(pth, '../.env')
pth = osp.realpath(pth)
if not osp.exists(pth):
logging.error(f'Did not detect the .env file at {pth}, failed to load. ')
return
from dotenv import dotenv_values
values = dotenv_values(pth)
for k, v in values.items():
if v is not None and len(v):
os.environ[k] = v
logging.info(f'API Keys successfully loaded from {pth}')
def pip_install_robust(package):
import sys
retry = 3
while retry > 0:
try:
package_base = package.split('=')[0]
module = __import__(package_base)  # import by the bare name, without any version pin
return True
except ImportError:
subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
retry -= 1
return False
def version_cmp(v1, v2, op='eq'):
from packaging import version
import operator
op_func = getattr(operator, op)
return op_func(version.parse(v1), version.parse(v2))
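# Usage sketch: `op` is mapped onto the `operator` module, so any of
# eq / ne / lt / le / gt / ge works:
#
#   >>> version_cmp('4.37.0', '4.33.0', 'ge')
#   True
#   >>> version_cmp('0.9.1', '1.0.0', 'lt')
#   True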
def toliststr(s):
if isinstance(s, str) and (s[0] == '[') and (s[-1] == ']'):
return [str(x) for x in eval(s)]
elif isinstance(s, str):
return [s]
elif isinstance(s, list):
return [str(x) for x in s]
raise NotImplementedError
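# Usage sketch: `toliststr` normalizes image / image_path fields into a list of strings:
#
#   >>> toliststr("['a.jpg', 'b.jpg']")   # stringified list
#   ['a.jpg', 'b.jpg']
#   >>> toliststr('a.jpg')
#   ['a.jpg']
#   >>> toliststr(['a.jpg', 1])
#   ['a.jpg', '1']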
def extract_json_objects(text, decoder=JSONDecoder()):
pos = 0
while True:
match = text.find('{', pos)
if match == -1: break
try:
result, index = decoder.raw_decode(text[match:])
yield result
pos = match + index
except ValueError:
pos = match + 1
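# Usage sketch: `extract_json_objects` is a generator that scans free-form text
# (e.g. a model response) for embedded JSON objects:
#
#   >>> list(extract_json_objects('The answer is {"option": "A"}, as required.'))
#   [{'option': 'A'}]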
def get_gpu_memory():
import subprocess
try:
command = "rocm-smi --showmeminfo vram"
output = subprocess.check_output(command.split(), stderr=subprocess.STDOUT)
memory_info = output.decode('ascii').split('\n')
memory_free_values = []
for line in memory_info:
if "vram Total Memory" in line:
total_memory = int(line.split(":")[-1].strip().split()[0])  # total VRAM reported by rocm-smi
elif "vram Total Used Memory" in line:
used_memory = int(line.split(":")[-1].strip().split()[0])  # VRAM currently in use
free_memory = total_memory - used_memory  # remaining free VRAM
memory_free_values.append(free_memory)
return memory_free_values
#memory_free_info = subprocess.check_output(command.split()).decode('ascii').split('\n')[:-1][1:]
#memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
#return memory_free_values
except Exception as e:
print(f'{type(e)}: {str(e)}')
return []
def auto_split_flag():
flag = os.environ.get('AUTO_SPLIT', '0')
return flag == '1'
import os
import io
import pandas as pd
import numpy as np
import string
from uuid import uuid4
import os.path as osp
import base64
from PIL import Image
import sys
Image.MAX_IMAGE_PIXELS = 1e9
def rescale_img(img, tgt=None):
assert isinstance(tgt, tuple) and -1 in tgt
w, h = img.size
if tgt[0] != -1:
new_w, new_h = tgt[0], int(tgt[0] / w * h)
elif tgt[1] != -1:
new_w, new_h = int(tgt[1] / h * w), tgt[1]
img = img.resize((new_w, new_h))
return img
def concat_images_vlmeval(images, target_size=-1, mode='h', return_image=False):
from .file import md5
ims = [Image.open(im) for im in images]
if target_size != -1:
ims = [
rescale_img(im, (-1, target_size) if mode == 'h' else (target_size, -1))
for im in ims
]
ws, hs = [x.width for x in ims], [x.height for x in ims]
if mode == 'h':
new_w, new_h = sum(ws), max(hs)
dst = Image.new('RGB', (new_w, new_h))
for i, im in enumerate(ims):
dst.paste(im, (sum(ws[:i]), 0))
elif mode == 'v':
new_w, new_h = max(ws), sum(hs)
dst = Image.new('RGB', (new_w, new_h))
for i, im in enumerate(ims):
dst.paste(im, (0, sum(hs[:i])))  # offset along the vertical axis
if return_image:
return dst
else:
_str = '\n'.join(images)
str_md5 = md5(_str)
tgt = osp.join('/tmp', str_md5 + '.jpg')
dst.save(tgt)
return tgt
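# Usage sketch: stitch images horizontally (mode='h') or vertically (mode='v');
# by default the result is written to /tmp/<md5>.jpg and the path is returned
# (file names below are hypothetical):
#
#   >>> pth = concat_images_vlmeval(['a.jpg', 'b.jpg'], target_size=512, mode='h')
#   >>> Image.open(pth).height   # every image is rescaled to the target height first
#   512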
def mmqa_display(question, target_size=512):
question = {k.lower(): v for k, v in question.items()}
keys = list(question.keys())
keys = [k for k in keys if k not in ['index', 'image']]
images = question['image']
if isinstance(images, str):
images = [images]
idx = question.pop('index', 'XXX')
print(f'INDEX: {idx}')
for im in images:
image = decode_base64_to_image(im, target_size=target_size)
display(image) # noqa: F821
for k in keys:
try:
if not pd.isna(question[k]):
print(f'{k.upper()}. {question[k]}')
except ValueError:
if False in pd.isna(question[k]):
print(f'{k.upper()}. {question[k]}')
def encode_image_to_base64(img, target_size=-1, fmt='JPEG'):
# if target_size == -1, will not do resizing
# else, will set the max size to (target_size, target_size)
if img.mode in ('RGBA', 'P'):
img = img.convert('RGB')
if target_size > 0:
img.thumbnail((target_size, target_size))
img_buffer = io.BytesIO()
img.save(img_buffer, format=fmt)
image_data = img_buffer.getvalue()
ret = base64.b64encode(image_data).decode('utf-8')
return ret
def encode_image_file_to_base64(image_path, target_size=-1):
image = Image.open(image_path)
return encode_image_to_base64(image, target_size=target_size)
def decode_base64_to_image(base64_string, target_size=-1):
image_data = base64.b64decode(base64_string)
image = Image.open(io.BytesIO(image_data))
if image.mode in ('RGBA', 'P'):
image = image.convert('RGB')
if target_size > 0:
image.thumbnail((target_size, target_size))
return image
def decode_base64_to_image_file(base64_string, image_path, target_size=-1):
image = decode_base64_to_image(base64_string, target_size=target_size)
image.save(image_path)
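# Usage sketch: round-trip an image through base64 (paths are hypothetical);
# `target_size` caps the longer side via PIL's thumbnail:
#
#   >>> b64 = encode_image_file_to_base64('demo.jpg', target_size=512)
#   >>> decode_base64_to_image_file(b64, 'demo_copy.jpg')
#   >>> max(Image.open('demo_copy.jpg').size) <= 512
#   True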
def build_option_str(option_dict):
s = 'There are several options: \n'
for c, content in option_dict.items():
if not pd.isna(content):
s += f'{c}. {content}\n'
return s
def isimg(s):
return osp.exists(s) or s.startswith('http')
def read_ok(img_path):
if not osp.exists(img_path):
return False
try:
im = Image.open(img_path)
assert im.size[0] > 0 and im.size[1] > 0
return True
except:
return False
def gpt_key_set():
openai_key = os.environ.get('OPENAI_API_KEY', None)
return isinstance(openai_key, str) and openai_key.startswith('sk-')
def apiok(wrapper):
s = wrapper.generate('Hello!')
return wrapper.fail_msg not in s
def circular_pred(df, extract_func=None):
if extract_func is None:
extract_func = lambda x: x # noqa: E731
df = df.sort_values('index')
from vlmeval.utils import can_infer_option
shift = int(1e6)
choices = [extract_func(x) for x in df['prediction']]
pred_map = {i: c for i, c in zip(df['index'], choices)}
flag_map = {i: True for i in pred_map if i < 1e6}
valid_map = {i: True for i in pred_map if i < 1e6}
for i in df['index']:
if i >= shift and pred_map[i] and pred_map[i - shift]:
if pred_map[i] not in list(
string.ascii_uppercase
) or pred_map[ # noqa: W504
i - shift
] not in list(
string.ascii_uppercase
):
valid_map[i % shift] = False
continue
if (ord(pred_map[i]) - ord(pred_map[i - shift])) % 4 == 1:
continue
else:
flag_map[i % shift] = False
flag_map = {k: v for k, v in flag_map.items() if valid_map[k]}
flags = list(flag_map.values())
return np.mean(flags)
import sys
from vlmeval.dataset import SUPPORTED_DATASETS
from vlmeval.config import *
from vlmeval.smp import *
# Define valid modes
MODES = ('dlist', 'mlist', 'missing', 'circular', 'localize', 'check', 'run', 'eval')
CLI_HELP_MSG = \
f"""
Arguments received: {str(['vlmutil'] + sys.argv[1:])}. vlmutil commands use the following syntax:
vlmutil MODE MODE_ARGS
Where MODE (required) is one of {MODES}
MODE_ARG (optional) is the argument for specific mode
Some example usages of vlmutil commands: (See more by using -h for a specific command!)
1. List all the datasets at a given level: l1, l2, l3, etc.:
vlmutil dlist [l1/l2/l3/...]
2. List all the models in a given category: 4.33.0, 4.37.0, api, etc.:
vlmutil mlist 4.33.0 [all/small/large]
3. Report missing results:
vlmutil missing [l1/l2/l3/...]
4. Create circular questions (only for multiple-choice questions with no more than 4 choices):
vlmutil circular input.tsv
5. Create a localized version of the dataset (for very large tsv files):
vlmutil localize input.tsv
6. Check the validity of a model:
vlmutil check [model_name/model_series]
7. Run evaluation for missing results:
vlmutil run l2 hf
8. Evaluate data file:
vlmutil eval [dataset_name] [prediction_file]
GitHub: https://github.com/open-compass/VLMEvalKit
""" # noqa: E501
dataset_levels = {
'l1': [
('MMVet', 'gpt-4-turbo_score.csv'), ('MMMU_DEV_VAL', 'acc.csv'),
('MathVista_MINI', 'gpt-4-turbo_score.csv'), ('HallusionBench', 'score.csv'),
('OCRBench', 'score.json'), ('AI2D_TEST', 'acc.csv'), ('MMStar', 'acc.csv'),
('MMBench_V11', 'acc.csv'), ('MMBench_CN_V11', 'acc.csv')
],
'l2': [
('MME', 'score.csv'), ('LLaVABench', 'score.csv'), ('RealWorldQA', 'acc.csv'),
('MMBench', 'acc.csv'), ('MMBench_CN', 'acc.csv'), ('CCBench', 'acc.csv'),
('SEEDBench_IMG', 'acc.csv'), ('COCO_VAL', 'score.json'), ('POPE', 'score.csv'),
('ScienceQA_VAL', 'acc.csv'), ('ScienceQA_TEST', 'acc.csv'), ('MMT-Bench_VAL', 'acc.csv'),
('SEEDBench2_Plus', 'acc.csv'), ('BLINK', 'acc.csv'), ('MTVQA_TEST', 'acc.json'),
('Q-Bench1_VAL', 'acc.csv'), ('A-Bench_VAL', 'acc.csv'), ('R-Bench-Dis', 'acc.csv'),
('MathVision', 'score.csv'), ('MathVerse_MINI_Vision_Only', 'score.csv'), ('DynaMath', 'score.csv'),
],
'l3': [
('OCRVQA_TESTCORE', 'acc.csv'), ('TextVQA_VAL', 'acc.csv'),
('ChartQA_TEST', 'acc.csv'), ('DocVQA_VAL', 'acc.csv'), ('InfoVQA_VAL', 'acc.csv'),
('SEEDBench2', 'acc.csv')
]
}
dataset_levels['l12'] = dataset_levels['l1'] + dataset_levels['l2']
dataset_levels['l23'] = dataset_levels['l2'] + dataset_levels['l3']
dataset_levels['l123'] = dataset_levels['l12'] + dataset_levels['l3']
models = {
'4.33.0': list(qwen_series) + list(xcomposer_series) + [
'mPLUG-Owl2', 'flamingov2', 'VisualGLM_6b', 'MMAlaya', 'PandaGPT_13B', 'VXVERSE'
] + list(idefics_series) + list(minigpt4_series) + list(instructblip_series),
'4.37.0': [x for x in llava_series if 'next' not in x] + list(internvl_series) + [
'TransCore_M', 'emu2_chat', 'MiniCPM-V', 'MiniCPM-V-2', 'OmniLMM_12B',
'cogvlm-grounding-generalist', 'cogvlm-chat', 'cogvlm2-llama3-chat-19B',
'mPLUG-Owl3'
] + list(xtuner_series) + list(yivl_series) + list(deepseekvl_series) + list(janus_series) + list(cambrian_series),
'4.36.2': ['Moondream1'],
'4.40.0': [
'idefics2_8b', 'Bunny-llama3-8B', 'MiniCPM-Llama3-V-2_5', '360VL-70B', 'Phi-3-Vision',
] + list(wemm_series),
'4.44.0': ['Moondream2'],
'4.45.0': ['Aria'],
'latest': ['paligemma-3b-mix-448', 'MiniCPM-V-2_6', 'glm-4v-9b'] + [x for x in llava_series if 'next' in x]
+ list(chameleon_series) + list(ovis_series) + list(mantis_series),
'api': list(api_models)
}
# SKIP_MODELS will be skipped in report_missing and run APIs
SKIP_MODELS = [
'MGM_7B', 'GPT4V_HIGH', 'GPT4V', 'flamingov2', 'PandaGPT_13B',
'GeminiProVision', 'Step1V-0701', 'SenseChat-5-Vision',
'llava_v1_7b', 'sharegpt4v_7b', 'sharegpt4v_13b',
'llava-v1.5-7b-xtuner', 'llava-v1.5-13b-xtuner',
'cogvlm-grounding-generalist', 'InternVL-Chat-V1-1',
'InternVL-Chat-V1-2', 'InternVL-Chat-V1-2-Plus', 'RekaCore',
'llava_next_72b', 'llava_next_110b', 'MiniCPM-V', 'sharecaptioner', 'XComposer',
'VisualGLM_6b', 'idefics_9b_instruct', 'idefics_80b_instruct',
'mPLUG-Owl2', 'MMAlaya', 'OmniLMM_12B', 'emu2_chat', 'VXVERSE'
] + list(minigpt4_series) + list(instructblip_series) + list(xtuner_series) + list(chameleon_series) + list(vila_series)
LARGE_MODELS = [
'idefics_80b_instruct', '360VL-70B', 'emu2_chat', 'InternVL2-76B',
]
def completed(m, d, suf):
score_file = f'outputs/{m}/{m}_{d}_{suf}'
if osp.exists(score_file):
return True
if d == 'MMBench':
s1, s2 = f'outputs/{m}/{m}_MMBench_DEV_EN_{suf}', f'outputs/{m}/{m}_MMBench_TEST_EN_{suf}'
return osp.exists(s1) and osp.exists(s2)
elif d == 'MMBench_CN':
s1, s2 = f'outputs/{m}/{m}_MMBench_DEV_CN_{suf}', f'outputs/{m}/{m}_MMBench_TEST_CN_{suf}'
return osp.exists(s1) and osp.exists(s2)
return False
def DLIST(lvl):
if lvl in dataset_levels.keys():
return [x[0] for x in dataset_levels[lvl]]
else:
from vlmeval.dataset import SUPPORTED_DATASETS
return SUPPORTED_DATASETS
def MLIST(lvl, size='all'):
if lvl == 'all':
from vlmeval.config import supported_VLM
return [x for x in supported_VLM]
model_list = models[lvl]
if size == 'small':
model_list = [m for m in model_list if m not in LARGE_MODELS]
elif size == 'large':
model_list = [m for m in model_list if m in LARGE_MODELS]
return model_list
def MISSING(lvl):
from vlmeval.config import supported_VLM
models = list(supported_VLM)
models = [m for m in models if m not in SKIP_MODELS and osp.exists(osp.join('outputs', m))]
if lvl in dataset_levels.keys():
data_list = dataset_levels[lvl]
else:
data_list = [(D, suff) for (D, suff) in dataset_levels['l123'] if D == lvl]
missing_list = []
for f in models:
for D, suff in data_list:
if not completed(f, D, suff):
missing_list.append((f, D))
return missing_list
def CIRCULAR(inp):
assert inp.endswith('.tsv')
data = load(inp)
OFFSET = 1e6
while max(data['index']) >= OFFSET:
OFFSET *= 10
assert 'E' not in data, 'Currently build_circular only works for up to 4-choice questions'
data_2c = data[pd.isna(data['C'])]
data_3c = data[~pd.isna(data['C']) & pd.isna(data['D'])]
data_4c = data[~pd.isna(data['D'])]
map_2c = [('AB', 'BA')]
map_3c = [('ABC', 'BCA'), ('ABC', 'CAB')]
map_4c = [('ABCD', 'BCDA'), ('ABCD', 'CDAB'), ('ABCD', 'DABC')]
def okn(o, n=4):
ostr = o.replace(',', ' ')
osplits = ostr.split()
if sum([c in osplits for c in string.ascii_uppercase[:n - 1]]) == n - 1:
return False
olower = o.lower()
olower = olower.replace(',', ' ')
olower_splits = olower.split()
if 'all' in olower_splits or 'none' in olower_splits:
return False
return True
yay4, nay4 = [], []
lt4 = len(data_4c)
for i in range(lt4):
if okn(data_4c.iloc[i]['D'], 4):
yay4.append(i)
else:
nay4.append(i)
data_4c_y = data_4c.iloc[yay4]
data_4c_n = data_4c.iloc[nay4]
data_3c = pd.concat([data_4c_n, data_3c])
yay3, nay3 = [], []
lt3 = len(data_3c)
for i in range(lt3):
if okn(data_3c.iloc[i]['C'], 3):
yay3.append(i)
else:
nay3.append(i)
data_3c_y = data_3c.iloc[yay3]
data_3c_n = data_3c.iloc[nay3]
data_2c = pd.concat([data_3c_n, data_2c])
def remap(data_in, tup, off):
off = int(off)
data = data_in.copy()
char_map = {k: v for k, v in zip(*tup)}
idx = data.pop('index')
answer = data.pop('answer')
answer_new = [char_map[x] if x in char_map else x for x in answer]
data['answer'] = answer_new
options = {}
for c in char_map:
options[char_map[c]] = data.pop(c)
for c in options:
data[c] = options[c]
data.pop('image')
data['image'] = idx
idx = [x + off for x in idx]
data['index'] = idx
return data
data_all = pd.concat([
data_2c,
data_3c_y,
data_4c_y,
remap(data_2c, map_2c[0], OFFSET),
remap(data_3c_y, map_3c[0], OFFSET),
remap(data_4c_y, map_4c[0], OFFSET),
remap(data_3c_y, map_3c[1], OFFSET * 2),
remap(data_4c_y, map_4c[1], OFFSET * 2),
remap(data_4c_y, map_4c[2], OFFSET * 3),
])
tgt_file = inp.replace('.tsv', '_CIRC.tsv')
dump(data_all, tgt_file)
print(f'The circularized data is saved to {tgt_file}')
assert osp.exists(tgt_file)
print(f'The MD5 for the circularized data is {md5(tgt_file)}')
PTH = osp.realpath(__file__)
IMAGE_PTH = osp.join(osp.dirname(PTH), '../assets/apple.jpg')
msg1 = [
IMAGE_PTH,
'What is in this image?'
]
msg2 = [
dict(type='image', value=IMAGE_PTH),
dict(type='text', value='What is in this image?')
]
msg3 = [
IMAGE_PTH,
IMAGE_PTH,
'How many apples are there in these images?'
]
msg4 = [
dict(type='image', value=IMAGE_PTH),
dict(type='image', value=IMAGE_PTH),
dict(type='text', value='How many apples are there in these images?')
]
def CHECK(val):
if val in supported_VLM:
model = supported_VLM[val]()
print(f'Model: {val}')
for i, msg in enumerate([msg1, msg2, msg3, msg4]):
if i > 1 and not model.INTERLEAVE:
continue
res = model.generate(msg)
print(f'Test {i + 1}: {res}')
elif val in models:
model_list = models[val]
for m in model_list:
CHECK(m)
def LOCALIZE(fname, new_fname=None):
if new_fname is None:
new_fname = fname.replace('.tsv', '_local.tsv')
base_name = osp.basename(fname)
dname = osp.splitext(base_name)[0]
data = load(fname)
data_new = localize_df(data, dname)
dump(data_new, new_fname)
print(f'The localized version of the data file is {new_fname}')
return new_fname
def RUN(lvl, model):
import torch
NGPU = torch.cuda.device_count()
SCRIPT = osp.join(osp.dirname(__file__), '../run.py')
logger = get_logger('Run Missing')
def get_env(name):
assert name in ['433', '437', '440', 'latest']
load_env()
env_key = f'ENV_{name}'
return os.environ.get(env_key, None)
missing = MISSING(lvl)
if model == 'all':
pass
elif model == 'api':
missing = [x for x in missing if x[0] in models['api']]
elif model == 'hf':
missing = [x for x in missing if x[0] not in models['api']]
elif model in models:
missing = [x for x in missing if x[0] in models[model]]
elif model in supported_VLM:
missing = [x for x in missing if x[0] == model]
else:
warnings.warn(f'Invalid model {model}.')
missing.sort(key=lambda x: x[0])
groups = defaultdict(list)
for m, D in missing:
groups[m].append(D)
for m in groups:
if m in SKIP_MODELS:
continue
for dataset in groups[m]:
logger.info(f'Running {m} on {dataset}')
exe = 'python' if m in LARGE_MODELS or m in models['api'] else 'torchrun'
if m not in models['api']:
env = None
env = 'latest' if m in models['latest'] else env
env = '433' if m in models['4.33.0'] else env
env = '437' if m in models['4.37.0'] else env
env = '440' if m in models['4.40.0'] else env
if env is None:
# Not found, default to latest
env = 'latest'
logger.warning(
f"Model {m} does not have a specific environment configuration. Defaulting to 'latest'.")
pth = get_env(env)
if pth is not None:
exe = osp.join(pth, 'bin', exe)
else:
logger.warning(f'Cannot find the env path {env} for model {m}')
if exe.endswith('torchrun'):
cmd = f'{exe} --nproc-per-node={NGPU} {SCRIPT} --model {m} --data {dataset}'
elif exe.endswith('python'):
cmd = f'{exe} {SCRIPT} --model {m} --data {dataset}'
os.system(cmd)
def EVAL(dataset_name, data_file, **kwargs):
from vlmeval.dataset import build_dataset
logger = get_logger('VLMEvalKit Tool-Eval')
dataset = build_dataset(dataset_name)
# Set the judge kwargs first before evaluation or dumping
judge_kwargs = {'nproc': 4, 'verbose': True}
if 'model' not in kwargs:
if dataset.TYPE in ['MCQ', 'Y/N']:
judge_kwargs['model'] = 'chatgpt-0125'
elif listinstr(['MMVet', 'LLaVABench', 'MMBench-Video'], dataset_name):
judge_kwargs['model'] = 'gpt-4-turbo'
elif listinstr(['MMLongBench', 'MMDU'], dataset_name):
judge_kwargs['model'] = 'gpt-4o'
elif listinstr(['DynaMath', 'MathVerse', 'MathVista', 'MathVision'], dataset_name):
judge_kwargs['model'] = 'gpt-4o-mini'
else:
judge_kwargs['model'] = kwargs['model']
judge_kwargs['nproc'] = kwargs.get('nproc', 4)
eval_results = dataset.evaluate(data_file, **judge_kwargs)
if eval_results is not None:
assert isinstance(eval_results, dict) or isinstance(eval_results, pd.DataFrame)
logger.info('Evaluation Results:')
if isinstance(eval_results, dict):
logger.info('\n' + json.dumps(eval_results, indent=4))
elif isinstance(eval_results, pd.DataFrame):
logger.info('\n')
logger.info(tabulate(eval_results.T) if len(eval_results) < len(eval_results.columns) else eval_results)
return eval_results
def parse_args_eval():
parser = argparse.ArgumentParser()
# Essential Args, Setting the Names of Datasets and Models
parser.add_argument('cmd', type=str)
parser.add_argument('data_file', type=str)
parser.add_argument('--judge', type=str, default=None)
parser.add_argument('--nproc', type=int, default=4)
parser.add_argument('--retry', type=int, default=None)
args = parser.parse_args()
return args
def cli():
logger = get_logger('VLMEvalKit Tools')
args = sys.argv[1:]
if not args: # no arguments passed
logger.info(CLI_HELP_MSG)
return
if args[0].lower() in MODES:
if args[0].lower() == 'dlist':
assert len(args) >= 2
lst = DLIST(args[1])
print(' '.join(lst))
elif args[0].lower() == 'mlist':
assert len(args) >= 2
size = 'all'
if len(args) > 2:
size = args[2].lower()
lst = MLIST(args[1], size)
print('\n'.join(lst))
elif args[0].lower() == 'missing':
assert len(args) >= 2
missing_list = MISSING(args[1])
logger = get_logger('Find Missing')
logger.info(colored(f'Level {args[1]} Missing Results: ', 'red'))
lines = []
for m, D in missing_list:
line = f'Model {m}, Dataset {D}'
logger.info(colored(line, 'red'))
lines.append(line)
mwlines(lines, f'{args[1]}_missing.txt')
elif args[0].lower() == 'circular':
assert len(args) >= 2
CIRCULAR(args[1])
elif args[0].lower() == 'localize':
assert len(args) >= 2
LOCALIZE(args[1])
elif args[0].lower() == 'check':
assert len(args) >= 2
model_list = args[1:]
for m in model_list:
CHECK(m)
elif args[0].lower() == 'run':
assert len(args) >= 2
lvl = args[1]
if len(args) == 2:
model = 'all'
RUN(lvl, model)
else:
for model in args[2:]:
RUN(lvl, model)
elif args[0].lower() == 'eval':
args = parse_args_eval()
data_file = args.data_file
def extract_dataset(file_name):
fname = osp.splitext(file_name)[0].split('/')[-1]
parts = fname.split('_')
for i in range(len(parts)):
if '_'.join(parts[i:]) in SUPPORTED_DATASETS:
return '_'.join(parts[i:])
return None
dataset = extract_dataset(data_file)
assert dataset is not None, f'Cannot infer dataset name from {data_file}'
kwargs = {'nproc': args.nproc}
if args.judge is not None:
kwargs['model'] = args.judge
if args.retry is not None:
kwargs['retry'] = args.retry
EVAL(dataset_name=dataset, data_file=data_file, **kwargs)
else:
logger.error('WARNING: command error!')
logger.info(CLI_HELP_MSG)
return
from .matching_util import can_infer, can_infer_option, can_infer_text
from .mp_util import track_progress_rich
__all__ = [
'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich',
]
import string
import copy as cp
import os
from ..smp import *
def can_infer_option(answer, choices):
verbose = os.environ.get('VERBOSE', 0)
# Choices is a dictionary
if 'Failed to obtain answer via API' in answer:
return False
reject_to_answer = [
"Sorry, I can't help with images of people yet.",
"I can't process this file.",
"I'm sorry, but without the image provided",
'Cannot determine the answer'
]
for err in reject_to_answer:
if err in answer:
return 'Z'
def count_choice(splits, choices, prefix='', suffix=''):
cnt = 0
for c in choices:
if prefix + c + suffix in splits:
cnt += 1
return cnt
answer_mod = cp.copy(answer)
chars = '.()[],:;!*#{}'
for c in chars:
answer_mod = answer_mod.replace(c, ' ')
splits = [x.strip() for x in answer_mod.split()]
count = count_choice(splits, choices)
if count == 1:
for ch in choices:
if 'A' in splits and len(splits) > 3 and verbose:
logger = get_logger('Evaluation')
logger.info(f'A might be a quantifier in the string: {answer}.')
return False
if ch in splits:
return ch
elif count == 0 and count_choice(splits, {'Z', ''}) == 1:
return 'Z'
return False
def can_infer_text(answer, choices):
answer = answer.lower()
assert isinstance(choices, dict)
for k in choices:
assert k in string.ascii_uppercase
choices[k] = str(choices[k]).lower()
cands = []
for k in choices:
if choices[k] in answer:
cands.append(k)
if len(cands) == 1:
return cands[0]
return False
def can_infer(answer, choices):
answer = str(answer)
copt = can_infer_option(answer, choices)
return copt if copt else can_infer_text(answer, choices)
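# Usage sketch: `can_infer` first tries option-letter matching, then falls back to
# matching the option text; it returns the inferred letter, or False when nothing
# can be inferred:
#
#   >>> choices = {'A': 'cat', 'B': 'dog', 'C': 'bird'}
#   >>> can_infer('The answer is B.', choices)
#   'B'
#   >>> can_infer('It is a dog.', choices)
#   'B'
#   >>> can_infer('Not sure.', choices)
#   False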
from multiprocessing import Pool
import os
from typing import Callable, Iterable, Sized
from rich.progress import (BarColumn, MofNCompleteColumn, Progress, Task,
TaskProgressColumn, TextColumn, TimeRemainingColumn)
from rich.text import Text
import os.path as osp
import time
import portalocker
from ..smp import load, dump
def track_progress_rich(
func: Callable,
tasks: Iterable = tuple(),
nproc: int = 1,
save=None,
keys=None,
**kwargs) -> list:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
if save is not None:
assert osp.exists(osp.dirname(save)) or osp.dirname(save) == ''
if not osp.exists(save):
dump({}, save)
if keys is not None:
assert len(keys) == len(tasks)
if not callable(func):
raise TypeError('func must be a callable object')
if not isinstance(tasks, Iterable):
raise TypeError(
f'tasks must be an iterable object, but got {type(tasks)}')
assert nproc > 0, 'nproc must be a positive number'
res = load(save) if save is not None else {}
results = [None for _ in range(len(tasks))]
with ThreadPoolExecutor(max_workers=nproc) as executor:
futures = []
for inputs in tasks:
if not isinstance(inputs, (tuple, list, dict)):
inputs = (inputs, )
if isinstance(inputs, dict):
future = executor.submit(func, **inputs)
else:
future = executor.submit(func, *inputs)
futures.append(future)
unfinished = set(range(len(tasks)))
pbar = tqdm(total=len(unfinished))
while len(unfinished):
new_finished = set()
for idx in unfinished:
if futures[idx].done():
results[idx] = futures[idx].result()
new_finished.add(idx)
if keys is not None:
res[keys[idx]] = results[idx]
if len(new_finished):
if save is not None:
dump(res, save)
pbar.update(len(new_finished))
for k in new_finished:
unfinished.remove(k)
time.sleep(0.1)
pbar.close()
if save is not None:
dump(res, save)
return results
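# Usage sketch: run `func` over a list of argument tuples with a thread pool,
# optionally checkpointing results to a pickle file keyed by `keys`
# (the save path is hypothetical):
#
#   >>> def add(a, b):
#   ...     return a + b
#   >>> track_progress_rich(add, [(1, 2), (3, 4)], nproc=2,
#   ...                     save='/tmp/partial.pkl', keys=['x', 'y'])
#   [3, 7]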
from ..smp import *
from ..dataset.utils.judge_util import build_judge
from ..dataset.utils.multiple_choice import extract_answer_from_item
from .matching_util import can_infer
from .mp_util import track_progress_rich
def MMMU_result_transfer(result_path):
res = {}
result_data = load(result_path)
mcq = result_data['A'].notna()
lt = len(result_data)
for i in range(lt):
line = result_data.iloc[i]
if mcq[i]:
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
prediction = line['prediction']
infer_prediction = can_infer(prediction, options)
res[line['id']] = infer_prediction
else:
res[line['id']] = line['prediction']
result_json = result_path.replace('.xlsx', '.json')
dump(res, result_json)
return result_json
def MMTBench_result_transfer(eval_file, dataset='default', **judge_kwargs):
logger = get_logger('Evaluation')
nproc = judge_kwargs.pop('nproc', 4)
rd.seed(2680)
suffix = eval_file.split('.')[-1]
model = judge_kwargs['model']
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
name_str_map = {
'chatgpt-0125': 'openai',
'gpt-4-0125': 'gpt4'
}
name_str = name_str_map[model] if model in name_str_map else model
if model == 'exact_matching':
model = None
elif gpt_key_set():
model = build_judge(**judge_kwargs)
if not model.working():
logger.error('The OPENAI API is not working properly, will use exact matching for evaluation')
model = None
else:
logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
model = None
logger.info(f'Evaluating {eval_file}')
result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_option.pkl')
result = {}
if osp.exists(result_file):
result = load(result_file)
data = load(eval_file)
assert 'index' in data, 'Essential columns missing in the eval_file.'
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
for k in data.keys():
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
idx2lines = {data.iloc[i]['index']: data.iloc[i] for i in range(len(data))}
idx2lines = {k: v for k, v in idx2lines.items() if k not in result}
indices = list(idx2lines.keys())
lines = [idx2lines[i] for i in indices]
tups = [(model, line) for line in lines]
res = track_progress_rich(
extract_answer_from_item,
tups,
nproc=nproc,
chunksize=nproc,
save=result_file,
keys=indices)
for i, r in zip(indices, res):
if i in result:
assert result[i]['opt'] == r['opt'] and result[i]['log'] == r['log']
else:
result[i] = r
indices = list(data['index'])
data['opt'] = [result[i]['opt'] for i in data['index']]
data['log'] = [result[i]['log'] for i in data['index']]
# load split
output_path = eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv')
dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv'))
return output_path
import torch
torch.set_grad_enabled(False)
torch.manual_seed(1234)
from .aria import Aria
from .base import BaseModel
from .cogvlm import CogVlm, GLM4v
from .emu import Emu
from .eagle_x import Eagle
from .idefics import IDEFICS, IDEFICS2
from .instructblip import InstructBLIP
from .kosmos import Kosmos2
from .llava import LLaVA, LLaVA_Next, LLaVA_XTuner, LLaVA_Next2, LLaVA_OneVision, LLaVA_OneVision_HF
from .minicpm_v import MiniCPM_V, MiniCPM_Llama3_V, MiniCPM_V_2_6
from .minigpt4 import MiniGPT4
from .mmalaya import MMAlaya, MMAlaya2
from .monkey import Monkey, MonkeyChat
from .moondream import Moondream1, Moondream2
from .minimonkey import MiniMonkey
from .mplug_owl2 import mPLUG_Owl2
from .omnilmm import OmniLMM12B
from .open_flamingo import OpenFlamingo
from .pandagpt import PandaGPT
from .qwen_vl import QwenVL, QwenVLChat
from .qwen2_vl import Qwen2VLChat
from .transcore_m import TransCoreM
from .visualglm import VisualGLM
from .xcomposer import ShareCaptioner, XComposer, XComposer2, XComposer2_4KHD, XComposer2d5
from .yi_vl import Yi_VL
from .internvl import InternVLChat
from .deepseek_vl import DeepSeekVL
from .janus import Janus
from .mgm import Mini_Gemini
from .bunnyllama3 import BunnyLLama3
from .vxverse import VXVERSE
from .paligemma import PaliGemma
from .qh_360vl import QH_360VL
from .phi3_vision import Phi3Vision, Phi3_5Vision
from .wemm import WeMM
from .cambrian import Cambrian
from .chameleon import Chameleon
from .video_llm import VideoLLaVA, VideoLLaVA_HF, Chatunivi, VideoChatGPT, LLaMAVID, VideoChat2_HD, PLLaVA
from .vila import VILA
from .ovis import Ovis, Ovis1_6, Ovis1_6_Plus
from .mantis import Mantis
from .mixsense import LLama3Mixsense
from .parrot import Parrot
from .omchat import OmChat
from .rbdash import RBDash
from .xgen_mm import XGenMM
from .slime import SliME
from .mplug_owl3 import mPLUG_Owl3
from .pixtral import Pixtral
from .llama_vision import llama_vision
from .molmo import molmo
from .points import POINTS, POINTSV15
from .nvlm import NVLM
from .vintern_chat import VinternChat
from .h2ovl_mississippi import H2OVLChat
from .falcon_vlm import Falcon2VLM
from .smolvlm import SmolVLM
from .sail_vl import SailVL
from .valley import ValleyEagleChat
import torch
import warnings
import copy as cp
from PIL import Image
import pandas as pd
import string
import re
from .base import BaseModel
from ..smp import isimg, listinstr, cn_string
from ..dataset import DATASET_TYPE, DATASET_MODALITY
class Aria(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='rhymes-ai/Aria', **kwargs):
from transformers import AutoModelForCausalLM, AutoProcessor
assert model_path is not None
self.model_path = model_path
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
tokenizer = processor.tokenizer
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.unk_token_id
self.processor = processor
self.tokenizer = tokenizer
self.model = AutoModelForCausalLM.from_pretrained(
model_path,
device_map='cuda',
torch_dtype=torch.bfloat16,
trust_remote_code=True
).eval()
default_kwargs = dict(
do_sample=False,
num_beams=1,
max_new_tokens=512,
min_new_tokens=1,
num_return_sequences=1,
use_cache=True,
output_hidden_states=True,
pad_token_id=tokenizer.unk_token_id,
stop_strings=["<|im_end|>"],
tokenizer=processor.tokenizer,
)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
torch.cuda.empty_cache()
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
# For multi-turn datasets, we do not use a custom prompt
return False
if DATASET_MODALITY(dataset) == 'VIDEO':
# For video benchmarks, we do not use a custom prompt here
return False
else:
return True
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += (
"\nAnswer with the option's letter from the given choices directly."
)
else:
if listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet', 'MathVerse'], dataset):
prompt = prompt
elif listinstr(['LLaVABench', 'MMBench-Video'], dataset):
prompt += '\nAnswer this question in detail.'
elif listinstr(['DocVQA'], dataset):
prompt += '\nAnswer briefly and directly.'
else:
prompt += '\nAnswer the question using a single word or phrase.'
message = [dict(type='image', value=s) for s in tgt_path]
message.append(dict(type='text', value=prompt))
return message
def build_video_prompt(self, prompt, dataset=None):
if listinstr(['MMBench-Video'], dataset):
prompt = prompt.replace('\nAnswer:', '')
prompt = prompt.replace(
'Question: ',
'Please carefully check the video and then answer the following question with details:'
)
elif listinstr(['Video-MME'], dataset):
prompt = prompt.replace('\nAnswer:', '')
prompt += "\nAnswer with the option's letter from the given choices directly."
elif listinstr(['MVBench'], dataset):
prompt = prompt.replace('Best option:(', '')
system_prompt = 'Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.\n' # noqa: E501
prompt = prompt.replace(system_prompt, '')
return prompt
def adjust_kwargs(self, dataset):
kwargs = cp.deepcopy(self.kwargs)
kwargs["temperature"] = 0.0
kwargs["do_sample"] = False
if DATASET_MODALITY(dataset) == "VIDEO":
kwargs["max_image_size"] = 490
else:
kwargs["max_image_size"] = 980
kwargs["split_image"] = False
if listinstr(['MMMU', 'MMStar', 'Math'], dataset):
# These datasets may elicit CoT-like behaviour from the model,
# so allow longer outputs.
kwargs['max_new_tokens'] = 512
return kwargs
if DATASET_TYPE(dataset) in ['MCQ', 'Y/N']:
kwargs['max_new_tokens'] = 64
elif DATASET_TYPE(dataset) == 'Caption' and 'COCO' in dataset:
kwargs['max_new_tokens'] = 64
elif DATASET_TYPE(dataset) == 'VQA':
if listinstr(['OCRVQA', 'ChartQA', 'DocVQA'], dataset):
kwargs['max_new_tokens'] = 128
elif listinstr(['TextVQA'], dataset):
kwargs['max_new_tokens'] = 32
if listinstr(['OCR', 'ChartQA', 'DocVQA', 'InfoVQA', 'TextVQA'], dataset):
# OCR-related datasets that need image splitting
kwargs["split_image"] = True
return kwargs
def generate_inner(self, message, dataset=None):
if dataset is not None:
kwargs = self.adjust_kwargs(dataset)
else:
kwargs = self.kwargs
max_image_size = kwargs.pop("max_image_size")
split_image = kwargs.pop("split_image")
prompt = '<|im_start|>user\n'
images = []
last_message_modality = "text"
if listinstr(['MLVU', 'TempCompass', 'MVBench'], dataset): # re-arrange the data
new_message = []
for s in message:
if s['type'] == 'image':
new_message.append(s)
for s in message:
if s['type'] == 'text':
new_message.append(s)
message = new_message
for s in message:
if s['type'] == 'image':
prompt += '<fim_prefix><|img|><fim_suffix>'
images.append(s['value'])
last_message_modality = "image"
elif s['type'] == 'text':
text = re.sub(r"<image \d+>", "", s["value"])
if last_message_modality == "image":
prompt += "\n"
last_message_modality = "text"
prompt += text
if DATASET_MODALITY(dataset) == 'VIDEO':
prompt = self.build_video_prompt(prompt, dataset)
prompt += '<|im_end|>\n<|im_start|>assistant\n'
if images:
images = [Image.open(s).convert('RGB') for s in images]
encoded = self.processor(
text=prompt,
images=images,
return_tensors='pt',
padding='longest',
max_image_size=max_image_size,
split_image=split_image,
)
else:
encoded = self.processor(text=prompt, return_tensors='pt', padding='longest')
encoded["pixel_values"] = encoded["pixel_values"].to(self.model.dtype)
encoded = {k: v.to(self.model.device) for k, v in encoded.items()}
pred = self.model.generate(**encoded, **kwargs)
answer = self.tokenizer.decode(pred[0][encoded['input_ids'].size(1):].cpu(), skip_special_tokens=True).strip()
answer = answer.replace('<|im_end|>', '')
return answer
from ..smp import *
from ..dataset import img_root_map, DATASET_TYPE
from abc import abstractmethod
class BaseModel:
INTERLEAVE = False
allowed_types = ['text', 'image', 'video']
def __init__(self):
self.dump_image_func = None
def use_custom_prompt(self, dataset):
"""Whether to use custom prompt for the given dataset.
Args:
dataset (str): The name of the dataset.
Returns:
bool: Whether to use custom prompt. If True, will call `build_prompt` of the VLM to build the prompt.
Default to False.
"""
return False
@abstractmethod
def build_prompt(self, line, dataset):
"""Build custom prompts for a specific dataset. Called only if `use_custom_prompt` returns True.
Args:
line (line of pd.DataFrame): The raw input line.
dataset (str): The name of the dataset.
Returns:
str: The built message.
"""
raise NotImplementedError
def set_dump_image(self, dump_image_func):
self.dump_image_func = dump_image_func
def dump_image(self, line, dataset):
return self.dump_image_func(line)
@abstractmethod
def generate_inner(self, message, dataset=None):
raise NotImplementedError
def check_content(self, msgs):
"""Check the content type of the input. Four types are allowed: str, dict, liststr, listdict.
"""
if isinstance(msgs, str):
return 'str'
if isinstance(msgs, dict):
return 'dict'
if isinstance(msgs, list):
types = [self.check_content(m) for m in msgs]
if all(t == 'str' for t in types):
return 'liststr'
if all(t == 'dict' for t in types):
return 'listdict'
return 'unknown'
def preproc_content(self, inputs):
"""Convert the raw input messages to a list of dicts.
Args:
inputs: raw input messages.
Returns:
list(dict): The preprocessed input messages. Will return None if failed to preprocess the input.
"""
if self.check_content(inputs) == 'str':
return [dict(type='text', value=inputs)]
elif self.check_content(inputs) == 'dict':
assert 'type' in inputs and 'value' in inputs
return [inputs]
elif self.check_content(inputs) == 'liststr':
res = []
for s in inputs:
mime, pth = parse_file(s)
if mime is None or mime == 'unknown':
res.append(dict(type='text', value=s))
else:
res.append(dict(type=mime.split('/')[0], value=pth))
return res
elif self.check_content(inputs) == 'listdict':
for item in inputs:
assert 'type' in item and 'value' in item
mime, s = parse_file(item['value'])
if mime is None:
assert item['type'] == 'text'
else:
assert mime.split('/')[0] == item['type']
item['value'] = s
return inputs
else:
return None
def generate(self, message, dataset=None):
"""Generate the output message.
Args:
message (list[dict]): The input message.
dataset (str, optional): The name of the dataset. Defaults to None.
Returns:
str: The generated message.
"""
assert self.check_content(message) in ['str', 'dict', 'liststr', 'listdict'], f'Invalid input type: {message}'
message = self.preproc_content(message)
assert message is not None and self.check_content(message) == 'listdict'
for item in message:
assert item['type'] in self.allowed_types, f'Invalid input type: {item["type"]}'
return self.generate_inner(message, dataset)
def chat(self, messages, dataset=None):
"""The main function for multi-turn chatting. Will call `chat_inner` with the preprocessed input messages."""
assert hasattr(self, 'chat_inner'), 'The API model should have the `chat_inner` method. '
for msg in messages:
assert isinstance(msg, dict) and 'role' in msg and 'content' in msg, msg
assert self.check_content(msg['content']) in ['str', 'dict', 'liststr', 'listdict'], msg
msg['content'] = self.preproc_content(msg['content'])
while len(messages):
try:
return self.chat_inner(messages, dataset=dataset)
except Exception as e:
logging.info(f'{type(e)}: {e}')
messages = messages[1:]
while len(messages) and messages[0]['role'] != 'user':
messages = messages[1:]
continue
return 'Chat Mode: Failed with all possible conversation turns.'
def message_to_promptimg(self, message, dataset=None):
assert not self.INTERLEAVE
model_name = self.__class__.__name__
warnings.warn(
f'Model {model_name} does not support interleaved input. '
'Will use the first image and aggregated texts as prompt. ')
num_images = len([x for x in message if x['type'] == 'image'])
if num_images == 0:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = None
else:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
images = [x['value'] for x in message if x['type'] == 'image']
if 'BLINK' == dataset:
image = concat_images_vlmeval(images, target_size=512)
else:
image = images[0]
return prompt, image
def message_to_promptvideo(self, message):
if self.VIDEO_LLM:
num_videos = len([x for x in message if x['type'] == 'video'])
if num_videos == 0:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
video = None
else:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
video = [x['value'] for x in message if x['type'] == 'video'][0]
return prompt, video
else:
logging.critical('Model does not support video input.')
raise NotImplementedError
def message_to_promptvideo_withrole(self, message, dataset=None):
if self.VIDEO_LLM:
system, user, assistant, video_list = '', '', '', []
for msg in message:
if msg['type'] == 'text':
if 'role' in msg and msg['role'] == 'system':
system += msg['value']
elif 'role' in msg and msg['role'] == 'assistant':
assistant += msg['value']
else:
user += msg['value']
elif msg['type'] == 'video':
video_list.append(msg['value'])
question = {
'system': system,
'user': user,
'assistant': assistant
}
if assistant == '':
if listinstr(['MCQ'], DATASET_TYPE(dataset)):
question['assistant'] = 'Best Option: ('
else:
del question['assistant']
if len(video_list) > 1:
print('VLMEvalKit only supports a single video as input; using the first video.')
video = video_list[0]
return question, video
else:
logging.critical('Model does not support video input.')
raise NotImplementedError
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings
import re
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
class BunnyLLama3(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='BAAI/Bunny-v1_1-Llama-3-8B-V', **kwargs):
assert model_path is not None
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings('ignore')
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='auto', trust_remote_code=True)
self.kwargs = kwargs
def use_custom_prompt(self, dataset):
if listinstr(['MCQ', 'Y/N'], DATASET_TYPE(dataset)) or listinstr(['mathvista'], dataset.lower()):
return True
else:
return False
def build_prompt(self, line, dataset):
if dataset is None:
dataset = self.dataset
if isinstance(line, int):
line = self.data.iloc[line]
tgt_path = self.dump_image(line, dataset)
prompt = line['question']
if DATASET_TYPE(dataset) == 'MCQ':
if listinstr(['mmmu'], dataset.lower()):
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
assert hint is None
question = line['question']
question = re.sub(r'<image (\d+)>', lambda x: x.group(0)[1:-1], question)
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = '\n'
for key, item in options.items():
options_prompt += f'({key}) {item}\n'
prompt = question
if len(options):
prompt += options_prompt
prompt += "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\nAnswer the question using a single word or phrase.'
else:
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'{hint}\n'
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = '\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
prompt += question + options_prompt
if listinstr(['cn', 'ccbench'], dataset.lower()):
prompt += '请直接回答选项字母。'
else:
prompt += "Answer with the option's letter from the given choices directly."
elif DATASET_TYPE(dataset) == 'Y/N':
if listinstr(['mme'], dataset.lower()):
if not listinstr(
['code_reasoning', 'commonsense_reasoning', 'numerical_calculation', 'text_translation'],
line['category']):
prompt = prompt.replace(' Please answer yes or no.',
'\nAnswer the question using a single word or phrase.')
elif listinstr(['pope'], dataset.lower()):
prompt = prompt.replace(' Please answer yes or no.',
'\nAnswer the question using a single word or phrase.')
elif listinstr(['mathvista'], dataset.lower()):
match = re.search(r'Hint: (.*?)\nQuestion: (.*?)\n(Choices:\n(.*))?', prompt + '\n', re.DOTALL)
prompt = match.group(2)
if match.group(4) is not None:
prompt += '\n' + match.group(4).rstrip('\n')
prompt += '\n' + match.group(1)
else:
raise ValueError(
f"Bunny does not implement a custom prompt for {dataset}; the default prompt should have been used instead.")
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
text = (f'A chat between a curious user and an artificial intelligence assistant. '
f"The assistant gives helpful, detailed, and polite answers to the user's questions. "
f'USER: <image>\n{prompt} ASSISTANT:')
text_chunks = [self.tokenizer(chunk).input_ids for chunk in text.split('<image>')]
input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0)
image = Image.open(image_path).convert('RGB')
image_tensor = self.model.process_images([image], self.model.config).to(dtype=self.model.dtype)
output_ids = self.model.generate(input_ids, images=image_tensor, max_new_tokens=128, use_cache=True)[0]
response = self.tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True)
return response
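# Minimal sketch (tokenizer behavior assumed to match a standard HF Llama
# tokenizer) of the image-token splicing in `generate_inner` above: the prompt
# is split on '<image>', both halves are tokenized, and the placeholder id -200
# is spliced in between so the model can substitute the projected image features.
#
# chunks = 'USER: <image>\nWhat is this? ASSISTANT:'.split('<image>')
# ids = tokenizer(chunks[0]).input_ids + [-200] + tokenizer(chunks[1]).input_ids[1:]
# # `[1:]` drops the leading special token the tokenizer prepends to the second chunk.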
import torch
from PIL import Image
from .base import BaseModel
from ..smp import *
import warnings
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = '<image>'
DEFAULT_IM_START_TOKEN = '<im_start>'
DEFAULT_IM_END_TOKEN = '<im_end>'
class Cambrian(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, model_path='nyu-visionx/cambrian-8b', **kwargs):
assert model_path is not None
try:
from cambrian.conversation import conv_templates, SeparatorStyle
from cambrian.model.builder import load_pretrained_model
from cambrian.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
except Exception as e:
logging.critical('Please install cambrian from https://github.com/cambrian-mllm/cambrian.')
raise e
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(
model_path,
None,
model_name,
device_map=None
)
if '8b' in model_path:
self.conv_mode = 'llama_3'
elif '13b' in model_path:
self.conv_mode = 'vicuna_v1'
else:
self.conv_mode = 'chatml_direct'
self.model_config = model.config
self.conv_templates = conv_templates
self.tokenizer_image_token = tokenizer_image_token
self.process_images = process_images
self.tokenizer = tokenizer
self.image_processor = image_processor
self.model = model.to('cuda')
def process(self, image, question):
if self.model_config.mm_use_im_start_end:
question = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + question
else:
question = DEFAULT_IMAGE_TOKEN + '\n' + question
conv = self.conv_templates[self.conv_mode].copy()
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
image_size = [image.size]
image_tensor = self.process_images([image], self.image_processor, self.model_config)
input_ids = self.tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
input_ids = input_ids.unsqueeze(0).cuda()
return input_ids, image_tensor, image_size, prompt
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
input_ids, image_tensor, image_sizes, prompt = self.process(image, prompt)
input_ids = input_ids.to(device='cuda', non_blocking=True)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
images=image_tensor,
image_sizes=image_sizes,
do_sample=False,
temperature=0,
num_beams=1,
max_new_tokens=512,
use_cache=True
)
outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
return outputs
import os.path as osp
import warnings
from .base import BaseModel
from ..smp import *
from PIL import Image
import torch
class Chameleon(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='facebook/chameleon-7b', **kwargs):
try:
from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
except Exception as e:
logging.critical('Please install the latest transformers.')
raise e
processor = ChameleonProcessor.from_pretrained(model_path)
model = ChameleonForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.bfloat16)
self.model = model.cuda().eval()
self.processor = processor
def generate_inner(self, message, dataset=None):
content, images = '', []
for x in message:
if x['type'] == 'text':
content += x['value']
elif x['type'] == 'image':
content += '<image>\n'
images.append(Image.open(x['value']))
inputs = self.processor(
text=[content],
images=images,
padding=True,
return_tensors='pt'
).to(device='cuda', dtype=torch.bfloat16)
generate_ids = self.model.generate(**inputs, max_new_tokens=512)
input_token_len = inputs.input_ids.shape[1]
text = self.processor.batch_decode(
generate_ids[:, input_token_len:],
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)[0]
return text
import torch
from PIL import Image
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer
class GLM4v(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='THUDM/glm-4v-9b', **kwargs):
assert model_path is not None
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
trust_remote_code=True
).to('cuda').eval()
gen_kwargs = {'max_length': 2048, 'do_sample': False}
gen_kwargs.update(kwargs)
self.kwargs = gen_kwargs
self.end_text_token = '<|endoftext|>'
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
if dataset is not None and DATASET_TYPE(dataset) in ['MCQ', 'Y/N']:
prompt += '\nShort Answer.'
inputs = self.tokenizer.apply_chat_template(
[{'role': 'user', 'image': image, 'content': prompt}],
add_generation_prompt=True, tokenize=True, return_tensors='pt', return_dict=True
)
inputs = inputs.to('cuda')
with torch.no_grad():
outputs = self.model.generate(**inputs, **self.kwargs)
outputs = outputs[:, inputs['input_ids'].shape[1]:]
response = self.tokenizer.decode(outputs[0])
return response.split(self.end_text_token)[0]
class CogVlm(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='THUDM/cogvlm2-llama3-chat-19B', tokenizer_name=None, **kwargs):
assert model_path is not None
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
trust_remote_code=True,
).to('cuda').eval()
self.kwargs = kwargs
if tokenizer_name:
tokenizer = LlamaTokenizer.from_pretrained(tokenizer_name)
gen_kwargs = {'max_length': 2048, 'do_sample': False}
self.end_text_token = '</s>'
else:
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
gen_kwargs = {'max_new_tokens': 2048, 'pad_token_id': 128002}
self.end_text_token = '<|end_of_text|>'
self.kwargs.update(gen_kwargs)
self.tokenizer = tokenizer
self.model = model
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
option_candidate = string.ascii_uppercase
options = {
cand: line[cand]
for cand in option_candidate
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if not cn_string(prompt):
prompt = prompt + '\n' + "Answer with the option's letter from the given choices directly."
else:
prompt = prompt + '\n' + '请直接回答选项字母。'
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=p) for p in tgt_path])
return message
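# Illustrative sketch of the text portion of the MCQ prompt assembled by
# `build_prompt` above (the row contents are assumptions):
#
# line = {'question': 'What colour is the sky?', 'A': 'Blue', 'B': 'Green'}
# # resulting prompt:
# # What colour is the sky?
# # A. Blue
# # B. Green
# # Answer with the option's letter from the given choices directly.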
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if dataset is not None and DATASET_TYPE(dataset) in ['MCQ', 'Y/N']:
prompt += '\nShort Answer.'
image = Image.open(image_path).convert('RGB')
inputs = self.model.build_conversation_input_ids(
self.tokenizer, query=prompt, history=[], images=[image]) # chat mode
inputs = {
'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'),
'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'),
'attention_mask': inputs['attention_mask'].unsqueeze(0).to('cuda'),
'images': [[inputs['images'][0].to('cuda').to(torch.bfloat16)]],
}
with torch.no_grad():
outputs = self.model.generate(**inputs, **self.kwargs)
outputs = outputs[:, inputs['input_ids'].shape[1]:]
response = self.tokenizer.decode(outputs[0])
response = response.split(self.end_text_token)[0].strip()
return response
import sys
import torch
from transformers import AutoModelForCausalLM
import warnings
from .base import BaseModel
from ..smp import *
class DeepSeekVL(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def check_install(self):
try:
import deepseek_vl
except Exception as e:
logging.critical(
'Please install deepseek_vl from source: https://github.com/deepseek-ai/DeepSeek-VL')
raise e
def __init__(self, model_path='deepseek-ai/deepseek-vl-1.3b-chat', **kwargs):
self.check_install()
assert model_path is not None
self.model_path = model_path
from deepseek_vl.models import VLChatProcessor
self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
self.tokenizer = self.vl_chat_processor.tokenizer
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
self.model = model.to(torch.bfloat16).cuda().eval()
torch.cuda.empty_cache()
default_kwargs = dict(max_new_tokens=512, do_sample=False, use_cache=True)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
def prepare_inputs(self, message):
def prepare_itlist(msgs):
content, images = '', []
for s in msgs:
if s['type'] == 'image':
images.append(s['value'])
content += '<image_placeholder>'
elif s['type'] == 'text':
content += s['value']
return content, images
conversation = []
if 'role' not in message[0]:
content, images = prepare_itlist(message)
conversation.append(dict(role='User', content=content, images=images))
else:
role_map = {'user': 'User', 'assistant': 'Assistant'}
for msgs in message:
role = role_map[msgs['role']]
content, images = prepare_itlist(msgs['content'])
conversation.append(dict(role=role, content=content, images=images))
conversation.append(dict(role='Assistant', content=''))
return conversation
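# Illustrative sketch of the conversation structure produced by `prepare_inputs`
# for a single-turn interleaved message (the file name is an assumption):
#
# message = [
#     dict(type='image', value='chart.png'),
#     dict(type='text', value='Summarize this chart.'),
# ]
# conversation = model.prepare_inputs(message)
# # [
# #   {'role': 'User', 'content': '<image_placeholder>Summarize this chart.', 'images': ['chart.png']},
# #   {'role': 'Assistant', 'content': ''},
# # ]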
def generate_inner(self, message, dataset=None):
conversation = self.prepare_inputs(message)
from deepseek_vl.utils.io import load_pil_images
pil_images = load_pil_images(conversation)
prepare_inputs = self.vl_chat_processor(conversations=conversation, images=pil_images, force_batchify=True)
prepare_inputs = prepare_inputs.to(self.model.device)
inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs)
outputs = self.model.language_model.generate(
inputs_embeds=inputs_embeds,
attention_mask=prepare_inputs.attention_mask,
pad_token_id=self.tokenizer.eos_token_id,
bos_token_id=self.tokenizer.bos_token_id,
eos_token_id=self.tokenizer.eos_token_id,
**self.kwargs)
answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
return answer
def chat_inner(self, message, dataset=None):
return self.generate_inner(message, dataset=dataset)
import torch
from PIL import Image
from abc import abstractproperty
import sys
import os.path as osp
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import copy
# This function is used to split Eagle-X5-34B
def split_model(model_name):
import math
device_map = {}
num_gpus = torch.cuda.device_count()
rank, world_size = get_rank_and_world_size()
num_gpus = num_gpus // world_size
num_layers_map = {
'Eagle-X5-34B-Chat': 60,
'Eagle-X5-34B-Plus': 60
}
if model_name not in num_layers_map:
return 'cuda'
num_layers = num_layers_map[model_name] + 8
# Since the first GPU will be used for ViT, treat it as 0.5 GPU.
num_layers_per_gpu = math.ceil(num_layers / num_gpus)
num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
num_layers_per_gpu[-1] = num_layers - sum(num_layers_per_gpu[:-1])
num_layers_per_gpu[0] -= 4
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'model.layers.{layer_cnt}'] = rank + world_size * i
layer_cnt += 1
device_map['model.vision_tower'] = rank
device_map['model.embed_tokens'] = rank
device_map['model.norm'] = rank
device_map['model.rotary_emb'] = rank
device_map['model.mm_projector'] = rank
device_map['lm_head'] = rank
device_map[f'model.layers.{num_layers - 1}'] = rank
logging.warning("Remove L157-L158 in https://github.com/NVlabs/EAGLE/blob/fef95f103b5e9899acbbe2c237e5b99147ab7e8e/eagle/model/builder.py to make it work properly.") # noqa: E501
return device_map
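# Illustrative notes (not part of the original file) on what `split_model`
# returns; the model names are the ones handled above.
#
# split_model('Eagle-X5-7B')        # -> 'cuda' (not in the 34B layer map)
# split_model('Eagle-X5-34B-Chat')  # -> dict mapping submodules to GPU ids
# # For the 34B variants the vision tower, embeddings, norm, mm_projector and
# # lm_head stay on the local rank, while decoder layers are spread across the
# # GPUs visible to this rank; the first GPU gets fewer layers since it also
# # hosts the ViT.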
class Eagle(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def __init__(self,
model_path='NVEagle/Eagle-X5-7B',
**kwargs):
try:
from eagle.model.builder import load_pretrained_model
from eagle.utils import disable_torch_init
from eagle.mm_utils import get_model_name_from_path
except Exception as e:
logging.critical('Please install eagle from "https://github.com/NVlabs/EAGLE.git" before using the Eagle model.')
raise e
warnings.warn('Please install the latest version of eagle from GitHub before evaluating the Eagle model.')
assert osp.exists(model_path) or splitlen(model_path) == 2
model_name = get_model_name_from_path(model_path)
rank, world_size = get_rank_and_world_size()
device_map = split_model(model_path.split('/')[-1])
self.tokenizer, self.model, self.image_processor, self.context_len = (
load_pretrained_model(model_path, None, model_name, False, False, device_map=device_map)
)
self.model.eval()
self.conv_mode = 'vicuna_v1'
default_kwargs = dict(
do_sample=True,
temperature=0.2,
top_p=0.5,
num_beams=1,
max_new_tokens=512,
use_cache=True
)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
torch.cuda.empty_cache()
def generate_inner(self, message, dataset=None):
try:
from eagle import conversation as conversation_lib
from eagle.constants import (IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN,
DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN)
from eagle.conversation import conv_templates, SeparatorStyle
from eagle.mm_utils import tokenizer_image_token, process_images, KeywordsStoppingCriteria
except Exception as e:
logging.critical('Please install eagle from "https://github.com/NVlabs/EAGLE.git" before using the Eagle model.')
raise e
kwargs = self.kwargs
images = []
prompt = ''
for s in message:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
prompt += s['value']
DEFAULT_IMAGE_TOKEN = DEFAULT_IMAGE_TOKEN * len(images)
if self.model.config.mm_use_im_start_end:
prompt = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt
else:
prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt
conv = conv_templates[self.conv_mode].copy()
conv.append_message(conv.roles[0], prompt)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
images = [Image.open(s).convert('RGB') for s in images]
image_tensor = process_images(images, self.image_processor, self.model.config)
input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
input_ids = input_ids.to(device='cuda', non_blocking=True)
image_tensor = image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids.unsqueeze(0),
images=image_tensor,
image_sizes=[img.size for img in images],
**kwargs
)
outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
return outputs
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMMU'], dataset):
return False
if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
if dataset == 'MMVet':
prompt = question + '\nAnswer the question directly. '
elif DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = f'Hint: {hint}\n' if hint is not None else ''
prompt += f'{question}\n'
prompt += (
f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
if len(options) else 'Answer the question directly. '
)
else:
raise NotImplementedError
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
import os
import torch
from PIL import Image
import os.path as osp
from .base import BaseModel
from ..smp import *
class Emu(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self,
model_path='BAAI/Emu2-Chat',
**kwargs):
self.model_path = model_path
assert osp.exists(model_path) or splitlen(model_path) == 2
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model
local_rank = int(os.environ.get('LOCAL_RANK', 0))  # environment variables are strings; cast before the arithmetic below
device_num = torch.cuda.device_count()
assert local_rank * 2 <= device_num, 'The number of devices does not match the world size'
assert device_num >= 2, 'You need at least 2 GPUs to use EMU'
device_1 = local_rank
device_2 = local_rank + device_num // 2
torch.cuda.set_device(device_1)
torch.cuda.set_device(device_2)
tokenizer = AutoTokenizer.from_pretrained(model_path) # "BAAI/Emu2-Chat"
self.tokenizer = tokenizer
with init_empty_weights():
model = AutoModelForCausalLM.from_pretrained(
model_path, # "BAAI/Emu2-Chat"
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
trust_remote_code=True)
device_map = infer_auto_device_map(
model,
max_memory={
device_1: '38GiB',
device_2: '38GiB'
},
no_split_module_classes=['Block', 'LlamaDecoderLayer'])
# input and output logits should be on the same device
device_map['model.decoder.lm.lm_head'] = device_1
model = dispatch_model(
model,
device_map=device_map).eval()
self.model = model
kwargs_default = dict(max_new_tokens=512, length_penalty=-1)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
def generate_inner(self, message, dataset=None):
query, images = '', []
for item in message:
if item['type'] == 'image':
images.append(Image.open(item['value']).convert('RGB'))
query += '[<IMG_PLH>]'
elif item['type'] == 'text':
query += item['value']
inputs = self.model.build_input_ids(
text=[query],
tokenizer=self.tokenizer,
image=images
)
with torch.no_grad():
outputs = self.model.generate(
input_ids=inputs['input_ids'],
attention_mask=inputs['attention_mask'],
image=inputs['image'].to(torch.bfloat16),
**self.kwargs)
output_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
return output_text[0]
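# Illustrative sketch (file name is an assumption) of how a message becomes the
# Emu2 query string in `generate_inner` above: each image contributes a
# '[<IMG_PLH>]' placeholder at its position in the interleaved text.
#
# message = [
#     dict(type='text', value='Which animal appears in '),
#     dict(type='image', value='photo.jpg'),
#     dict(type='text', value='? Answer briefly.'),
# ]
# # query == 'Which animal appears in [<IMG_PLH>]? Answer briefly.'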
from PIL import Image
import requests
from .base import BaseModel
class Falcon2VLM(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='tiiuae/falcon-11B-vlm', **kwargs):
import torch
from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor
self.model_path = model_path
self.processor = LlavaNextProcessor.from_pretrained(model_path, tokenizer_class='PreTrainedTokenizerFast')
self.model = LlavaNextForConditionalGeneration.from_pretrained(
model_path, torch_dtype=torch.bfloat16, device_map='cuda').eval()
default_kwargs = {'max_new_tokens': 512}
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
prompt = f'User:<image>\n{prompt} Falcon:'
inputs = self.processor(text=prompt, images=image, return_tensors='pt').to('cuda')
output = self.model.generate(**inputs, **self.kwargs)
prompt_length = inputs['input_ids'].shape[1]
model_response = self.processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
return model_response