"vscode:/vscode.git/clone" did not exist on "62e2e99db6fb4b6394aeca523ade6e6762480706"
Commit bc5ebf0f authored by luopl's avatar luopl
Browse files

Initial commit

parents
Pipeline #2167 canceled with stages
.. role:: hidden
:class: hidden-section
.. currentmodule:: {{ module }}
{{ name | underline}}
.. autoclass:: {{ name }}
:members:
..
autogenerated from _templates/autosummary/class.rst
note it does not have :inherited-members:
.. role:: hidden
:class: hidden-section
.. currentmodule:: {{ module }}
{{ name | underline}}
.. autoclass:: {{ name }}
:members:
:special-members: __call__
..
autogenerated from _templates/callable.rst
note it does not have :inherited-members:
# flake8: noqa
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import ast
import subprocess
import sys
import pytorch_sphinx_theme
from sphinx.builders.html import StandaloneHTMLBuilder
sys.path.insert(0, os.path.abspath('../../'))
# -- Project information -----------------------------------------------------
project = 'VLMEvalKit'
copyright = '2023, VLMEvalKit'
author = 'VLMEvalKit Authors'
# The full version, including alpha/beta/rc tags
version_file = '../../vlmeval/__init__.py'
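# Read __version__ from vlmeval/__init__.py by parsing it with `ast`, which avoids importing the
# package (and its runtime dependencies) at doc-build time.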
def get_version():
with open(version_file, 'r') as f:
file_content = f.read()
# Parse the file content into an abstract syntax tree (AST)
tree = ast.parse(file_content, filename=version_file)
# Iterate through the body of the AST, looking for an assignment to __version__
for node in tree.body:
if isinstance(node, ast.Assign):
for target in node.targets:
if isinstance(target, ast.Name) and target.id == '__version__':
return node.value.s
raise ValueError('__version__ not found')
release = get_version()
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.autosummary',
'sphinx.ext.intersphinx',
'sphinx.ext.napoleon',
'sphinx.ext.viewcode',
'myst_parser',
'sphinx_copybutton',
'sphinx_tabs.tabs',
'notfound.extension',
'sphinxcontrib.jquery',
'sphinx_design',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
source_suffix = {
'.rst': 'restructuredtext',
'.md': 'markdown',
}
language = 'cn'
# The master toctree document.
root_doc = 'index'
html_context = {
'github_version': 'latest',
}
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'pytorch_sphinx_theme'
html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
# yapf: disable
html_theme_options = {
'menu': [
{
'name': 'GitHub',
'url': 'https://github.com/open-compass/VLMEvalKit'
},
],
    # Specify the language of the shared menu
'menu_lang': 'cn',
# Disable the default edit on GitHub
'default_edit_on_github': False,
}
# yapf: enable
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_css_files = [
'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.css',
'css/readthedocs.css'
]
html_js_files = [
'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.js',
'js/custom.js'
]
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'vlmevalkitdoc'
# -- Options for LaTeX output ------------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(root_doc, 'vlmevalkit.tex', 'VLMEvalKit Documentation', author,
'manual'),
]
# -- Options for manual page output ------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [(root_doc, 'vlmevalkit', 'VLMEvalKit Documentation', [author],
1)]
# -- Options for Texinfo output ----------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(root_doc, 'vlmevalkit', 'VLMEvalKit Documentation', author,
'VLMEvalKit Authors', 'AGI evaluation toolbox and benchmark.',
'Miscellaneous'),
]
# -- Options for Epub output -------------------------------------------------
# Bibliographic Dublin Core info.
epub_title = project
# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#
# epub_identifier = ''
# A unique identification for the text.
#
# epub_uid = ''
# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']
# set priority when building html
StandaloneHTMLBuilder.supported_image_types = [
'image/svg+xml', 'image/gif', 'image/png', 'image/jpeg'
]
# -- Extension configuration -------------------------------------------------
# Ignore >>> when copying code
copybutton_prompt_text = r'>>> |\.\.\. '
copybutton_prompt_is_regexp = True
# Auto-generated header anchors
myst_heading_anchors = 3
# Enable "colon_fence" extension of myst.
myst_enable_extensions = ['colon_fence', 'dollarmath']
# Configuration for intersphinx
intersphinx_mapping = {
'python': ('https://docs.python.org/3', None),
'numpy': ('https://numpy.org/doc/stable', None),
'torch': ('https://pytorch.org/docs/stable/', None),
'mmengine': ('https://mmengine.readthedocs.io/en/latest/', None),
'transformers':
('https://huggingface.co/docs/transformers/main/en/', None),
}
napoleon_custom_sections = [
# Custom sections for data elements.
('Meta fields', 'params_style'),
('Data fields', 'params_style'),
]
# Disable docstring inheritance
autodoc_inherit_docstrings = False
# Mock some imports when generating API docs.
autodoc_mock_imports = ['rich', 'attr', 'einops']
# Disable displaying type annotations, these can be very verbose
autodoc_typehints = 'none'
# The not found page
notfound_template = '404.html'
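# The handler below runs once the Sphinx builder has been initialized: it invokes
# ./cp_origin_docs.sh, which copies over the English markdown docs that do not yet have a
# Chinese translation.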
def builder_inited_handler(app):
subprocess.run(['./cp_origin_docs.sh'])
def setup(app):
app.connect('builder-inited', builder_inited_handler)
#!/usr/bin/env bash
# Copy *.md files from the English docs (../en/) if they do not yet have a Chinese translation
for filename in $(find ../en/ -name '*.md' -printf "%P\n");
do
mkdir -p $(dirname $filename)
cp -n ../en/$filename ./$filename
done
[html writers]
table_style: colwidths-auto
Welcome to the VLMEvalKit Chinese Tutorial!
============================================

Getting Started with VLMEvalKit
-------------------------------

To help you get started quickly, we recommend the following workflow:

- If you want to use VLMEvalKit, we recommend first reading the Quickstart_ section to set up the environment and run a mini experiment to get familiar with the workflow.
- If you want to customize more modules, e.g. add datasets and models, we provide the `Advanced Tutorials`_.

We always welcome users' PRs and Issues to improve VLMEvalKit!

.. _Quickstart:
.. toctree::
   :maxdepth: 1
   :caption: Quickstart

   Quickstart.md

.. .. _Tutorials:
.. .. toctree::
..    :maxdepth: 1
..    :caption: Tutorials

..    user_guides/framework_overview.md

.. _Advanced Tutorials:
.. toctree::
   :maxdepth: 1
   :caption: Advanced Tutorials

   Development.md
   ConfigSystem.md

.. .. _Additional Notes:
.. .. toctree::
..    :maxdepth: 1
..    :caption: Additional Notes

..    notes/contribution_guide.md

Indices and Tables
==================

* :ref:`genindex`
* :ref:`search`
decord; platform_machine != 'arm64'
eva-decord; platform_machine == 'arm64'
gradio
huggingface_hub
imageio
matplotlib
numpy
omegaconf
openai
opencv-python>=4.4.0.46
openpyxl
pandas
pillow
portalocker
protobuf
python-dotenv
requests
rich
sentencepiece
setuptools
sty
tabulate
tiktoken
timeout-decorator
torch
tqdm
transformers
typing_extensions
validators
xlsxwriter
docutils==0.18.1
modelindex
myst-parser
-e git+https://github.com/open-compass/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
sphinx==6.1.3
sphinx-copybutton
sphinx-design
sphinx-notfound-page
sphinx-tabs
sphinxcontrib-jquery
tabulate
import torch
import torch.distributed as dist
from vlmeval.config import supported_VLM
from vlmeval.dataset.video_dataset_config import supported_video_datasets
from vlmeval.dataset import build_dataset
from vlmeval.inference import infer_data_job
from vlmeval.inference_video import infer_data_job_video
from vlmeval.inference_mt import infer_data_job_mt
from vlmeval.smp import *
from vlmeval.utils.result_transfer import MMMU_result_transfer, MMTBench_result_transfer
def build_model_from_config(cfg, model_name):
import vlmeval.api
import vlmeval.vlm
config = cp.deepcopy(cfg[model_name])
if config == {}:
return supported_VLM[model_name]()
assert 'class' in config
cls_name = config.pop('class')
if hasattr(vlmeval.api, cls_name):
return getattr(vlmeval.api, cls_name)(**config)
elif hasattr(vlmeval.vlm, cls_name):
return getattr(vlmeval.vlm, cls_name)(**config)
else:
raise ValueError(f'Class {cls_name} is not supported in `vlmeval.api` or `vlmeval.vlm`')
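# Illustrative config entry for build_model_from_config (the model name and kwargs are
# placeholders taken from the sample config in the `--help` text of this script, assuming the
# `GPT4V` class exposed via `vlmeval.api`):
#   cfg = {'GPT4o_20240806': {'class': 'GPT4V', 'model': 'gpt-4o-2024-08-06', 'temperature': 0}}
#   model = build_model_from_config(cfg, 'GPT4o_20240806')
# An empty entry ({}) falls back to the preset constructor registered in `supported_VLM`.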
def build_dataset_from_config(cfg, dataset_name):
import vlmeval.dataset
import inspect
config = cp.deepcopy(cfg[dataset_name])
if config == {}:
return supported_video_datasets[dataset_name]()
assert 'class' in config
cls_name = config.pop('class')
if hasattr(vlmeval.dataset, cls_name):
cls = getattr(vlmeval.dataset, cls_name)
sig = inspect.signature(cls.__init__)
valid_params = {k: v for k, v in config.items() if k in sig.parameters}
if valid_params.get('fps', 0) > 0 and valid_params.get('nframe', 0) > 0:
raise ValueError('fps and nframe should not be set at the same time')
if valid_params.get('fps', 0) <= 0 and valid_params.get('nframe', 0) <= 0:
            raise ValueError('At least one of fps and nframe should be set to a valid (positive) value')
return cls(**valid_params)
else:
raise ValueError(f'Class {cls_name} is not supported in `vlmeval.dataset`')
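# Illustrative config entry for build_dataset_from_config (names mirror the sample config in the
# `--help` text below); note that exactly one of `fps` / `nframe` must be a positive value:
#   cfg = {'Video-MME_16frame': {'class': 'VideoMME', 'dataset': 'Video-MME', 'nframe': 16}}
#   dataset = build_dataset_from_config(cfg, 'Video-MME_16frame')
# An empty entry ({}) falls back to the preset registered in `supported_video_datasets`.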
def parse_args():
help_msg = """\
You can launch the evaluation by setting either --data and --model or --config.
--data and --model:
Each Arg should be a list of strings, specifying the names of datasets and models.
        To find all supported model names, please refer to `vlmeval/config.py` or check the output of the command \
`vlmutil mlist all` in the terminal (you should first have vlmeval installed).
To find all supported dataset names, please refer to the `vlmeval/dataset/__init__.py` file. The python script \
to print all supported dataset names is as follows:
```python
from vlmeval.dataset import SUPPORTED_DATASETS
print(SUPPORTED_DATASETS)
```
or you can check the output of the command `vlmutil dlist all` in the terminal.
To find all supported video dataset default settings, please refer to the \
`vlmeval/dataset/video_dataset_config.py` file.
--config:
Launch the evaluation by specifying the path to the config json file. Sample Json Content:
```json
{
"model": {
"GPT4o_20240806_T00_HIGH": {
"class": "GPT4V",
"model": "gpt-4o-2024-08-06",
"temperature": 0,
"img_detail": "high"
},
"GPT4o_20240806_T10_Low": {
"class": "GPT4V",
"model": "gpt-4o-2024-08-06",
"temperature": 1.0,
"img_detail": "low"
},
"GPT4o_20241120": {}
},
"data": {
"MME-RealWorld-Lite": {
"class": "MMERealWorld",
"dataset": "MME-RealWorld-Lite"
},
"MMBench_DEV_EN_V11": {
"class": "ImageMCQDataset",
"dataset": "MMBench_DEV_EN_V11"
},
"MMBench_Video_8frame_nopack": {},
"Video-MME_16frame_subs": {
"class": "VideoMME",
"dataset": "Video-MME",
"nframe": 16,
"use_subtitle": true,
}
}
}
```
Currently, only `model` and `data` are supported fields. The content of each field is a dictionary.
For `model`, the key is the name of the model, and the value is a dictionary containing the following keys:
- `class`: The class name of the model, which should be a class in `vlmeval.vlm` or `vlmeval.api`.
- Other keys are specific to the model, please refer to the corresponding class.
- Tip: The defined model in the `supported_VLM` of `vlmeval/config.py` can be used as a shortcut.
For `data`, the key is the name of the dataset (should be the same as the `dataset` field in most cases, \
except for video datasets), and the value is a dictionary containing the following keys:
- `class`: The class name of the dataset, which should be a class in `vlmeval.dataset`.
- `dataset`: The name of the dataset, which should be a string that is accepted by the `dataset` argument of the \
corresponding class.
- Other keys are specific to the dataset, please refer to the corresponding class.
- Tip: The defined dataset in the `supported_video_datasets` of `vlmeval/dataset/video_dataset_config.py` \
can be used as a shortcut.
The keys in the `model` and `data` fields will be used for naming the prediction files and evaluation results.
When launching with `--config`, args for API VLMs, such as `--retry`, `--verbose`, will be ignored.
"""
parser = argparse.ArgumentParser(description=help_msg, formatter_class=argparse.RawTextHelpFormatter)
# Essential Args, Setting the Names of Datasets and Models
parser.add_argument('--data', type=str, nargs='+', help='Names of Datasets')
parser.add_argument('--model', type=str, nargs='+', help='Names of Models')
parser.add_argument('--config', type=str, help='Path to the Config Json File')
# Work Dir
parser.add_argument('--work-dir', type=str, default='./outputs', help='select the output directory')
# Infer + Eval or Infer Only
parser.add_argument('--mode', type=str, default='all', choices=['all', 'infer'])
# API Kwargs, Apply to API VLMs and Judge API LLMs
parser.add_argument('--nproc', type=int, default=4, help='Parallel API calling')
parser.add_argument('--retry', type=int, default=None, help='retry numbers for API VLMs')
# Explicitly Set the Judge Model
parser.add_argument('--judge', type=str, default=None)
# Logging Utils
parser.add_argument('--verbose', action='store_true')
# Configuration for Resume
# Ignore: will not rerun failed VLM inference
parser.add_argument('--ignore', action='store_true', help='Ignore failed indices. ')
# Reuse: will reuse the existing prediction files
parser.add_argument('--reuse', action='store_true')
args = parser.parse_args()
return args
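# Illustrative invocations of the CLI described in the help message above (the model and dataset
# names are placeholders; check `vlmutil mlist all` / `vlmutil dlist all` for supported names):
#   python run.py --data MMBench_DEV_EN MME --model qwen_chat --verbose
#   python run.py --config config.json --mode infer --reuse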
def main():
logger = get_logger('RUN')
rank, world_size = get_rank_and_world_size()
args = parse_args()
use_config, cfg = False, None
if args.config is not None:
assert args.data is None and args.model is None, '--data and --model should not be set when using --config'
use_config, cfg = True, load(args.config)
args.model = list(cfg['model'].keys())
args.data = list(cfg['data'].keys())
else:
        assert len(args.data), '--data should be a list of dataset names'
if rank == 0:
if not args.reuse:
            logger.warning('--reuse is not set, will not reuse prediction or temporary files from previous runs')
else:
logger.warning('--reuse is set, will reuse the latest prediction & temporary pickle files')
if 'MMEVAL_ROOT' in os.environ:
args.work_dir = os.environ['MMEVAL_ROOT']
if not use_config:
for k, v in supported_VLM.items():
if hasattr(v, 'keywords') and 'retry' in v.keywords and args.retry is not None:
v.keywords['retry'] = args.retry
supported_VLM[k] = v
if hasattr(v, 'keywords') and 'verbose' in v.keywords and args.verbose is not None:
v.keywords['verbose'] = args.verbose
supported_VLM[k] = v
if world_size > 1:
local_rank = os.environ.get('LOCAL_RANK', 0)
torch.cuda.set_device(int(local_rank))
dist.init_process_group(
backend='nccl',
            timeout=datetime.timedelta(seconds=int(os.environ.get('DIST_TIMEOUT', 3600)))
)
for _, model_name in enumerate(args.model):
model = None
date, commit_id = timestr('day'), githash(digits=8)
eval_id = f"T{date}_G{commit_id}"
pred_root = osp.join(args.work_dir, model_name, eval_id)
pred_root_meta = osp.join(args.work_dir, model_name)
os.makedirs(pred_root_meta, exist_ok=True)
prev_pred_roots = ls(osp.join(args.work_dir, model_name), mode='dir')
if len(prev_pred_roots) and args.reuse:
prev_pred_roots.sort()
if not osp.exists(pred_root):
os.makedirs(pred_root, exist_ok=True)
if use_config:
model = build_model_from_config(cfg['model'], model_name)
for _, dataset_name in enumerate(args.data):
try:
result_file_base = f'{model_name}_{dataset_name}.xlsx'
if use_config:
if world_size > 1:
if rank == 0:
dataset = build_dataset_from_config(cfg['data'], dataset_name)
dist.barrier()
dataset = build_dataset_from_config(cfg['data'], dataset_name)
if dataset is None:
logger.error(f'Dataset {dataset_name} is not valid, will be skipped. ')
continue
else:
dataset_kwargs = {}
if dataset_name in ['MMLongBench_DOC', 'DUDE', 'DUDE_MINI', 'SLIDEVQA', 'SLIDEVQA_MINI']:
dataset_kwargs['model'] = model_name
# If distributed, first build the dataset on the main process for doing preparation works
if world_size > 1:
if rank == 0:
dataset = build_dataset(dataset_name, **dataset_kwargs)
dist.barrier()
dataset = build_dataset(dataset_name, **dataset_kwargs)
if dataset is None:
logger.error(f'Dataset {dataset_name} is not valid, will be skipped. ')
continue
# Handling Multi-Turn Dataset
if dataset.TYPE == 'MT':
result_file_base = result_file_base.replace('.xlsx', '.tsv')
result_file = osp.join(pred_root, result_file_base)
# Reuse the previous prediction file if exists
if rank == 0 and len(prev_pred_roots):
prev_result_file = None
prev_pkl_file_list = []
for root in prev_pred_roots[::-1]:
if osp.exists(osp.join(root, result_file_base)):
prev_result_file = osp.join(root, result_file_base)
break
elif commit_id in root and len(ls(root)) and root != pred_root:
temp_files = ls(root, match=[dataset_name, '.pkl'])
if len(temp_files):
prev_pkl_file_list.extend(temp_files)
break
if not args.reuse:
prev_result_file = None
prev_pkl_file_list = []
if prev_result_file is not None:
logger.warning(
f'--reuse is set, will reuse the prediction file {prev_result_file}.')
if prev_result_file != result_file:
shutil.copy(prev_result_file, result_file)
elif len(prev_pkl_file_list):
for fname in prev_pkl_file_list:
target_path = osp.join(pred_root, osp.basename(fname))
if not osp.exists(target_path):
shutil.copy(fname, target_path)
logger.info(f'--reuse is set, will reuse the prediction pickle file {fname}.')
else:
logger.warning(f'File already exists: {target_path}')
if world_size > 1:
dist.barrier()
if model is None:
model = model_name # which is only a name
# Perform the Inference
if dataset.MODALITY == 'VIDEO':
model = infer_data_job_video(
model,
work_dir=pred_root,
model_name=model_name,
dataset=dataset,
result_file_name=result_file_base,
verbose=args.verbose,
api_nproc=args.nproc)
elif dataset.TYPE == 'MT':
model = infer_data_job_mt(
model,
work_dir=pred_root,
model_name=model_name,
dataset=dataset,
verbose=args.verbose,
api_nproc=args.nproc,
ignore_failed=args.ignore)
else:
model = infer_data_job(
model,
work_dir=pred_root,
model_name=model_name,
dataset=dataset,
verbose=args.verbose,
api_nproc=args.nproc,
ignore_failed=args.ignore)
# Set the judge kwargs first before evaluation or dumping
judge_kwargs = {
'nproc': args.nproc,
'verbose': args.verbose,
'retry': args.retry if args.retry is not None else 3
}
if args.retry is not None:
judge_kwargs['retry'] = args.retry
if args.judge is not None:
judge_kwargs['model'] = args.judge
else:
if dataset.TYPE in ['MCQ', 'Y/N']:
judge_kwargs['model'] = 'chatgpt-0125'
elif listinstr(['MMVet', 'LLaVABench', 'MMBench-Video'], dataset_name):
judge_kwargs['model'] = 'gpt-4-turbo'
elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath'], dataset_name):
judge_kwargs['model'] = 'gpt-4o-mini'
elif listinstr(['MMLongBench', 'MMDU', 'DUDE', 'SLIDEVQA', 'MIA-Bench', 'WildVision'], dataset_name): # noqa: E501
judge_kwargs['model'] = 'gpt-4o'
if rank == 0:
logger.info(judge_kwargs)
if world_size > 1:
dist.barrier()
# Only Rank 0 handles the evaluation part
if rank == 0:
# Prepare Submission Files for MMMU_TEST AND MMT-Bench_ALL
if dataset_name in ['MMMU_TEST']:
result_json = MMMU_result_transfer(result_file)
logger.info(f'Transfer MMMU_TEST result to json for official evaluation, '
f'json file saved in {result_json}')
continue
elif 'MMT-Bench_ALL' in dataset_name:
submission_file = MMTBench_result_transfer(result_file, **judge_kwargs)
logger.info(f'Extract options from prediction of MMT-Bench FULL split for official evaluation '
f'(https://eval.ai/web/challenges/challenge-page/2328/overview), '
f'submission file saved in {submission_file}')
continue
# Skip the evaluation part if only infer
if args.mode == 'infer':
continue
# Skip the evaluation part if the dataset evaluation is not supported or annotations are missing
if 'MLLMGuard_DS' in dataset_name:
logger.info('The evaluation of MLLMGuard_DS is not supported yet. ')
continue
elif 'AesBench_TEST' == dataset_name:
logger.info(f'The results are saved in {result_file}. '
f'Please send it to the AesBench Team via huangyipo@hotmail.com.')
continue
elif dataset_name in ['DocVQA_TEST', 'InfoVQA_TEST', 'Q-Bench1_TEST', 'A-Bench_TEST']:
logger.info(f'{dataset_name} is a test split without ground-truth. '
'Thus only the inference part is supported for those datasets. ')
continue
elif dataset_name in [
'MMBench_TEST_CN', 'MMBench_TEST_EN', 'MMBench', 'MMBench_CN',
'MMBench_TEST_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_V11', 'MMBench_CN_V11'
] and not MMBenchOfficialServer(dataset_name):
logger.error(
f'Can not evaluate {dataset_name} on non-official servers, will skip the evaluation.')
continue
# Setup the proxy for the evaluation
eval_proxy = os.environ.get('EVAL_PROXY', None)
old_proxy = os.environ.get('HTTP_PROXY', '')
if eval_proxy is not None:
proxy_set(eval_proxy)
# Perform the Evaluation
eval_results = dataset.evaluate(result_file, **judge_kwargs)
# Display Evaluation Results in Terminal
if eval_results is not None:
assert isinstance(eval_results, dict) or isinstance(eval_results, pd.DataFrame)
logger.info(f'The evaluation of model {model_name} x dataset {dataset_name} has finished! ')
logger.info('Evaluation Results:')
if isinstance(eval_results, dict):
logger.info('\n' + json.dumps(eval_results, indent=4))
elif isinstance(eval_results, pd.DataFrame):
if len(eval_results) < len(eval_results.columns):
eval_results = eval_results.T
logger.info('\n' + tabulate(eval_results))
# Restore the proxy
if eval_proxy is not None:
proxy_set(old_proxy)
# Create the symbolic links for the prediction files
files = os.listdir(pred_root)
files = [x for x in files if f'{model_name}_{dataset_name}' in x]
for f in files:
cwd = os.getcwd()
file_addr = osp.join(cwd, pred_root, f)
link_addr = osp.join(cwd, pred_root_meta, f)
if osp.exists(link_addr) or osp.islink(link_addr):
os.remove(link_addr)
os.symlink(file_addr, link_addr)
except Exception as e:
logger.exception(f'Model {model_name} x Dataset {dataset_name} combination failed: {e}, '
'skipping this combination.')
continue
if world_size > 1:
dist.barrier()
if world_size > 1:
dist.destroy_process_group()
if __name__ == '__main__':
load_env()
main()
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os, cv2\n",
"import string\n",
"import os.path as osp\n",
"import numpy as np\n",
"from collections import defaultdict\n",
"from vlmeval.smp import ls, load, dump, download_file, encode_image_file_to_base64, md5, mrlines\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import multiprocessing as mp\n",
"from PIL import Image, ImageFont, ImageDraw\n",
"\n",
"font_URL = 'http://opencompass.openxlab.space/utils/Fonts/timesb.ttf'\n",
"font_file = 'timesb.ttf'\n",
"if not osp.exists(font_file):\n",
" download_file(font_URL)\n",
" \n",
"test_split_URL = 'https://s3-us-east-2.amazonaws.com/prior-datasets/ai2d_test_ids.csv'\n",
"test_split_file = 'ai2d_test_ids.csv'\n",
"if not osp.exists(test_split_file):\n",
" download_file(test_split_URL)\n",
" \n",
"test_ids = set(mrlines(test_split_file))\n",
" \n",
"def proper_font_size(font_file, wh, text, ratio=1):\n",
" font_size = 2\n",
" while True:\n",
" font = ImageFont.truetype(font_file, font_size)\n",
" real_box = font.getbbox(text)\n",
" real_wh = (real_box[2] - real_box[0], real_box[3] - real_box[1])\n",
" if real_wh[0] > wh[0] * ratio or real_wh[1] > wh[1] * ratio:\n",
" break\n",
" font_size += 1\n",
" return font_size\n",
"\n",
"def cover_image(ann_path):\n",
" data = load(ann_path)\n",
" texts = list(data['text'].values())\n",
" raw_img = ann_path.replace('annotations', 'images').replace('.json', '')\n",
" tgt_img = raw_img.replace('images', 'images_abc')\n",
" img = Image.open(raw_img)\n",
" draw = ImageDraw.Draw(img)\n",
" for text in texts:\n",
" st, ed = tuple(text['rectangle'][0]), tuple(text['rectangle'][1])\n",
" T = text['replacementText']\n",
" draw.rectangle((st, ed), fill='white')\n",
" font_size = proper_font_size(font_file, (ed[0] - st[0], ed[1] - st[1]), T, ratio=1)\n",
" font = ImageFont.truetype(font_file, font_size)\n",
" text_box = font.getbbox(T)\n",
" text_wh = (text_box[2] - text_box[0], text_box[3] - text_box[1])\n",
" cx, cy = (st[0] + ed[0]) // 2, st[1]\n",
" stx = cx - text_wh[0] // 2\n",
" sty = cy - text_wh[1] // 2\n",
" draw.text((stx, sty), T, font=font, fill='black')\n",
" img.save(tgt_img) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Process for no mask images\n",
"test_ids = set(mrlines(test_split_file))\n",
"\n",
"def detect_image_color(image):\n",
" gray_image = image.convert('L')\n",
" mean_brightness = np.mean(np.array(gray_image))\n",
" if mean_brightness < 127:\n",
" return 'white'\n",
" else:\n",
" return 'black'\n",
"\n",
"def cover_image(ann_path):\n",
" data = load(ann_path)\n",
" texts = list(data['text'].values())\n",
" raw_img = ann_path.replace('annotations', 'images').replace('.json', '')\n",
" tgt_img = raw_img.replace('images', 'images_abc')\n",
" img = Image.open(raw_img)\n",
" draw = ImageDraw.Draw(img)\n",
" color = detect_image_color(img)\n",
" font_size = 0\n",
" for text in texts:\n",
" st, ed = tuple(text['rectangle'][0]), tuple(text['rectangle'][1])\n",
" font_size += (ed[1] - st[1])\n",
" if len(texts) != 0:\n",
" font_size /= len(texts)\n",
" else:\n",
" font_size = 2\n",
" for text in texts:\n",
" st, ed = tuple(text['rectangle'][0]), tuple(text['rectangle'][1])\n",
" T = text['replacementText']\n",
" for i in range(2):\n",
" draw.rectangle(\n",
" [(st[0] - i, st[1] - i), (ed[0] + i, ed[1] + i)],\n",
" outline=color\n",
" )\n",
" font = ImageFont.truetype(font_file, font_size)\n",
" text_box = font.getbbox(T)\n",
" text_wh = (text_box[2] - text_box[0], text_box[3] - text_box[1])\n",
" cx, cy = (st[0] + ed[0]) // 2, st[1]\n",
" stx = cx - text_wh[0] // 2\n",
" sty = cy - text_wh[1] * 1.5\n",
" if sty < 0:\n",
" sty = cy + text_wh[1] * 1.3\n",
" draw.text((stx, sty), T, font=font, fill=color)\n",
" img.save(tgt_img) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"download_file('https://ai2-public-datasets.s3.amazonaws.com/diagrams/ai2d-all.zip')\n",
"os.system('unzip -o ai2d-all.zip')\n",
"\n",
"images = ls('ai2d/images/')\n",
"questions = ls('ai2d/questions/')\n",
"annotations = ls('ai2d/annotations/')\n",
"cates = load('ai2d/categories.json')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pool = mp.Pool(32)\n",
"pool.map(cover_image, annotations)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def puncproc(inText):\n",
" import re\n",
" outText = inText\n",
" punct = [\n",
" ';', r'/', '[', ']', '\"', '{', '}', '(', ')', '=', '+', '\\\\', '_', '-',\n",
" '>', '<', '@', '`', ',', '?', '!'\n",
" ]\n",
" commaStrip = re.compile('(\\d)(,)(\\d)') # noqa: W605\n",
" periodStrip = re.compile('(?!<=\\d)(\\.)(?!\\d)') # noqa: W605\n",
" for p in punct:\n",
" if (p + ' ' in inText or ' ' + p in inText) or (re.search(commaStrip, inText) is not None):\n",
" outText = outText.replace(p, '')\n",
" else:\n",
" outText = outText.replace(p, ' ')\n",
" outText = periodStrip.sub('', outText, re.UNICODE)\n",
" return outText\n",
"\n",
"def check_choices(line):\n",
" def ischar(s):\n",
" s = str(s)\n",
" if s in ['{}', 'Both', 'None of above']:\n",
" return True\n",
" elif s.startswith('Stage ') and ischar(s[6:]):\n",
" return True\n",
" elif ' and ' in s and np.all([ischar(x) for x in s.split(' and ')]):\n",
" return True\n",
" elif len(s) <= 2:\n",
" return True\n",
" elif len(puncproc(s).split()) > 1:\n",
" return np.all([ischar(x) for x in puncproc(s).split()])\n",
" return False\n",
" n_char = sum([ischar(line[x]) for x in 'ABCD'])\n",
" return n_char >= 3\n",
"\n",
"def check_question(question):\n",
" words = puncproc(question).split()\n",
" for ch in string.ascii_lowercase + string.ascii_uppercase:\n",
" if ch in words:\n",
" return True\n",
" return False\n",
"\n",
"def is_abc(abc, choices, question):\n",
" if abc == 0:\n",
" return False\n",
" if check_choices(choices):\n",
" return True\n",
" if check_question(question):\n",
" return True\n",
" return False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_all = defaultdict(list)\n",
"for qfile in questions:\n",
" data = load(qfile)\n",
" idx = data['imageName'].split('.')[0]\n",
" if idx not in test_ids:\n",
" continue\n",
" image_pth = qfile.replace('questions', 'images').replace('.json', '')\n",
" cate = cates[image_pth.split('/')[-1]]\n",
" for q, qmeta in data['questions'].items():\n",
" assert '.png-' in qmeta['questionId']\n",
" main, sub = qmeta['questionId'].split('.png-')\n",
" idx = int(main) * 100 + int(sub)\n",
" \n",
" answers = qmeta['answerTexts']\n",
" correct = qmeta['correctAnswer']\n",
" \n",
" data_all['index'].append(idx)\n",
" data_all['question'].append(q)\n",
" assert len(answers) == 4\n",
" for c, a in zip('ABCD', answers):\n",
" data_all[c].append(a)\n",
" data_all['answer'].append('ABCD'[qmeta['correctAnswer']])\n",
" data_all['category'].append(cate)\n",
" data_all['abcLabel'].append(qmeta['abcLabel'])\n",
" abc = is_abc(qmeta['abcLabel'], {x: data_all[x][-1] for x in 'ABCD'}, q)\n",
" # if qmeta['abcLabel'] and not abc:\n",
" # print(qmeta['abcLabel'], {x: data_all[x][-1] for x in 'ABCD'}, q)\n",
" data_all['image_path'].append(image_pth.replace('images', 'images_abc') if abc else image_pth)\n",
"data = pd.DataFrame(data_all)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"images = []\n",
"image_seen = {}\n",
"for idx, pth in zip(data['index'], data['image_path']):\n",
" images.append(encode_image_file_to_base64(pth))\n",
"\n",
"data['image'] = images\n",
"dump(data, 'AI2D_TEST.tsv')\n",
"print(md5('AI2D_TEST.tsv'))"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
import sys
from vlmeval import *
from vlmeval.dataset import SUPPORTED_DATASETS
FAIL_MSG = 'Failed to obtain answer via API.'
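# Usage (illustrative; the script name is a placeholder): python <this_script>.py <work_dir>/<model_name>
# The script scans every dataset's prediction file under the given model directory, reports the
# fraction of samples for which the API failed to return an answer, and then checks the
# GPT-based evaluation logs for failures.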
root = sys.argv[1]
if root[-1] in '/\\':
root = root[:-1]
model_name = root.split('/')[-1]
for d in SUPPORTED_DATASETS:
fname = f'{model_name}_{d}.xlsx'
pth = osp.join(root, fname)
if osp.exists(pth):
data = load(pth)
# Detect Failure
assert 'prediction' in data
data['prediction'] = [str(x) for x in data['prediction']]
fail = [FAIL_MSG in x for x in data['prediction']]
if sum(fail):
nfail = sum(fail)
ntot = len(fail)
print(f'Model {model_name} x Dataset {d}: {nfail} out of {ntot} failed. {nfail / ntot * 100: .2f}%. ')
eval_files = ls(root, match=f'{model_name}_{d}_')
eval_files = [x for x in eval_files if listinstr([f'{d}_openai', f'{d}_gpt'], x) and x.endswith('.xlsx')]
if len(eval_files) == 0:
print(f'Model {model_name} x Dataset {d} openai missing')
continue
assert len(eval_files) == 1
eval_file = eval_files[0]
data = load(eval_file)
if 'MMVet' in d:
bad = [x for x in data['log'] if 'All 5 retries failed.' in str(x)]
if len(bad):
print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.')
elif 'MathVista' in d:
bad = [x for x in data['res'] if FAIL_MSG in str(x)]
if len(bad):
print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.')
elif d == 'LLaVABench':
sub = data[data['gpt4_score'] == -1]
sub = sub[sub['gpt4_score'] == -1]
if len(sub):
print(f'Model {model_name} x Dataset {d} Evaluation: {len(sub)} out of {len(data)} failed.')
else:
bad = [x for x in data['log'] if FAIL_MSG in str(x)]
if len(bad):
print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.')
import argparse
from vlmeval.smp import *
from vlmeval.config import supported_VLM
def is_api(x):
return getattr(supported_VLM[x].func, 'is_api', False)
models = list(supported_VLM)
models = [x for x in models if 'fs' not in x]
models = [x for x in models if not is_api(x)]
exclude_list = ['cogvlm-grounding-generalist', 'emu2']
models = [x for x in models if x not in exclude_list]
def is_large(x):
return '80b' in x or 'emu2' in x or '34B' in x
small_models = [x for x in models if not is_large(x)]
large_models = [x for x in models if is_large(x)]
models = small_models + large_models
parser = argparse.ArgumentParser()
parser.add_argument('--data', type=str, nargs='+', required=True)
args = parser.parse_args()
# Skip some models
models = [x for x in models if not listinstr(['MiniGPT', 'grounding-generalist'], x)]
for m in models:
unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.xlsx')]
if len(unknown_datasets) == 0:
continue
dataset_str = ' '.join(unknown_datasets)
if '80b' in m:
cmd = f'python run.py --data {dataset_str} --model {m}'
else:
cmd = f'bash run.sh --data {dataset_str} --model {m}'
print(cmd)
os.system(cmd)
#!/bin/bash
DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cp $DIR/../config.py $DIR/../vlmeval/
cp $DIR/../misc/* $DIR/../vlmeval/vlm/misc/
"""
pip install gradio # proxy_on first
python vis_geochat_data.py
# browse data in http://127.0.0.1:10064
"""
import os
import io
import json
import copy
import time
import gradio as gr
import base64
from PIL import Image
from io import BytesIO
from argparse import Namespace
# from llava import conversation as conversation_lib
from typing import Sequence
from vlmeval import *
from vlmeval.dataset import SUPPORTED_DATASETS, build_dataset
SYS = "You are a helpful assistant. Your job is to faithfully translate all provided text into Chinese faithfully. "
# Translator = SiliconFlowAPI(model='Qwen/Qwen2.5-7B-Instruct', system_prompt=SYS)
Translator = OpenAIWrapper(model='gpt-4o-mini', system_prompt=SYS)
def image_to_mdstring(image):
return f"![image](data:image/jpeg;base64,{image})"
def images_to_md(images):
return '\n\n'.join([image_to_mdstring(image) for image in images])
def mmqa_display(question, target_size=768):
question = {k.lower() if len(k) > 1 else k: v for k, v in question.items()}
keys = list(question.keys())
keys = [k for k in keys if k not in ['index', 'image']]
idx = question.pop('index', 'XXX')
text = f'\n- INDEX: {idx}\n'
images = question.pop('image')
if images[0] == '[' and images[-1] == ']':
images = eval(images)
else:
images = [images]
qtext = question.pop('question', None)
if qtext is not None:
text += f'- QUESTION: {qtext}\n'
if 'A' in question:
text += f'- Choices: \n'
for k in string.ascii_uppercase:
if k in question:
text += f'\t-{k}: {question.pop(k)}\n'
answer = question.pop('answer', None)
for k in question:
if not pd.isna(question[k]):
text += f'- {k.upper()}. {question[k]}\n'
if answer is not None:
text += f'- ANSWER: {answer}\n'
image_md = images_to_md(images)
return text, image_md
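# mmqa_display renders a single sample as a (markdown_text, image_markdown) pair; images stored
# as base64 strings (or a stringified list of them) are embedded inline via data URIs.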
def parse_args():
parser = argparse.ArgumentParser()
# Essential Args, Setting the Names of Datasets and Models
parser.add_argument('--port', type=int, default=7860)
args = parser.parse_args()
return args
def gradio_app_vis_dataset(port=7860):
data, loaded_obj = None, {}
def btn_submit_click(filename, ann_id):
if filename not in loaded_obj:
return filename_change(filename, ann_id)
nonlocal data
data_desc = gr.Markdown(f'Visualizing {filename}, {len(data)} samples in total. ')
if ann_id < 0 or ann_id >= len(data):
return filename, ann_id, data_desc, gr.Markdown('Invalid Index'), gr.Markdown(f'Index out of range [0, {len(data) - 1}]')
item = data.iloc[ann_id]
text, image_md = mmqa_display(item)
return filename, ann_id, data_desc, image_md, text
def btn_next_click(filename, ann_id):
return btn_submit_click(filename, ann_id + 1)
# def translate_click(anno_en):
# return gr.Markdown(Translator.generate(anno_en))
def filename_change(filename, ann_id):
nonlocal data, loaded_obj
def legal_filename(filename):
LMURoot = LMUDataRoot()
if filename in SUPPORTED_DATASETS:
return build_dataset(filename).data
elif osp.exists(filename):
data = load(filename)
assert 'index' in data and 'image' in data
image_map = {i: image for i, image in zip(data['index'], data['image'])}
for k, v in image_map.items():
if (not isinstance(v, str) or len(v) < 64) and v in image_map:
image_map[k] = image_map[v]
data['image'] = [image_map[k] for k in data['index']]
return data
elif osp.exists(osp.join(LMURoot, filename)):
filename = osp.join(LMURoot, filename)
return legal_filename(filename)
else:
return None
data = legal_filename(filename)
if data is None:
return filename, 0, gr.Markdown(''), gr.Markdown("File not found"), gr.Markdown("File not found")
loaded_obj[filename] = data
return btn_submit_click(filename, 0)
with gr.Blocks() as app:
filename = gr.Textbox(
value='Dataset Name (supported by VLMEvalKit) or TSV FileName (Relative under `LMURoot` or Real Path)',
label='Dataset',
interactive=True,
visible=True)
with gr.Row():
ann_id = gr.Number(0, label='Sample Index (Press Enter)', interactive=True, visible=True)
btn_next = gr.Button("Next")
# btn_translate = gr.Button('CN Translate')
with gr.Row():
data_desc = gr.Markdown('Dataset Description', label='Dataset Description')
with gr.Row():
image_output = gr.Markdown('Image PlaceHolder', label='Image Visualization')
anno_en = gr.Markdown('Image Annotation', label='Image Annotation')
# anno_cn = gr.Markdown('Image Annotation (Chinese)', label='Image Annotation (Chinese)')
input_components = [filename, ann_id]
all_components = [filename, ann_id, data_desc, image_output, anno_en]
filename.submit(filename_change, input_components, all_components)
ann_id.submit(btn_submit_click, input_components, all_components)
btn_next.click(btn_next_click, input_components, all_components)
# btn_translate.click(translate_click, anno_en, anno_cn)
# app.launch()
app.launch(server_name='0.0.0.0', debug=True, show_error=True, server_port=port)
if __name__ == "__main__":
args = parse_args()
gradio_app_vis_dataset(port=args.port)
from vlmeval.smp import *
from vlmeval.tools import EVAL
import gradio as gr
HEADER = """
# Welcome to MMBench👏👏
We are delighted that you are willing to submit the evaluation results to the MMBench official website! The evaluation service can currently handle submissions of MMBench, MMBench-CN, and CCBench. We use `gpt-3.5-turbo-0125` to assist with answer matching. Evaluation Codes in VLMEvalKit: https://github.com/open-compass/VLMEvalKit. Please adopt / follow the implementation of VLMEvalKit to generate the submission files.
The evaluation script is available at https://github.com/open-compass/VLMEvalKit/tree/main/scripts/mmb_eval_gradio.py
Please contact `opencompass@pjlab.org.cn` for any inquiries about this script.
"""
def upload_file(file):
file_path = file.name
return file_path
def prepare_file(file_name):
file_md5 = md5(file_name)
root = LMUDataRoot()
root = osp.join(root, 'eval_server')
os.makedirs(root, exist_ok=True)
suffix = file_name.split('.')[-1]
if suffix not in ['xlsx', 'tsv', 'csv']:
return False, "Please submit a file that ends with `.xlsx`, `.tsv`, or `.csv`"
new_file_name = osp.join(root, f'{file_md5}.{suffix}')
shutil.move(file_name, new_file_name)
eval_file = new_file_name
try:
data = load(eval_file)
except:
return False, "Your excel file can not be successfully loaded by `pd.read_excel`, please double check and submit again. "
for k in data.keys():
data[k.lower() if k not in 'ABCD' else k] = data.pop(k)
if "index" not in data:
return False, "Your excel file should have a column named `index`, please double check and submit again" , {}
if "prediction" not in data:
return False, "Your excel file should have a column named `prediction`, please double check and submit again" , {}
for ch in 'ABCD':
if ch not in data:
return False, f"Your excel file should have a column named `{ch}`, please double check and submit again" , {}
dump(data, eval_file)
return True, eval_file
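# The function below guesses which benchmark a submission targets: the largest question index and
# the presence of the `l2-category` column pick the benchmark (< 1000 without l2-category: CCBench;
# < 3000: MMBench v1.0; otherwise: MMBench v1.1), while the share of Chinese questions decides
# between the CN and EN variants.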
def determine_dataset(eval_file):
data = load(eval_file)
def cn_ratio(data):
iscn = [cn_string(x) for x in data['question']]
return np.mean(iscn)
max_ind = np.max([int(x) for x in data['index'] if int(x) < 1e5])
if max_ind < 1000 and 'l2-category' not in data:
return 'CCBench' if cn_ratio(data) > 0.5 else "Unknown"
elif max_ind < 3000 :
return 'MMBench_CN' if cn_ratio(data) > 0.5 else "MMBench"
else:
return 'MMBench_CN_V11' if cn_ratio(data) > 0.5 else "MMBench_V11"
def reformat_acc(acc):
splits = set(acc['split'])
keys = list(acc.keys())
keys.remove('split')
nacc = {'Category': []}
for sp in splits:
nacc[sp.upper()] = []
for k in keys:
nacc['Category'].append(k)
for sp in splits:
nacc[sp.upper()].append(acc[acc['split'] == sp].iloc[0][k] * 100)
return pd.DataFrame(nacc)
def evaluate(file):
file_name = file.name
flag, eval_file = prepare_file(file_name)
if not flag:
return "Error: " + eval_file
dataset = determine_dataset(eval_file)
if dataset == 'Unknown':
return "Error: Cannot determine the dataset given your submitted file. "
eval_id = eval_file.split('/')[-1].split('.')[0]
ret = f"Evaluation ID: {eval_id}\n"
timestamp = datetime.datetime.now().strftime('%Y.%m.%d %H:%M:%S')
ret += f'Evaluation Timestamp: {timestamp}\n'
acc = EVAL(dataset, eval_file)
nacc = reformat_acc(acc).round(1)
return ret, nacc
with gr.Blocks() as demo:
gr.Markdown(HEADER)
file_output = gr.File()
    upload_button = gr.UploadButton("Click to upload your prediction files for a supported benchmark")
upload_button.upload(upload_file, upload_button, file_output)
btn = gr.Button("🚀 Evaluate")
eval_log = gr.Textbox(label="Evaluation Log", placeholder="Your evaluation log will be displayed here")
df_empty = pd.DataFrame([], columns=['Evaluation Result'])
eval_result = gr.components.DataFrame(value=df_empty)
btn.click(evaluate, inputs=[file_output], outputs=[eval_log, eval_result])
if __name__ == '__main__':
demo.launch(server_name='0.0.0.0', debug=True, show_error=True)
#!/bin/bash
set -x
export GPU=$(nvidia-smi --list-gpus | wc -l)
torchrun --nproc-per-node=$GPU run.py ${@:1}
#!/bin/bash
set -x
srun -n1 --ntasks-per-node=1 --partition $1 --gres=gpu:8 --quotatype=reserved --job-name vlmeval --cpus-per-task=64 torchrun --nproc-per-node=8 run.py ${@:2}
from vlmeval.smp import *
from vlmeval.dataset import SUPPORTED_DATASETS
def get_score(model, dataset):
file_name = f'{model}/{model}_{dataset}'
if listinstr([
'CCBench', 'MMBench', 'SEEDBench_IMG', 'MMMU', 'ScienceQA',
'AI2D_TEST', 'MMStar', 'RealWorldQA', 'BLINK', 'VisOnlyQA-VLMEvalKit'
], dataset):
file_name += '_acc.csv'
elif listinstr(['MME', 'Hallusion', 'LLaVABench'], dataset):
file_name += '_score.csv'
elif listinstr(['MMVet', 'MathVista'], dataset):
file_name += '_gpt-4-turbo_score.csv'
elif listinstr(['COCO', 'OCRBench'], dataset):
file_name += '_score.json'
else:
raise NotImplementedError
if not osp.exists(file_name):
return {}
data = load(file_name)
ret = {}
if dataset == 'CCBench':
ret[dataset] = data['Overall'][0] * 100
elif dataset == 'MMBench':
for n, a in zip(data['split'], data['Overall']):
if n == 'dev':
ret['MMBench_DEV_EN'] = a * 100
elif n == 'test':
ret['MMBench_TEST_EN'] = a * 100
elif dataset == 'MMBench_CN':
for n, a in zip(data['split'], data['Overall']):
if n == 'dev':
ret['MMBench_DEV_CN'] = a * 100
elif n == 'test':
ret['MMBench_TEST_CN'] = a * 100
elif listinstr(['SEEDBench', 'ScienceQA', 'MMBench', 'AI2D_TEST', 'MMStar', 'RealWorldQA', 'BLINK'], dataset):
ret[dataset] = data['Overall'][0] * 100
elif 'MME' == dataset:
ret[dataset] = data['perception'][0] + data['reasoning'][0]
elif 'MMVet' == dataset:
data = data[data['Category'] == 'Overall']
ret[dataset] = float(data.iloc[0]['acc'])
elif 'HallusionBench' == dataset:
data = data[data['split'] == 'Overall']
for met in ['aAcc', 'qAcc', 'fAcc']:
ret[dataset + f' ({met})'] = float(data.iloc[0][met])
elif 'MMMU' in dataset:
data = data[data['split'] == 'validation']
ret['MMMU (val)'] = float(data.iloc[0]['Overall']) * 100
elif 'MathVista' in dataset:
data = data[data['Task&Skill'] == 'Overall']
ret[dataset] = float(data.iloc[0]['acc'])
elif 'LLaVABench' in dataset:
data = data[data['split'] == 'overall'].iloc[0]
ret[dataset] = float(data['Relative Score (main)'])
elif 'OCRBench' in dataset:
ret[dataset] = data['Final Score']
elif dataset == 'VisOnlyQA-VLMEvalKit':
for n, a in zip(data['split'], data['Overall']):
ret[f'VisOnlyQA-VLMEvalKit_{n}'] = a * 100
return ret
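# Illustrative usage (the script name is a placeholder; run it from the work directory that holds
# one sub-folder of results per model, and use the same model / dataset names as run.py):
#   python summarize.py --model qwen_chat --data MMBench MME MMVet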
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--data', type=str, nargs='+', default=[])
parser.add_argument("--model", type=str, nargs='+', required=True)
args = parser.parse_args()
return args
def gen_table(models, datasets):
res = defaultdict(dict)
for m in models:
for d in datasets:
try:
res[m].update(get_score(m, d))
except Exception as e:
logging.warning(f'{type(e)}: {e}')
logging.warning(f'Missing Results for Model {m} x Dataset {d}')
keys = []
for m in models:
for d in res[m]:
keys.append(d)
keys = list(set(keys))
keys.sort()
final = defaultdict(list)
for m in models:
final['Model'].append(m)
for k in keys:
if k in res[m]:
final[k].append(res[m][k])
else:
final[k].append(None)
final = pd.DataFrame(final)
dump(final, 'summ.csv')
if len(final) >= len(final.iloc[0].keys()):
print(tabulate(final))
else:
print(tabulate(final.T))
if __name__ == '__main__':
args = parse_args()
if args.data == []:
args.data = list(SUPPORTED_DATASETS)
gen_table(args.model, args.data)
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import copy as cp\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.font_manager as fm\n",
"\n",
"def download_file(url, filename=None):\n",
" from urllib.request import urlretrieve\n",
" if filename is None:\n",
" filename = url.split('/')[-1]\n",
" urlretrieve(url, filename)\n",
"\n",
"font_URL = 'http://opencompass.openxlab.space/utils/Fonts/segoepr.ttf'\n",
"download_file(font_URL)\n",
"\n",
"font12 = fm.FontProperties(fname='segoepr.ttf', size=12)\n",
"font15 = fm.FontProperties(fname='segoepr.ttf', size=15, weight='bold')\n",
"font18 = fm.FontProperties(fname='segoepr.ttf', size=18, weight='bold')\n",
"\n",
"DATA_URL = 'http://opencompass.openxlab.space/utils/OpenVLM.json'\n",
"download_file(DATA_URL)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def pre_normalize(raw_data, labels):\n",
" data_list = cp.deepcopy(raw_data)\n",
" minimum, maximum, max_range, range_map = {}, {}, 0, {}\n",
" for lb in labels:\n",
" minimum[lb] = min([x[lb] for x in data_list])\n",
" maximum[lb] = max([x[lb] for x in data_list])\n",
" max_range = max(max_range, maximum[lb] - minimum[lb])\n",
" max_range *= 1.25\n",
" for lb in labels:\n",
" mid = (minimum[lb] + maximum[lb]) / 2\n",
" new_range = (mid - max_range / 2, mid + max_range / 2) if (mid + max_range / 2) < 100 else (100 - max_range, 100)\n",
" range_map[lb] = new_range\n",
" for item in data_list:\n",
" assert new_range[0] <= item[lb] <= new_range[1]\n",
" item[lb] = (item[lb] - new_range[0]) / max_range * 100\n",
" return data_list, range_map\n",
"\n",
"# solve the problem that some benchmark score is too high and out of range\n",
"def log_normalize(raw_data, labels):\n",
" data_list = cp.deepcopy(raw_data)\n",
" minimum, maximum, max_range, range_map = {}, {}, 0, {}\n",
" for lb in labels:\n",
" minimum[lb] = min([np.log(x[lb]) for x in data_list])\n",
" maximum[lb] = max([np.log(x[lb]) for x in data_list])\n",
" max_range = max(max_range, maximum[lb] - minimum[lb])\n",
" max_range *= 1.005\n",
" for lb in labels:\n",
" mid = (minimum[lb] + maximum[lb]) / 2\n",
" new_range = (mid - max_range / 2, mid + max_range / 2) if (mid + max_range / 2) < 100 else (100 - max_range, 100)\n",
" range_map[lb] = new_range\n",
" for item in data_list:\n",
" assert new_range[0] <= np.log(item[lb]) <= new_range[1]\n",
" item[lb] = (np.log(item[lb]) - new_range[0]) / max_range * 100\n",
" return data_list, range_map"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Draw MMBench Radar Graph\n",
"data = json.loads(open('OpenVLM.json').read())['results']\n",
"models = list(data)\n",
"print(models)\n",
"\n",
"# model2vis = [\n",
"# 'GPT-4v (detail: low)', 'GeminiProVision', 'Qwen-VL-Plus', \n",
"# 'InternLM-XComposer2-VL', 'LLaVA-v1.5-13B', 'CogVLM-17B-Chat',\n",
"# 'mPLUG-Owl2', 'Qwen-VL-Chat', 'IDEFICS-80B-Instruct'\n",
"# ]\n",
"\n",
"model2vis = [\n",
" # 'GPT-4v (detail: low)', 'GeminiProVision', 'InternLM-XComposer2-VL', \n",
" 'GPT-4v (1106, detail-low)', 'Gemini-1.0-Pro', 'Gemini-1.5-Pro', #'Gemini-1.5-Flash', 'Qwen-VL-Plus', \n",
" 'InternLM-XComposer2', 'LLaVA-v1.5-13B', 'CogVLM-17B-Chat',\n",
" 'mPLUG-Owl2', 'Qwen-VL-Chat', 'IDEFICS-80B-Instruct'\n",
"]\n",
"\n",
"colors = [\n",
" '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', \n",
" '#e377c2', '#7f7f7f', '#bcbd22'\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict\n",
"\n",
"split = 'MMBench_TEST_EN'\n",
"# data_sub = {k: v[split] for k, v in data.items()}\n",
"data_sub = {k: defaultdict(int, v)[split] for k, v in data.items()}\n",
"# solve the problem that some model lack the evaluation of MMBench_TEST_EN\n",
"\n",
"labels = list(data_sub[model2vis[0]])\n",
"labels.remove('Overall')\n",
"num_vars = len(labels)\n",
"\n",
"raw_data = [data_sub[m] for m in model2vis]\n",
"data_list, range_map = pre_normalize(raw_data, labels)\n",
"\n",
"alpha = 0.25\n",
"angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()\n",
"angles_deg = np.linspace(0, 360, num_vars, endpoint=False).tolist()\n",
"fig, ax_base = plt.subplots(nrows=1, ncols=1, figsize=(10, 10), subplot_kw=dict(polar=True))\n",
"\n",
"for i in range(len(data_list)):\n",
" item = data_list[i]\n",
" model_name = model2vis[i]\n",
" color = colors[i]\n",
" tmp_angles = angles[:] + [angles[0]]\n",
" tmp_values = [item[lb] for lb in labels] + [item[labels[0]]]\n",
" ax_base.plot(tmp_angles, tmp_values, color=color, linewidth=1, linestyle='solid', label=model_name)\n",
" ax_base.fill(tmp_angles, tmp_values, color=color, alpha=alpha)\n",
" \n",
"angles += [angles[0]]\n",
"ax_base.set_ylim(0, 100)\n",
"ax_base.set_yticks([40, 60, 80, 100])\n",
"ax_base.set_yticklabels([''] * 4)\n",
"\n",
"ax_base.tick_params(pad=25)\n",
"ax_base.set_xticks(angles[:-1])\n",
"ax_base.set_xticklabels(labels, fontproperties=font18)\n",
"\n",
"leg = ax_base.legend(loc='center right', bbox_to_anchor=(1.6, 0.5), prop=font15, ncol=1, frameon=True, labelspacing=1.2)\n",
"for line in leg.get_lines():\n",
" line.set_linewidth(2.5)\n",
"\n",
"cx, cy, sz = 0.44, 0.435, 0.34\n",
"axes = [fig.add_axes([cx - sz, cy - sz, cx + sz, cy + sz], projection='polar', label='axes%d' % i) for i in range(num_vars)]\n",
" \n",
"for ax, angle, label in zip(axes, angles_deg, labels):\n",
" ax.patch.set_visible(False)\n",
" ax.grid(False)\n",
" ax.xaxis.set_visible(False)\n",
" cur_range = range_map[label]\n",
" label_list = [cur_range[0] + (cur_range[1] - cur_range[0]) / 5 * i for i in range(2, 6)]\n",
" label_list = [f'{x:.1f}' for x in label_list]\n",
" ax.set_rgrids(range(40, 120, 20), angle=angle, labels=label_list, font_properties=font12)\n",
" ax.spines['polar'].set_visible(False)\n",
" ax.set_ylim(0, 100)\n",
"\n",
"title_text = f'{len(model2vis)} Representative VLMs on MMBench Test.'\n",
"plt.figtext(.7, .95, title_text, fontproperties=font18, ha='center')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"labels = ['SEEDBench_IMG', 'CCBench', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'MME', 'MMVet', 'MMMU_VAL', 'MathVista', 'HallusionBench', 'LLaVABench']\n",
"num_vars = len(labels)\n",
"\n",
"raw_data = [{k: data[m][k]['Overall'] for k in labels} for m in model2vis]\n",
"data_list, range_map = pre_normalize(raw_data, labels)\n",
"\n",
"alpha = 0.25\n",
"angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()\n",
"angles_deg = np.linspace(0, 360, num_vars, endpoint=False).tolist()\n",
"fig, ax_base = plt.subplots(nrows=1, ncols=1, figsize=(10, 10), subplot_kw=dict(polar=True))\n",
"\n",
"for i in range(len(data_list)):\n",
" item = data_list[i]\n",
" model_name = model2vis[i]\n",
" color = colors[i]\n",
" tmp_angles = angles[:] + [angles[0]]\n",
" tmp_values = [item[lb] for lb in labels] + [item[labels[0]]]\n",
" ax_base.plot(tmp_angles, tmp_values, color=color, linewidth=1, linestyle='solid', label=model_name)\n",
" ax_base.fill(tmp_angles, tmp_values, color=color, alpha=alpha)\n",
" \n",
"angles += [angles[0]]\n",
"ax_base.set_ylim(0, 100)\n",
"ax_base.set_yticks([40, 60, 80, 100])\n",
"ax_base.set_yticklabels([''] * 4)\n",
"\n",
"ax_base.tick_params(pad=15)\n",
"ax_base.set_xticks(angles[:-1])\n",
"ax_base.set_xticklabels(labels, fontproperties=font18)\n",
"\n",
"dataset_map = {\n",
" 'MMBench_TEST_EN': 'MMBench (Test)', \n",
" 'MMBench_TEST_CN': 'MMBenchCN (Test)', \n",
" 'MathVista': 'MathVista (TestMini)', \n",
" 'MMMU_VAL': 'MMMU (Val)'\n",
"}\n",
"for i, label in enumerate(ax_base.get_xticklabels()):\n",
" x,y = label.get_position()\n",
" text = label.get_text()\n",
" text = dataset_map[text] if text in dataset_map else text\n",
" lab = ax_base.text(x, y, text, transform=label.get_transform(),\n",
" ha=label.get_ha(), va=label.get_va(), font_properties=font15)\n",
" lab.set_rotation(360 / num_vars * i + 270)\n",
" labels.append(lab)\n",
"ax_base.set_xticklabels([])\n",
"\n",
"leg = ax_base.legend(loc='center right', bbox_to_anchor=(1.6, 0.5), prop=font15, ncol=1, frameon=True, labelspacing=1.2)\n",
"for line in leg.get_lines():\n",
" line.set_linewidth(2.5)\n",
"\n",
"cx, cy, sz = 0.44, 0.435, 0.34\n",
"axes = [fig.add_axes([cx - sz, cy - sz, cx + sz, cy + sz], projection='polar', label='axes%d' % i) for i in range(num_vars)]\n",
" \n",
"for ax, angle, label in zip(axes, angles_deg, labels):\n",
" ax.patch.set_visible(False)\n",
" ax.grid(False)\n",
" ax.xaxis.set_visible(False)\n",
" cur_range = range_map[label]\n",
" label_list = [cur_range[0] + (cur_range[1] - cur_range[0]) / 5 * i for i in range(2, 6)]\n",
" label_list = [f'{x:.1f}' for x in label_list]\n",
" ax.set_rgrids(range(40, 120, 20), angle=angle, labels=label_list, font_properties=font12)\n",
" ax.spines['polar'].set_visible(False)\n",
" ax.set_ylim(0, 100)\n",
"\n",
"title_text = f'{len(model2vis)} Representative VLMs on {num_vars} Benchmarks in OpenCompass Multi-Modal Leaderboard.'\n",
"plt.figtext(.7, .95, title_text, fontproperties=font18, ha='center')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
import re
import sys
from os.path import exists
from setuptools import find_packages, setup
def parse_requirements(fname='requirements.txt', with_version=True):
"""Parse the package dependencies listed in a requirements file but strips
specific versioning information.
Args:
fname (str): path to requirements file
        with_version (bool, default=True): if True include version specs
Returns:
List[str]: list of requirements items
CommandLine:
python -c "import setup; print(setup.parse_requirements())"
"""
require_fpath = fname
def parse_line(line):
"""Parse information from a line in a requirements text file."""
if line.startswith('-r '):
# Allow specifying requirements in other files
target = line.split(' ')[1]
for info in parse_require_file(target):
yield info
else:
info = {'line': line}
if line.startswith('-e '):
info['package'] = line.split('#egg=')[1]
elif '@git+' in line:
info['package'] = line
else:
# Remove versioning from the package
pat = '(' + '|'.join(['>=', '==', '>']) + ')'
parts = re.split(pat, line, maxsplit=1)
parts = [p.strip() for p in parts]
info['package'] = parts[0]
if len(parts) > 1:
op, rest = parts[1:]
if ';' in rest:
# Handle platform specific dependencies
# http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies
version, platform_deps = map(str.strip,
rest.split(';'))
info['platform_deps'] = platform_deps
else:
version = rest # NOQA
info['version'] = (op, version)
yield info
def parse_require_file(fpath):
with open(fpath, 'r') as f:
for line in f.readlines():
line = line.strip()
if line and not line.startswith('#'):
for info in parse_line(line):
yield info
def gen_packages_items():
if exists(require_fpath):
for info in parse_require_file(require_fpath):
parts = [info['package']]
if with_version and 'version' in info:
parts.extend(info['version'])
if not sys.version.startswith('3.4'):
# apparently package_deps are broken in 3.4
platform_deps = info.get('platform_deps')
if platform_deps is not None:
parts.append(';' + platform_deps)
item = ''.join(parts)
yield item
packages = list(gen_packages_items())
return packages
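# For example, a line like 'opencv-python>=4.4.0.46' is kept as-is when with_version is True and
# reduced to 'opencv-python' otherwise; environment markers after ';' are carried through.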
with open('README.md') as f:
readme = f.read()
def do_setup():
setup(
name='vlmeval',
version='0.1.0',
description='OpenCompass VLM Evaluation Kit',
author='Haodong Duan',
author_email='dhd.efz@gmail.com',
maintainer='Haodong Duan',
maintainer_email='dhd.efz@gmail.com',
long_description=readme,
long_description_content_type='text/markdown',
cmdclass={},
install_requires=parse_requirements('requirements.txt'),
setup_requires=[],
python_requires='>=3.7.0',
packages=find_packages(exclude=[
'test*',
'paper_test*',
]),
keywords=['AI', 'NLP', 'in-context learning'],
entry_points={
'console_scripts': ['vlmutil = vlmeval:cli']
},
classifiers=[
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Intended Audience :: Developers',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
])
if __name__ == '__main__':
do_setup()