Commit 26e59280 authored by wanglch

Initial commit
import json

import pandas as pd

# Make a new aggregated "vtab" entry from the per-dataset linear-probe results.
if __name__ == '__main__':
    df = pd.read_json('probe_benchmark/scaling_experiment_data2.json')
    # Keep only full-data runs (fewshot_k == -1 means "use all training samples").
    df = df[df.fewshot_k == -1]
    datasets = [
        'vtab/caltech101',
        'vtab/cifar10',
        'vtab/cifar100',
        'vtab/clevr_count_all',
        'vtab/clevr_closest_object_distance',
        'vtab/diabetic_retinopathy',
        'vtab/dmlab',
        'vtab/dsprites_label_orientation',
        'vtab/dsprites_label_x_position',
        'vtab/dtd',
        'vtab/eurosat',
        'vtab/kitti_closest_vehicle_distance',
        'vtab/flowers',
        'vtab/pets',
        'vtab/pcam',
        'vtab/resisc45',
        'vtab/smallnorb_label_azimuth',
        'vtab/smallnorb_label_elevation',
        'vtab/svhn',
    ]
    all_info = []
    for n, g in df.groupby(['model', 'pretrained', 'samples_seen_pretty']):
        count = 0
        total = 0.
        for d in datasets:
            g_filter = g[g.dataset == d]
            count += 1
            # Best linear-probe top-1 accuracy over the hyperparameter sweep;
            # assumes every VTAB dataset is present in each group.
            total += g_filter.lp_acc1.max()
        avg = total / count
        info = {'dataset': 'vtab', 'lp_acc1': avg, 'fewshot_k': -1}
        for k in ['model', 'pretrained', 'upstream_dataset', 'gmacs_total', 'samples_seen_pretty']:
            info[k] = g[k].values[0]
        all_info.append(info)
    with open('probe_benchmark/scaling_experiment_data_vtab.json', 'w') as f:
        json.dump(all_info, f)
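# A minimal sketch (not part of the commit) of how the aggregated file written
# above can be consumed: load it back with pandas and pivot VTAB accuracy
# against pretraining scale. Column names match the JSON produced above; the
# pivot layout itself is only an illustrative choice.
import pandas as pd

vtab = pd.read_json('probe_benchmark/scaling_experiment_data_vtab.json')
print(vtab.pivot_table(index=['model', 'pretrained'],
                       columns='samples_seen_pretty',
                       values='lp_acc1'))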
[
  {
    "dataset": "vtab",
    "lp_acc1": 0.7272385796110142,
    "fewshot_k": -1,
    "model": "ViT-B-16",
    "pretrained": "laion400m_e32",
    "upstream_dataset": "LAION-400M",
    "gmacs_total": 268122270972.16,
    "samples_seen_pretty": "13B"
  },
  {
    "dataset": "vtab",
    "lp_acc1": 0.7125347395825511,
    "fewshot_k": -1,
    "model": "ViT-B-16",
    "pretrained": "openai",
    "upstream_dataset": "CLIP-WIT",
    "gmacs_total": 263296000000.0,
    "samples_seen_pretty": "13B"
  },
  {
    "dataset": "vtab",
    "lp_acc1": 0.7332202011443508,
    "fewshot_k": -1,
    "model": "ViT-B-16-plus-240",
    "pretrained": "laion400m_e32",
    "upstream_dataset": "LAION-400M",
    "gmacs_total": 370313744206.08,
    "samples_seen_pretty": "13B"
  },
  {
    "dataset": "vtab",
    "lp_acc1": 0.7143166719197058,
    "fewshot_k": -1,
    "model": "ViT-B-32",
    "pretrained": "laion2b_e16",
    "upstream_dataset": "LAION-2B",
    "gmacs_total": 256967931347.2,
    "samples_seen_pretty": "34B"
  },
  {
    "dataset": "vtab",
    "lp_acc1": 0.7152995214130362,
    "fewshot_k": -1,
    "model": "ViT-B-32",
    "pretrained": "laion2b_s34b_b79k",
    "upstream_dataset": "LAION-2B",
    "gmacs_total": 291096483388.0,
    "samples_seen_pretty": "34B"
  },
  {
    "dataset": "vtab",
    "lp_acc1": 0.7183753019516755,
    "fewshot_k": -1,
    "model": "ViT-B-32",
    "pretrained": "laion400m_e32",
    "upstream_dataset": "LAION-400M",
    "gmacs_total": 96456237491.2,
    "samples_seen_pretty": "13B"
  },
  {
    "dataset": "vtab",
    "lp_acc1": 0.6971394911855741,
    "fewshot_k": -1,
    "model": "ViT-B-32",
    "pretrained": "openai",
    "upstream_dataset": "CLIP-WIT",
    "gmacs_total": 94720000000.0,
    "samples_seen_pretty": "13B"
  },
  {
    "dataset": "vtab",
    "lp_acc1": 0.7596462313700938,
    "fewshot_k": -1,
    "model": "ViT-H-14",
    "pretrained": "laion2b_s32b_b79k",
    "upstream_dataset": "LAION-2B",
    "gmacs_total": 6631508868008.96,
    "samples_seen_pretty": "34B"
  },
  {
    "dataset": "vtab",
    "lp_acc1": 0.744758325311516,
    "fewshot_k": -1,
    "model": "ViT-L-14",
    "pretrained": "laion2b_s32b_b82k",
    "upstream_dataset": "LAION-2B",
    "gmacs_total": 2807360000000.0,
    "samples_seen_pretty": "34B"
  },
  {
    "dataset": "vtab",
    "lp_acc1": 0.7397637678783028,
    "fewshot_k": -1,
    "model": "ViT-L-14",
    "pretrained": "laion400m_e32",
    "upstream_dataset": "LAION-400M",
    "gmacs_total": 1143527799338.24,
    "samples_seen_pretty": "13B"
  },
  {
    "dataset": "vtab",
    "lp_acc1": 0.7376775015037333,
    "fewshot_k": -1,
    "model": "ViT-L-14",
    "pretrained": "openai",
    "upstream_dataset": "CLIP-WIT",
    "gmacs_total": 1122944000000.0,
    "samples_seen_pretty": "13B"
  },
  {
    "dataset": "vtab",
    "lp_acc1": 0.7517780869059744,
    "fewshot_k": -1,
    "model": "ViT-g-14",
    "pretrained": "laion2b_s12b_b42k",
    "upstream_dataset": "LAION-2B",
    "gmacs_total": 3549396664594.8,
    "samples_seen_pretty": "13B"
  }
]
import os

from clip_benchmark.cli import get_parser_args, run

if __name__ == '__main__':
    # Each entry is 'architecture,pretrained-tag' as understood by open_clip.
    models = ['ViT-B-32-quickgelu,laion400m_e32',
              'ViT-B-32,openai',
              'ViT-B-32,laion2b_s34b_b79k',
              'ViT-B-16,laion400m_e32',
              'ViT-B-16-plus-240,laion400m_e32',
              'ViT-B-16,openai',
              'ViT-L-14-336,openai',
              'ViT-L-14,openai',
              'ViT-B-32,laion2b_e16',
              'ViT-L-14,laion400m_e32',
              'ViT-L-14,laion2b_s32b_b82k',
              'ViT-H-14,laion2b_s32b_b79k',
              'ViT-g-14,laion2b_s12b_b42k',
              ]
    datasets = ['imagenet1k-unverified', 'cifar100']
    datasets = datasets + [
        'vtab/caltech101',
        'vtab/cifar10',
        'vtab/cifar100',
        'vtab/clevr_count_all',
        'vtab/clevr_closest_object_distance',
        'vtab/diabetic_retinopathy',
        'vtab/dmlab',
        'vtab/dsprites_label_orientation',
        'vtab/dsprites_label_x_position',
        'vtab/dtd',
        'vtab/eurosat',
        'vtab/kitti_closest_vehicle_distance',
        'vtab/flowers',
        'vtab/pets',
        'vtab/pcam',
        'vtab/resisc45',
        'vtab/smallnorb_label_azimuth',
        'vtab/smallnorb_label_elevation',
        'vtab/svhn',
    ]
    # Hyperparameter grid: few-shot k (-1 = full data), learning rate, epochs.
    ks = [10, 25, -1]
    lrs = [0.1, 0.01, 0.001]
    epoch_vals = [10, 20, 40]
    batch_sizes = [32 * 8]
    if not os.path.exists('probe_benchmark/data'):
        os.mkdir('probe_benchmark/data')
    for dataset in datasets:
        dataset_root = 'datasets/' + dataset.split('/')[-1]  # TODO: change!
        print(dataset_root)
        for model_info in models:
            model, pretrained = model_info.split(',')
            for epochs in epoch_vals:
                for k in ks:
                    # For VTAB, do not run >= 25 shot.
                    if k >= 25 and dataset.startswith('vtab'):
                        continue
                    for lr in lrs:
                        for bs in batch_sizes:
                            args = get_parser_args()
                            args.dataset_root = dataset_root
                            args.dataset = dataset
                            args.task = 'linear_probe'
                            args.pretrained = pretrained
                            args.model = model
                            # Slashes in dataset names would otherwise create subdirectories.
                            args.output = 'probe_benchmark/data/' + f'{model}-{pretrained}-{dataset}-{epochs}-{k}-{lr}-{bs}.json'.replace('/', '_')
                            if os.path.exists(args.output):
                                print('skipping - exists.')
                                continue
                            args.fewshot_k = k
                            args.fewshot_epochs = epochs
                            args.fewshot_lr = lr
                            args.batch_size = bs
                            run(args)
                            print(dataset, model, pretrained, epochs, k, lr, bs)
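# A minimal sketch (not part of the commit) of how the per-run JSON files written
# above could be gathered back into one table like scaling_experiment_data2.json.
# The exact result schema comes from clip_benchmark; the flattening below is an
# assumption, so adjust the keys to whatever the result files actually contain.
import glob
import json

import pandas as pd

rows = []
for path in glob.glob('probe_benchmark/data/*.json'):
    with open(path) as f:
        result = json.load(f)
    # Hypothetical flattening: merge top-level fields with a nested metrics dict.
    row = {k: v for k, v in result.items() if k != 'metrics'}
    row.update(result.get('metrics', {}))
    rows.append(row)
df = pd.DataFrame(rows)
print(df.head())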
open_clip_torch>=0.2.1
opencv-python
peft>=0.6.2
protobuf==3.20.3
pycocoevalcap
pyyaml
scikit-learn>=1.0,<2
scipy
task_adaptation
tensorflow==2.11.0
termcolor
tqdm>=2
transformers>=4.32.0
webdataset>=0.2.31
yacs
[bumpversion]
current_version = 0.1.0
commit = True
tag = True

[bumpversion:file:setup.py]
search = version='{current_version}'
replace = version='{new_version}'

[bumpversion:file:clip_benchmark/__init__.py]
search = __version__ = '{current_version}'
replace = __version__ = '{new_version}'

[bdist_wheel]
universal = 1

[flake8]
exclude = docs
#!/usr/bin/env python
"""The setup script."""
from setuptools import find_packages, setup

with open('README.md') as readme_file:
    readme = readme_file.read()

with open('HISTORY.rst') as history_file:
    history = history_file.read()


def load_requirements(f):
    with open(f) as fh:
        return [line.strip() for line in fh.readlines()]


requirements = load_requirements('requirements.txt')
test_requirements = requirements + ['pytest', 'pytest-runner']

setup(
    author='Mehdi Cherti',
    author_email='mehdicherti@gmail.com',
    python_requires='>=3.6',
    classifiers=[
        'Development Status :: 2 - Pre-Alpha',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Natural Language :: English',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
    ],
    description='CLIP-like models benchmarks on various datasets',
    entry_points={
        'console_scripts': [
            'clip_benchmark=clip_benchmark.cli:main',
            'clip_benchmark_export_wds=clip_benchmark.webdataset_builder:main',
        ],
    },
    install_requires=requirements,
    license='MIT license',
    long_description=readme + '\n\n' + history,
    long_description_content_type='text/markdown',
    include_package_data=True,
    keywords='clip_benchmark',
    name='clip_benchmark',
    packages=find_packages(include=['clip_benchmark', 'clip_benchmark.*']),
    test_suite='tests',
    tests_require=test_requirements,
    url='https://github.com/mehdidc/clip_benchmark',
    version='1.4.0',
    zip_safe=False,
    # setuptools expects 'extras_require'; the misspelling 'extra_require'
    # would be ignored (with a warning) and the extras would never install.
    extras_require={
        'vtab': ['task_adaptation==0.1', 'timm>=0.5.4'],
        'tfds': ['tfds-nightly', 'timm>=0.5.4'],
        'coco': ['pycocotools>=2.0.4'],
    },
)
set -x
PARTITION=${PARTITION:-'INTERN4'}
# Aliases are not expanded in non-interactive shells unless explicitly enabled.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "birdsnap" --dataset_root ./data/birdsnap/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "cifar10" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "cifar100" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "food101" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "sun397" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "cars" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "fgvc_aircraft" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "dtd" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "pets" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "caltech101" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "mnist" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "stl10" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "eurosat" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "gtsrb" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "country211" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "pcam" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "renderedsst2" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "fer2013" --dataset_root ./data/fer2013 --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "voc2007" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "vtab/flowers" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "vtab/resisc45" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
set -x
PARTITION=${PARTITION:-'INTERN4'}
# Aliases are not expanded in non-interactive shells unless explicitly enabled.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "it" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "jp" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "ar" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenetv2" --dataset_root ./data/imagenetv2/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet_sketch" --dataset_root ./data/imagenet-sketch/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet-a" --dataset_root ./data/imagenet-a/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet-r" --dataset_root ./data/imagenet-r/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "objectnet" --dataset_root ./data/objectnet-1.0/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
set -x
PARTITION=${PARTITION:-'INTERN4'}
# Aliases are not expanded in non-interactive shells unless explicitly enabled.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
set -x
PARTITION=${PARTITION:-'INTERN4'}
# Aliases are not expanded in non-interactive shells unless explicitly enabled.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
# Multilingual COCO retrieval; the trailing --language flag selects the caption language.
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=en
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=es
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=fr
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=zh
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=it
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=ko
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=ru
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=jp
set -x
PARTITION=${PARTITION:-'INTERN4'}
# Aliases are not expanded in non-interactive shells unless explicitly enabled.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "birdsnap" --dataset_root ./data/birdsnap/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "cifar10" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "cifar100" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "food101" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "sun397" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "cars" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "fgvc_aircraft" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "dtd" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "pets" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "caltech101" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "mnist" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "stl10" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "eurosat" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "gtsrb" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "country211" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "pcam" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "renderedsst2" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "fer2013" --dataset_root ./data/fer2013 --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "voc2007" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "vtab/flowers" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "vtab/resisc45" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
set -x
PARTITION=${PARTITION:-'INTERN4'}
# Aliases are not expanded in non-interactive shells unless explicitly enabled.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "it" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "jp" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "ar" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenetv2" --dataset_root ./data/imagenetv2/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet_sketch" --dataset_root ./data/imagenet-sketch/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet-a" --dataset_root ./data/imagenet-a/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet-r" --dataset_root ./data/imagenet-r/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "objectnet" --dataset_root ./data/objectnet-1.0/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
set -x
PARTITION=${PARTITION:-'INTERN4'}
# Aliases are not expanded in non-interactive shells unless explicitly enabled.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
set -x
PARTITION=${PARTITION:-'INTERN4'}
# Aliases are not expanded in non-interactive shells unless explicitly enabled.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval_hf \
--pretrained ./work_dirs/internvl_stage2_finetune_flickrcn_364_bs1024_ep10 \
--output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_g_retrieval_hf \
--pretrained ./work_dirs/internvl_stage2_finetune_flickrcn_364_bs1024_ep10 \
--output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval_hf \
--pretrained ./work_dirs/internvl_stage2_finetune_flickr_364_bs1024_ep10 \
--output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_g_retrieval_hf \
--pretrained ./work_dirs/internvl_stage2_finetune_flickr_364_bs1024_ep10 \
--output result.json
set -x
PARTITION=${PARTITION:-'INTERN4'}
# Aliases are not expanded in non-interactive shells unless explicitly enabled.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
# Multilingual COCO retrieval; the trailing --language flag selects the caption language.
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=en
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=es
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=fr
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=zh
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=it
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=ko
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=ru
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=jp
#!/usr/bin/env python
"""Tests for `clip_benchmark` package."""
import os

# Force CPU before importing anything that may initialize CUDA.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

from clip_benchmark.cli import run


class base_args:
    # Argparse-style namespace with the defaults needed by cli.run().
    dataset = 'dummy'
    split = 'test'
    model = 'ViT-B-32-quickgelu'
    pretrained = 'laion400m_e32'
    task = 'zeroshot_classification'
    amp = False
    num_workers = 4
    batch_size = 64
    dataset_root = 'root'
    output = 'result.json'
    verbose = True
    root = 'root'
    annotation_file = ''
    seed = 0
    skip_load = False
    language = 'en'
    model_cache_dir = None
    cupl = False
    save_clf = None
    load_clfs = []
    model_type = 'open_clip'
    wds_cache_dir = None
    which = 'eval'
    skip_existing = False


def test_base():
    run(base_args)
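# A minimal sketch (not in the original tests) of how the class-as-namespace
# pattern above can be extended: subclass base_args to override a few fields
# and reuse run() for another configuration. The test name and output path
# below are illustrative, not part of the package.
class small_batch_args(base_args):
    batch_size = 8
    output = 'result_small_batch.json'


def test_small_batch():
    run(small_batch_args)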
[tox]
envlist = py36, py37, py38, flake8

[travis]
python =
    3.8: py38
    3.7: py37
    3.6: py36

[testenv:flake8]
basepython = python
deps = flake8
commands = flake8 clip_benchmark tests

[testenv]
setenv =
    PYTHONPATH = {toxinidir}
commands = python setup.py test