Commit 1d5a34cf authored by wanglch

Initial commit
Steps to run (a consolidated command sketch follows the list).
1. Navigate to `CLIP_benchmark`.
2. Run `export PYTHONPATH=$PWD`.
3. (Optional) To re-run the experiments, run `python probe_benchmark/scaling_experiments.py`. You'll have to change line 51 (the `dataset_root` assignment marked `# TODO: change!`) to point to your data.
4. (Optional) To generate the results, run `python probe_benchmark/build_df_scaling_experiments.py`.
5. (Optional) VTAB requires post-processing to average. Run `python probe_benchmark/process_vtab.py`.
6. Generate plots with `python probe_benchmark/scaling_plot.py`.
7. Generate the table with `python probe_benchmark/generate_table.py`.
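
For convenience, here is a consolidated sketch of the steps above (an illustration only; it assumes the data paths in `probe_benchmark/scaling_experiments.py` have already been edited and that the result JSONs exist before plotting):

```bash
cd CLIP_benchmark
export PYTHONPATH=$PWD
python probe_benchmark/scaling_experiments.py           # optional: re-run the experiments
python probe_benchmark/build_df_scaling_experiments.py  # optional: rebuild the results dataframe
python probe_benchmark/process_vtab.py                  # optional: average the VTAB results
python probe_benchmark/scaling_plot.py                  # generate plots
python probe_benchmark/generate_table.py                # generate the LaTeX table
```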
import json
import os
import pandas as pd
if __name__ == '__main__':
    compute_df = pd.read_csv('probe_benchmark/clip_table_2.csv')
    # mdf = pd.read_csv("https://gist.githubusercontent.com/mehdidc/58dee67cecd5431a80ee3a2346c9c165/raw/45288ebccaacc34a97f580f8bf16fb3274927f2c/gistfile1.txt")
    mdf = pd.read_csv('probe_benchmark/openclip_results.csv')
    info = []
    # import pdb; pdb.set_trace()
    models = [
        'ViT-B-32-quickgelu,laion400m_e32',
        'ViT-B-32,openai',
        'ViT-B-32,laion2b_s34b_b79k',
        'ViT-B-16,laion400m_e32',
        'ViT-B-16-plus-240,laion400m_e32',
        'ViT-B-16,openai',
        # 'ViT-L-14-336,openai',
        'ViT-L-14,openai',
        'ViT-B-32,laion2b_e16',
        'ViT-L-14,laion400m_e32',
        'ViT-L-14,laion2b_s32b_b82k',
        'ViT-H-14,laion2b_s32b_b79k',
        'ViT-g-14,laion2b_s12b_b42k',
    ]
    alt_models = [
        'B/32 400M',
        'B/32 CLIP WIT',
        'B/32 2B',
        'B/16 400M',
        'B/16+ 400M',
        'B/16 CLIP WIT',
        # 'ViT-L-14-336,openai',
        'L/14 CLIP WIT',
        'B/32 2B',
        'L/14 400M',
        'L/14 2B',
        'H/14 2B',
        'g/14 2B',
    ]
    datasets = ['imagenet1k-unverified', 'cifar100']
    datasets = datasets + [
        'vtab/caltech101',
        'vtab/cifar10',
        'vtab/cifar100',
        'vtab/clevr_count_all',
        'vtab/clevr_closest_object_distance',
        'vtab/diabetic_retinopathy',
        'vtab/dmlab',
        'vtab/dsprites_label_orientation',
        'vtab/dsprites_label_x_position',
        'vtab/dtd',
        'vtab/eurosat',
        'vtab/kitti_closest_vehicle_distance',
        'vtab/flowers',
        'vtab/pets',
        'vtab/pcam',
        'vtab/resisc45',
        'vtab/smallnorb_label_azimuth',
        'vtab/smallnorb_label_elevation',
        'vtab/svhn',
    ]
    ks = [10, 25, -1]
    lrs = [0.1, 0.01, 0.001]
    epoch_vals = [10, 20, 40]
    batch_sizes = [32 * 8]

    def get_us_dataset(pretrained):
        if '2b' in pretrained:
            return 'LAION-2B'
        elif 'laion' in pretrained:
            return 'LAION-400M'
        else:
            return 'CLIP-WIT'

    for dataset in datasets:
        dataset_root = '/datasets01/imagenet_full_size/061417' if dataset.startswith(
            'imagenet') else '/private/home/mitchellw/git/forks/CLIP_benchmark'
        for ii, model_info in enumerate(models):
            model_info_split = model_info.split(',')
            model, pretrained = model_info_split[0], model_info_split[1]
            for epochs in epoch_vals:
                for k in ks:
                    if k == 25 and 'vtab' in dataset:
                        continue
                    for lr in lrs:
                        for bs in batch_sizes:
                            pth = '/private/home/mitchellw/git/forks/CLIP_benchmark/probe_benchmark/data/' + f'{model}-{pretrained}-{dataset}-{epochs}-{k}-{lr}-{bs}.json'.replace('/', '_')
                            print(pth)
                            assert os.path.exists(pth)
                            row = {
                                'k': k,
                                'lr': lr,
                                'bs': bs,
                                'epochs': epochs,
                                'model': model.replace('-quickgelu', ''),
                                'pretrained': pretrained,
                                'pretrained_short': 'laion2b' if 'laion2b' in pretrained else pretrained,
                                'pretrained_clean': 'LAION' if 'laion' in pretrained else 'CLIP-WiT',
                                'dataset': dataset,
                                'macts': compute_df[compute_df.model == model.replace('-quickgelu', '')]['image_macts'].values[0],
                                # 'gmacs_total': mdf[mdf.model_fullname_pretty == alt_models[ii]]['gmacs_total'].values[0],
                                # 'samples_seen': mdf[mdf.model_fullname_pretty == alt_models[ii]]['samples_seen'].values[0],
                                'gmacs_total': mdf[mdf.model_fullname == models[ii].replace(',', ' ')]['gmacs_total'].values[0],
                                'samples_seen': mdf[mdf.model_fullname == models[ii].replace(',', ' ')]['samples_seen'].values[0],
                                'samples_seen_pretty': mdf[mdf.model_fullname == models[ii].replace(',', ' ')]['samples_seen_pretty'].values[0],
                                'model_short': models[ii].replace(',', ' '),
                                'upstream_dataset': get_us_dataset(pretrained),
                            }
                            with open(pth, 'r') as f:
                                row.update(json.load(f)['metrics'])
                            info.append(row)

    with open('probe_benchmark/scaling_experiment_data2.json', 'w') as f:
        json.dump(info, f)
model,image_size,image_width,text_width,embed_dim,gmacs,macts,mparams,image_gmacs,image_macts,image_mparams,text_gmacs,text_macts,text_mparams
ViT-B-32,224,768,512,512,7.4,10.31,151.28,4.41,5.01,87.85,2.98,5.3,63.43
ViT-B-32-plus-256,256,896,640,640,12.43,14.38,210.3,7.79,7.76,119.13,4.64,6.63,91.16
RN50,224,2048,512,1024,9.16,18.29,102.01,6.17,12.98,38.32,2.98,5.3,63.69
ViT-M-16,224,512,512,512,10.99,21.23,102.02,8.0,15.93,38.59,2.98,5.3,63.43
RN101,224,2048,512,512,12.84,23.38,119.69,9.86,18.08,56.26,2.98,5.3,63.43
ViT-M-16-256,256,512,512,512,13.62,27.56,102.05,10.63,22.26,38.62,2.98,5.3,63.43
ViT-B-16,224,768,512,512,20.57,29.2,149.62,17.58,23.9,86.19,2.98,5.3,63.43
ViT-B-16-plus,224,896,640,640,28.41,34.5,208.35,23.77,27.88,117.19,4.64,6.63,91.16
ViT-B-16-plus-240,240,896,640,640,32.05,39.71,208.38,27.41,33.08,117.21,4.64,6.63,91.16
RN50x4,288,2560,640,640,26.09,41.9,178.3,21.45,35.27,87.14,4.64,6.63,91.16
ViT-L-16,224,1024,768,768,68.26,71.47,427.74,61.6,63.52,304.09,6.66,7.95,123.65
ViT-L-14,224,1024,768,768,87.73,96.74,427.62,81.08,88.79,303.97,6.66,7.95,123.65
RN50x16,384,3072,768,768,81.86,111.49,290.98,75.2,103.54,167.33,6.66,7.95,123.65
ViT-H-16,224,1280,1024,1024,150.96,122.01,986.26,127.4,100.81,632.23,23.57,21.2,354.03
ViT-H-14,224,1280,1024,1024,190.97,160.61,986.11,167.4,139.41,632.08,23.57,21.2,354.03
ViT-L-14-280,280,1024,768,768,136.0,168.66,427.76,129.34,160.71,304.11,6.66,7.95,123.65
RN50x64,448,4096,1024,1024,193.4,199.15,500.28,181.61,188.55,297.4,11.78,10.6,202.88
ViT-g-14,224,1408,1024,1024,290.74,213.84,1366.68,267.18,192.64,1012.65,23.57,21.2,354.03
ViT-H-14-280,280,1280,1024,1024,289.49,268.29,986.29,265.93,247.09,632.26,23.57,21.2,354.03
ViT-L-14-336,336,1024,768,768,197.76,278.19,427.94,191.1,270.24,304.29,6.66,7.95,123.65
ViT-g-14-280,280,1408,1024,1024,446.95,358.73,1366.88,423.38,337.53,1012.85,23.57,21.2,354.03
ViT-H-14-336,336,1280,1024,1024,414.53,428.74,986.52,390.97,407.54,632.49,23.57,21.2,354.03
ViT-g-14-336,336,1408,1024,1024,644.21,571.87,1367.13,620.65,550.67,1013.1,23.57,21.2,354.03
import pandas as pd
# make a new version of vtab
if __name__ == '__main__':
    df_full = pd.read_json('probe_benchmark/scaling_experiment_data2.json')
    df = df_full[df_full.fewshot_k == -1]
    df25 = df_full[df_full.fewshot_k == 25]
    df10 = df_full[df_full.fewshot_k == 10]
    datasets = [
        'vtab/caltech101',
        'vtab/cifar10',
        'vtab/cifar100',
        'vtab/clevr_count_all',
        'vtab/clevr_closest_object_distance',
        'vtab/diabetic_retinopathy',
        'vtab/dmlab',
        'vtab/dsprites_label_orientation',
        'vtab/dsprites_label_x_position',
        'vtab/dtd',
        'vtab/eurosat',
        'vtab/kitti_closest_vehicle_distance',
        'vtab/flowers',
        'vtab/pets',
        'vtab/pcam',
        'vtab/resisc45',
        'vtab/smallnorb_label_azimuth',
        'vtab/smallnorb_label_elevation',
        'vtab/svhn',
    ]
    datasets2 = [
        'imagenet1k-unverified', 'cifar100'
    ]
    all_info = []
    cols = []
    first = True
    for n, g in df_full.groupby(['model', 'pretrained', 'samples_seen_pretty']):
        count = 0
        total = 0.
        for d in datasets:
            g_filter = g[(g.dataset == d) & (g.fewshot_k == -1)]
            count += 1
            total += g_filter.lp_acc1.max()
        avg = total / count
        info = {'VTAB acc': avg}
        if first:
            cols.append('VTAB acc')
        for d in datasets2:
            for k in [10, 25, -1]:
                g_filter = g[(g.dataset == d) & (g.fewshot_k == k)]
                info[f'{d}: {k} shot'] = g_filter.lp_acc1.max()
                if first:
                    cols.append(f'{d}: {k} shot')
        for k in ['model', 'pretrained', 'upstream_dataset', 'gmacs_total', 'samples_seen_pretty']:
            info[k] = g[k].values[0]
        all_info.append(info)
        first = False
    df = pd.DataFrame(all_info)
    formatters = {}
    print(df.keys())
    columns = ['model', 'samples_seen_pretty', 'upstream_dataset']
    df = df.sort_values(by=['model', 'samples_seen_pretty', 'upstream_dataset'])
    for ds in cols:
        columns.append(ds)
        formatters[ds] = lambda x: f'{100 * x:.2f}'
    latex = df.to_latex(columns=columns, formatters=formatters)
    print(latex)
    # with open('probe_benchmark/scaling_experiment_data_combined.json', 'w') as f:
    #     json.dump(all_info, f)
import os
from clip_benchmark.cli import get_parser_args, run
# /private/home/mitchellw/miniconda3/envs/cb/bin/python probe_benchmark/laion5b_fewshot_experiments.py
if __name__ == '__main__':
    models = [
        'ViT-B-32-quickgelu,laion400m_e32',
        'ViT-B-32,openai',
        'ViT-B-32,laion2b_s34b_b79k',
        'ViT-B-16,laion400m_e32',
        # 'ViT-B-16-plus-240,laion400m_e32',
        'ViT-B-16,openai',
        # 'ViT-L-14-336,openai',
        'ViT-L-14,openai',
        # 'ViT-B-32,laion2b_e16',
        'ViT-L-14,laion400m_e32',
        'ViT-L-14,laion2b_s32b_b82k',
        'ViT-H-14,laion2b_s32b_b79k',
    ]
    datasets = ['imagenet1k-unverified']
    ks = [1, 2, 4, 8, 16, 32, 64, 128]
    lrs = [0.1, 0.01, 0.001, 0.0001]
    epoch_vals = [10, 20, 40, 80]
    batch_sizes = [32 * 8]
    for epochs in epoch_vals:
        for dataset in datasets:
            dataset_root = '/datasets01/imagenet_full_size/061417' if dataset.startswith(
                'imagenet') else '/private/home/mitchellw/git/forks/CLIP_benchmark'
            for model_info in models:
                model_info_split = model_info.split(',')
                model, pretrained = model_info_split[0], model_info_split[1]
                for k in ks:
                    for lr in lrs:
                        for bs in batch_sizes:
                            args = get_parser_args()
                            args.dataset_root = dataset_root
                            args.dataset = dataset
                            args.task = 'linear_probe'
                            args.pretrained = pretrained
                            args.model = model
                            args.output = '/private/home/mitchellw/git/forks/CLIP_benchmark/probe_benchmark/data/' + f'{model}-{pretrained}-{dataset}-{epochs}-{k}-{lr}-{bs}.json'.replace('/', '_')
                            if os.path.exists(args.output):
                                print('skipping - exists.')
                                continue  # skip combinations that already have results
                            args.fewshot_k = k
                            args.fewshot_epochs = epochs
                            args.fewshot_lr = lr
                            args.batch_size = bs
                            args.skip_load = True  # NOTE
                            run(args)
import json
import pandas as pd
# make a new version of vtab
if __name__ == '__main__':
    df = pd.read_json('probe_benchmark/scaling_experiment_data2.json')
    df = df[df.fewshot_k == -1]
    datasets = [
        'vtab/caltech101',
        'vtab/cifar10',
        'vtab/cifar100',
        'vtab/clevr_count_all',
        'vtab/clevr_closest_object_distance',
        'vtab/diabetic_retinopathy',
        'vtab/dmlab',
        'vtab/dsprites_label_orientation',
        'vtab/dsprites_label_x_position',
        'vtab/dtd',
        'vtab/eurosat',
        'vtab/kitti_closest_vehicle_distance',
        'vtab/flowers',
        'vtab/pets',
        'vtab/pcam',
        'vtab/resisc45',
        'vtab/smallnorb_label_azimuth',
        'vtab/smallnorb_label_elevation',
        'vtab/svhn',
    ]
    all_info = []
    for n, g in df.groupby(['model', 'pretrained', 'samples_seen_pretty']):
        count = 0
        total = 0.
        for d in datasets:
            g_filter = g[g.dataset == d]
            count += 1
            total += g_filter.lp_acc1.max()
        avg = total / count
        info = {'dataset': 'vtab', 'lp_acc1': avg, 'fewshot_k': -1}
        for k in ['model', 'pretrained', 'upstream_dataset', 'gmacs_total', 'samples_seen_pretty']:
            info[k] = g[k].values[0]
        all_info.append(info)
    with open('probe_benchmark/scaling_experiment_data_vtab.json', 'w') as f:
        json.dump(all_info, f)
[
{
"dataset": "vtab",
"lp_acc1": 0.7272385796110142,
"fewshot_k": -1,
"model": "ViT-B-16",
"pretrained": "laion400m_e32",
"upstream_dataset": "LAION-400M",
"gmacs_total": 268122270972.16,
"samples_seen_pretty": "13B"
},
{
"dataset": "vtab",
"lp_acc1": 0.7125347395825511,
"fewshot_k": -1,
"model": "ViT-B-16",
"pretrained": "openai",
"upstream_dataset": "CLIP-WIT",
"gmacs_total": 263296000000.0,
"samples_seen_pretty": "13B"
},
{
"dataset": "vtab",
"lp_acc1": 0.7332202011443508,
"fewshot_k": -1,
"model": "ViT-B-16-plus-240",
"pretrained": "laion400m_e32",
"upstream_dataset": "LAION-400M",
"gmacs_total": 370313744206.08,
"samples_seen_pretty": "13B"
},
{
"dataset": "vtab",
"lp_acc1": 0.7143166719197058,
"fewshot_k": -1,
"model": "ViT-B-32",
"pretrained": "laion2b_e16",
"upstream_dataset": "LAION-2B",
"gmacs_total": 256967931347.2,
"samples_seen_pretty": "34B"
},
{
"dataset": "vtab",
"lp_acc1": 0.7152995214130362,
"fewshot_k": -1,
"model": "ViT-B-32",
"pretrained": "laion2b_s34b_b79k",
"upstream_dataset": "LAION-2B",
"gmacs_total": 291096483388.0,
"samples_seen_pretty": "34B"
},
{
"dataset": "vtab",
"lp_acc1": 0.7183753019516755,
"fewshot_k": -1,
"model": "ViT-B-32",
"pretrained": "laion400m_e32",
"upstream_dataset": "LAION-400M",
"gmacs_total": 96456237491.2,
"samples_seen_pretty": "13B"
},
{
"dataset": "vtab",
"lp_acc1": 0.6971394911855741,
"fewshot_k": -1,
"model": "ViT-B-32",
"pretrained": "openai",
"upstream_dataset": "CLIP-WIT",
"gmacs_total": 94720000000.0,
"samples_seen_pretty": "13B"
},
{
"dataset": "vtab",
"lp_acc1": 0.7596462313700938,
"fewshot_k": -1,
"model": "ViT-H-14",
"pretrained": "laion2b_s32b_b79k",
"upstream_dataset": "LAION-2B",
"gmacs_total": 6631508868008.96,
"samples_seen_pretty": "34B"
},
{
"dataset": "vtab",
"lp_acc1": 0.744758325311516,
"fewshot_k": -1,
"model": "ViT-L-14",
"pretrained": "laion2b_s32b_b82k",
"upstream_dataset": "LAION-2B",
"gmacs_total": 2807360000000.0,
"samples_seen_pretty": "34B"
},
{
"dataset": "vtab",
"lp_acc1": 0.7397637678783028,
"fewshot_k": -1,
"model": "ViT-L-14",
"pretrained": "laion400m_e32",
"upstream_dataset": "LAION-400M",
"gmacs_total": 1143527799338.24,
"samples_seen_pretty": "13B"
},
{
"dataset": "vtab",
"lp_acc1": 0.7376775015037333,
"fewshot_k": -1,
"model": "ViT-L-14",
"pretrained": "openai",
"upstream_dataset": "CLIP-WIT",
"gmacs_total": 1122944000000.0,
"samples_seen_pretty": "13B"
},
{
"dataset": "vtab",
"lp_acc1": 0.7517780869059744,
"fewshot_k": -1,
"model": "ViT-g-14",
"pretrained": "laion2b_s12b_b42k",
"upstream_dataset": "LAION-2B",
"gmacs_total": 3549396664594.8,
"samples_seen_pretty": "13B"
}
]
import os
from clip_benchmark.cli import get_parser_args, run
if __name__ == '__main__':
    models = [
        'ViT-B-32-quickgelu,laion400m_e32',
        'ViT-B-32,openai',
        'ViT-B-32,laion2b_s34b_b79k',
        'ViT-B-16,laion400m_e32',
        'ViT-B-16-plus-240,laion400m_e32',
        'ViT-B-16,openai',
        'ViT-L-14-336,openai',
        'ViT-L-14,openai',
        'ViT-B-32,laion2b_e16',
        'ViT-L-14,laion400m_e32',
        'ViT-L-14,laion2b_s32b_b82k',
        'ViT-H-14,laion2b_s32b_b79k',
        'ViT-g-14,laion2b_s12b_b42k',
    ]
    datasets = ['imagenet1k-unverified', 'cifar100']
    datasets = datasets + [
        'vtab/caltech101',
        'vtab/cifar10',
        'vtab/cifar100',
        'vtab/clevr_count_all',
        'vtab/clevr_closest_object_distance',
        'vtab/diabetic_retinopathy',
        'vtab/dmlab',
        'vtab/dsprites_label_orientation',
        'vtab/dsprites_label_x_position',
        'vtab/dtd',
        'vtab/eurosat',
        'vtab/kitti_closest_vehicle_distance',
        'vtab/flowers',
        'vtab/pets',
        'vtab/pcam',
        'vtab/resisc45',
        'vtab/smallnorb_label_azimuth',
        'vtab/smallnorb_label_elevation',
        'vtab/svhn',
    ]
    ks = [10, 25, -1]
    lrs = [0.1, 0.01, 0.001]
    epoch_vals = [10, 20, 40]
    batch_sizes = [32 * 8]
    if not os.path.exists('probe_benchmark/data'):
        os.mkdir('probe_benchmark/data')
    for dataset in datasets:
        dataset_root = 'datasets/' + dataset.split('/')[-1]  # TODO: change!
        print(dataset_root)
        for model_info in models:
            model_info_split = model_info.split(',')
            model, pretrained = model_info_split[0], model_info_split[1]
            for epochs in epoch_vals:
                # For VTAB, do not run >= 25 shot.
                for k in ks:
                    if k >= 25 and dataset.startswith('vtab'):
                        continue
                    for lr in lrs:
                        for bs in batch_sizes:
                            args = get_parser_args()
                            args.dataset_root = dataset_root
                            args.dataset = dataset
                            args.task = 'linear_probe'
                            args.pretrained = pretrained
                            args.model = model
                            args.output = 'probe_benchmark/data/' + f'{model}-{pretrained}-{dataset}-{epochs}-{k}-{lr}-{bs}.json'.replace('/', '_')
                            if os.path.exists(args.output):
                                print('skipping - exists.')
                                continue
                            args.fewshot_k = k
                            args.fewshot_epochs = epochs
                            args.fewshot_lr = lr
                            args.batch_size = bs
                            run(args)
                            print(dataset, model, pretrained, epochs, k, lr, bs)
open_clip_torch>=0.2.1
opencv-python
peft>=0.6.2
protobuf==3.20.3
pycocoevalcap
pyyaml
scikit-learn>=1.0,<2
scipy
task_adaptation
tensorflow==2.11.0
termcolor
tqdm>=2
transformers>=4.32.0
webdataset>=0.2.31
yacs
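
The list above is the pinned dependency set for the benchmark. A minimal install sketch (an assumption, not part of this commit):

```bash
pip install -r requirements.txt
```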
[bumpversion]
current_version = 0.1.0
commit = True
tag = True
[bumpversion:file:setup.py]
search = version='{current_version}'
replace = version='{new_version}'
[bumpversion:file:clip_benchmark/__init__.py]
search = __version__ = '{current_version}'
replace = __version__ = '{new_version}'
[bdist_wheel]
universal = 1
[flake8]
exclude = docs
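
The `[bumpversion]` section above configures release bumps: it rewrites the version string in `setup.py` and `clip_benchmark/__init__.py`, then commits and tags. A hedged usage sketch, assuming the `bumpversion` (bump2version) CLI is installed:

```bash
bumpversion patch   # e.g. 0.1.0 -> 0.1.1; edits both configured files, commits, and tags
```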
#!/usr/bin/env python
"""The setup script."""
from setuptools import find_packages, setup
with open('README.md') as readme_file:
    readme = readme_file.read()

with open('HISTORY.rst') as history_file:
    history = history_file.read()


def load_requirements(f):
    return [l.strip() for l in open(f).readlines()]


requirements = load_requirements('requirements.txt')
test_requirements = requirements + ['pytest', 'pytest-runner']

setup(
    author='Mehdi Cherti',
    author_email='mehdicherti@gmail.com',
    python_requires='>=3.6',
    classifiers=[
        'Development Status :: 2 - Pre-Alpha',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Natural Language :: English',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
    ],
    description='CLIP-like models benchmarks on various datasets',
    entry_points={
        'console_scripts': [
            'clip_benchmark=clip_benchmark.cli:main',
            'clip_benchmark_export_wds=clip_benchmark.webdataset_builder:main',
        ],
    },
    install_requires=requirements,
    license='MIT license',
    long_description=readme + '\n\n' + history,
    long_description_content_type='text/markdown',
    include_package_data=True,
    keywords='clip_benchmark',
    name='clip_benchmark',
    packages=find_packages(include=['clip_benchmark', 'clip_benchmark.*']),
    test_suite='tests',
    tests_require=test_requirements,
    url='https://github.com/mehdidc/clip_benchmark',
    version='1.4.0',
    zip_safe=False,
    extras_require={  # note: setuptools expects 'extras_require' (was 'extra_require')
        'vtab': ['task_adaptation==0.1', 'timm>=0.5.4'],
        'tfds': ['tfds-nightly', 'timm>=0.5.4'],
        'coco': ['pycocotools>=2.0.4'],
    },
)
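
A hedged usage sketch for the packaging above: an editable install exposes the `clip_benchmark` and `clip_benchmark_export_wds` console scripts, and the optional extras pull in the VTAB/TFDS/COCO dependencies:

```bash
pip install -e .           # core package + console scripts
pip install -e ".[vtab]"   # optionally add task_adaptation and timm for VTAB
```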
set -x
PARTITION=${PARTITION:-'INTERN4'}
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "birdsnap" --dataset_root ./data/birdsnap/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "cifar10" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "cifar100" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "food101" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "sun397" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "cars" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "fgvc_aircraft" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "dtd" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "pets" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "caltech101" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "mnist" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "stl10" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "eurosat" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "gtsrb" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "country211" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "pcam" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "renderedsst2" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "fer2013" --dataset_root ./data/fer2013 --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "voc2007" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "vtab/flowers" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "vtab/resisc45" --dataset_root ./data/ --model internvl_c_classification \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
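
The `s1a` alias above is just a Slurm `srun` wrapper; outside the cluster the same CLI can be invoked directly. A hedged local-run sketch for one of the datasets above:

```bash
python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
    --dataset "pets" --dataset_root ./data/ --model internvl_c_classification \
    --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
```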