#!/bin/bash
set -x

PARTITION=${PARTITION:-'INTERN4'}
# Enable alias expansion (off by default in non-interactive shells) so the `s1a` shortcut below works.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "it" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "jp" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "ar" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenetv2" --dataset_root ./data/imagenetv2/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet_sketch" --dataset_root ./data/imagenet-sketch/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet-a" --dataset_root ./data/imagenet-a/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet-r" --dataset_root ./data/imagenet-r/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "objectnet" --dataset_root ./data/objectnet-1.0/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
#!/bin/bash
set -x

PARTITION=${PARTITION:-'INTERN4'}
# Enable alias expansion (off by default in non-interactive shells) so the `s1a` shortcut below works.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
#!/bin/bash
set -x

PARTITION=${PARTITION:-'INTERN4'}
# Enable alias expansion (off by default in non-interactive shells) so the `s1a` shortcut below works.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=en
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=es
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=fr
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=zh
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=it
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=ko
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=ru
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=jp
#!/bin/bash
set -x

PARTITION=${PARTITION:-'INTERN4'}
# Enable alias expansion (off by default in non-interactive shells) so the `s1a` shortcut below works.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "birdsnap" --dataset_root ./data/birdsnap/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "cifar10" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "cifar100" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "food101" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "sun397" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "cars" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "fgvc_aircraft" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "dtd" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "pets" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "caltech101" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "mnist" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "stl10" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "eurosat" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "gtsrb" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "country211" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "pcam" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "renderedsst2" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "fer2013" --dataset_root ./data/fer2013 --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "voc2007" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "vtab/flowers" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "vtab/resisc45" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
#!/bin/bash
set -x

PARTITION=${PARTITION:-'INTERN4'}
# Enable alias expansion (off by default in non-interactive shells) so the `s1a` shortcut below works.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "it" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "jp" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "ar" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenetv2" --dataset_root ./data/imagenetv2/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet_sketch" --dataset_root ./data/imagenet-sketch/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet-a" --dataset_root ./data/imagenet-a/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet-r" --dataset_root ./data/imagenet-r/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "objectnet" --dataset_root ./data/objectnet-1.0/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
#!/bin/bash
set -x

PARTITION=${PARTITION:-'INTERN4'}
# Enable alias expansion (off by default in non-interactive shells) so the `s1a` shortcut below works.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
#!/bin/bash
set -x

PARTITION=${PARTITION:-'INTERN4'}
# Enable alias expansion (off by default in non-interactive shells) so the `s1a` shortcut below works.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval_hf \
--pretrained ./work_dirs/internvl_stage2_finetune_flickrcn_364_bs1024_ep10 \
--output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_g_retrieval_hf \
--pretrained ./work_dirs/internvl_stage2_finetune_flickrcn_364_bs1024_ep10 \
--output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval_hf \
--pretrained ./work_dirs/internvl_stage2_finetune_flickr_364_bs1024_ep10 \
--output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_g_retrieval_hf \
--pretrained ./work_dirs/internvl_stage2_finetune_flickr_364_bs1024_ep10 \
--output result.json
#!/bin/bash
set -x

PARTITION=${PARTITION:-'INTERN4'}
# Enable alias expansion (off by default in non-interactive shells) so the `s1a` shortcut below works.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=en
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=es
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=fr
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=zh
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=it
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=ko
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=ru
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=jp
#!/usr/bin/env python
"""Tests for `clip_benchmark` package."""
import os

# Hide all GPUs so the test runs on CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

from clip_benchmark.cli import run


class base_args:
    # Namespace-like container that mirrors the arguments of the `eval` CLI command.
    dataset = 'dummy'
    split = 'test'
    model = 'ViT-B-32-quickgelu'
    pretrained = 'laion400m_e32'
    task = 'zeroshot_classification'
    amp = False
    num_workers = 4
    batch_size = 64
    dataset_root = 'root'
    output = 'result.json'
    verbose = True
    root = 'root'
    annotation_file = ''
    seed = 0
    skip_load = False
    language = 'en'
    model_cache_dir = None
    cupl = False
    save_clf = None
    load_clfs = []
    model_type = 'open_clip'
    wds_cache_dir = None
    which = 'eval'
    skip_existing = False


def test_base():
    run(base_args)
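# The test can be run with pytest, e.g. `pytest -k test_base` (CPU-only, since
# CUDA_VISIBLE_DEVICES is cleared above); the exact invocation depends on where
# this file lives in your checkout.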
[tox]
envlist = py36, py37, py38, flake8

[travis]
python =
    3.8: py38
    3.7: py37
    3.6: py36

[testenv:flake8]
basepython = python
deps = flake8
commands = flake8 clip_benchmark tests

[testenv]
setenv =
    PYTHONPATH = {toxinidir}
commands = python setup.py test
# How to Deploy a Local Demo?
## Launch a Controller
```shell
# run the command in the `internvl_chat_llava` folder
python -m llava.serve.controller --host 0.0.0.0 --port 10000
```
## Launch a Gradio Web Server
```shell
# run the command in the `internvl_chat_llava` folder
python -m llava.serve.gradio_web_server --controller http://localhost:10000 --model-list-mode reload
```
## Launch a Model Worker
### Options
- `--host <host_address>`: Specifies the host address on which the model worker will run. Use "0.0.0.0" to allow connections from any IP address.
- `--controller <controller_address>`: Specifies the address of the controller node responsible for managing model deployment and execution.
- `--port <port_number>`: Specifies the port number on which the model worker will listen for incoming requests.
- `--worker <worker_address>`: Specifies the address that this model worker registers with the controller, so the controller can route requests to it.
- `--model-path <model_file_path>`: Specifies the file path to the machine learning model to be deployed and executed.
### Additional Options
#### Multi-GPU Deployment
To enable deployment on multiple GPUs, use the `--device auto` option. This allows the script to utilize all available GPU devices for model execution automatically.
#### Quantization Deployment
To enable quantized deployment, use the `--load-8bit` option. This loads the model with 8-bit quantization, reducing GPU memory usage.
__Note: The `--device auto` and `--load-8bit` options cannot be used simultaneously.__
```shell
# OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-7B
# run the command in the `internvl_chat_llava` folder
python -m llava.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-7B
# OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-13B
# run the command in the `internvl_chat_llava` folder
python -m llava.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40001 --worker http://localhost:40001 --model-path OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-13B
# OpenGVLab/InternVL-Chat-V1-1
# run the command in the `internvl_chat` folder
python -m internvl.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40002 --worker http://localhost:40002 --model-path OpenGVLab/InternVL-Chat-V1-1
# OpenGVLab/InternVL-Chat-V1-2
# run the command in the `internvl_chat` folder
python -m internvl.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40003 --worker http://localhost:40003 --model-path OpenGVLab/InternVL-Chat-V1-2
# OpenGVLab/InternVL-Chat-V1-2-Plus
# run the command in the `internvl_chat` folder
python -m internvl.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40004 --worker http://localhost:40004 --model-path OpenGVLab/InternVL-Chat-V1-2-Plus
# OpenGVLab/InternVL-Chat-V1-5
# run the command in the `internvl_chat` folder
python -m internvl.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40005 --worker http://localhost:40005 --model-path OpenGVLab/InternVL-Chat-V1-5
```
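The `--device auto` and `--load-8bit` options described above can be appended to any of the worker commands (but not combined with each other). As a rough sketch reusing the ViT-6B-Vicuna-7B example from above (adjust the model path and ports to your own deployment):
```shell
# multi-GPU deployment: let the worker use all available GPUs
# run the command in the `internvl_chat_llava` folder
python -m llava.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-7B --device auto
# 8-bit quantized deployment: lower GPU memory usage (do not combine with --device auto)
# run the command in the `internvl_chat_llava` folder
python -m llava.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-7B --load-8bit
```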
# How to Evaluate InternVL-Chat-V1-5?
In this tutorial, we will provide a detailed guide on how to replicate the results presented in the InternVL 1.5 technical report.
The results are shown in the table below.
_If you encounter any difficulties while testing with this guide, please let us know. Thank you._
> Note that if you are aiming for an exact replication, please use this code repository and follow the testing methods outlined below; otherwise, using the [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) would be more convenient.
![image](https://github.com/OpenGVLab/InternVL/assets/23737120/8b62d429-c689-426a-9267-2727b6430b6e)
## Model Preparation
| model name | type | download | #param |
| ------------------ | ---- | ----------------------------------------------------------------- | :----: |
| InternVL-Chat-V1-5 | MLLM | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5) | 25.5B |
Please download the above model weights and place them in the `pretrained/` folder.
```sh
cd pretrained/
# pip install -U huggingface_hub
huggingface-cli download --resume-download --local-dir-use-symlinks False OpenGVLab/InternVL-Chat-V1-5 --local-dir InternVL-Chat-V1-5
```
The directory structure is:
```
pretrained
└── InternVL-Chat-V1-5
```
## OCR-related Benchmarks
Our evaluation is divided into three parts. First, we focus on the OCR-related datasets: DocVQA, ChartQA, InfoVQA, TextVQA, and OCRBench. Let's test each of these datasets in turn.
### DocVQA val & test
<details>
<summary>click to expand</summary>
1. Download the DocVQA dataset using the following instructions:
```shell
mkdir -p data/docvqa && cd data/docvqa
# download images and annotations
wget https://datasets.cvc.uab.es/rrc/DocVQA/train.tar.gz --no-check-certificate # (optional)
wget https://datasets.cvc.uab.es/rrc/DocVQA/val.tar.gz --no-check-certificate
wget https://datasets.cvc.uab.es/rrc/DocVQA/test.tar.gz --no-check-certificate
# unzip files
tar -zxvf train.tar.gz
tar -zxvf val.tar.gz
tar -zxvf test.tar.gz
# download converted jsonl files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/test.jsonl
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── docvqa
│ ├── test
│ ├── test.jsonl
│ ├── train
│ ├── train.jsonl
│ ├── val
│ └── val.jsonl
```
3. Test the model with the following commands:
We use a maximum of `18 tiles` to test the DocVQA dataset.
```shell
# evaluation on the val set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 vqa-docvqa-val --dynamic --max-num 18
# evaluation on the test set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 vqa-docvqa-test --dynamic --max-num 18
```
The result of the validation set is:
```
Overall ANLS: 0.9049
```
For the test set, the test results need to be submitted to the [testing server](https://rrc.cvc.uab.es/?ch=17&com=tasks).
</details>
### ChartQA test
<details>
<summary>click to expand</summary>
1. Download the ChartQA dataset using the following instructions:
```shell
mkdir -p data/chartqa && cd data/chartqa
# download images from https://drive.google.com/file/d/1Lm_w6zeET1Hyl_9ks6w5nEsgpoyPHalV/view
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/train_human.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/train_augmented.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/test_human.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/test_augmented.jsonl
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── chartqa
│ ├── ChartQA Dataset
│ │ ├── test
│ │ ├── train
│ │ └── val
│ ├── test_augmented.jsonl
│ ├── test_human.jsonl
│ ├── train_augmented.jsonl
│ └── train_human.jsonl
```
3. Test the model with the following commands:
We use a maximum of `12 tiles` to test the ChartQA dataset.
```shell
# evaluation on the test set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 vqa-chartqa-test --dynamic --max-num 12
```
The result of the test set is:
```
['chartqa_test_human', {'relaxed_accuracy': 0.736}]
['chartqa_test_augmented', {'relaxed_accuracy': 0.9408}]
# the average score = (73.6 + 94.08) / 2 = 83.8
```
</details>
### InfoVQA val & test
<details>
<summary>click to expand</summary>
1. Download the InfoVQA dataset using the following instructions:
```shell
mkdir -p data/infographicsvqa && cd data/infographicsvqa
# download images and annotations from https://rrc.cvc.uab.es/?ch=17&com=downloads
# infographicsVQA_test_v1.0.json, infographicsVQA_val_v1.0_withQT.json, infographicVQA_train_v1.0.json
# download converted files
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/infographicsvqa_val.jsonl -O val.jsonl
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/infographicsvqa_test.jsonl -O test.jsonl
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── infographicsvqa
│ ├── infographicsvqa_images
│ ├── infographicsVQA_test_v1.0.json
│ ├── infographicsVQA_val_v1.0_withQT.json
│ ├── infographicVQA_train_v1.0.json
│ ├── test.jsonl
│ └── val.jsonl
```
3. Test the model with the following commands:
We use a maximum of `24 tiles` to test the InfoVQA dataset.
```shell
# evaluation on the val set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 vqa-infovqa-val --dynamic --max-num 24
# evaluation on the test set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 vqa-infovqa-test --dynamic --max-num 24
```
The result of the val set is:
```
Overall ANLS: 0.7235
```
For the test set, the test results need to be submitted to the [testing server](https://rrc.cvc.uab.es/?ch=17&com=tasks).
</details>
### TextVQA val
<details>
<summary>click to expand</summary>
1. Download the TextVQA dataset using the following instructions:
```shell
mkdir -p data/textvqa && cd data/textvqa
# download images
wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip && unzip train_val_images.zip
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train_questions.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val_questions.json
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/textvqa_val.jsonl
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/textvqa_val_llava.jsonl
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── textvqa
│ ├── textvqa_train_annotations.json
│ ├── textvqa_train.jsonl
│ ├── textvqa_train_questions.json
│ ├── textvqa_val_annotations.json
│ ├── textvqa_val.jsonl
│ ├── textvqa_val_llava.jsonl
│ ├── textvqa_val_questions.json
│ └── train_images
```
3. Test the model with the following commands:
We use a maximum of `24 tiles` to test the TextVQA dataset.
```shell
# evaluation on the val set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 vqa-textvqa-val --dynamic --max-num 24
```
The result of the val set is:
```
['pretrained/InternVL-Chat-V1-5', 'textvqa_val', 0.8061000000000043]
```
</details>
### OCRBench
<details>
<summary>click to expand</summary>
Please use [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) to evaluate OCRBench.
The command to test InternVL-Chat-V1-5 on OCRBench using VLMEvalKit is:
```
torchrun --nproc-per-node=8 run.py --data OCRBench --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 00:28:29,681 - Evaluation - INFO - Score:
2024-04-29 00:28:29,681 - Evaluation - INFO - Text Recognition:238
2024-04-29 00:28:29,681 - Evaluation - INFO - Scene Text-centric VQA:178
2024-04-29 00:28:29,681 - Evaluation - INFO - Doc-oriented VQA:151
2024-04-29 00:28:29,681 - Evaluation - INFO - Key Information Extraction:153
2024-04-29 00:28:29,681 - Evaluation - INFO - Handwritten Mathematical Expression Recognition:4
2024-04-29 00:28:29,681 - Evaluation - INFO - Final Score:724
2024-04-29 00:28:29,681 - Evaluation - INFO - Final Score Norm:72.4
```
</details>
## General Multimodal Benchmarks
Next, we will test InternVL-Chat-V1-5 using 10 general multimodal benchmarks, which include MME, RealWorldQA, AI2D, MMMU, MMBench-EN, MMBench-CN, CCBench, MMVet, SEED, and HallusionBench.
### MME
<details>
<summary>click to expand</summary>
1. Download the MME dataset using the following instructions:
```shell
mkdir -p data/mme && cd data/mme
# 1. Download the data following the official instructions [here](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation).
# 2. Put the downloaded images under `MME_Benchmark_release_version`.
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── mme
│ └── MME_Benchmark_release_version
```
3. Run single-GPU inference and evaluation:
We use a maximum of `12 tiles` to test the MME dataset.
```shell
# evaluation on MME
GPUS=1 sh evaluate.sh pretrained/InternVL-Chat-V1-5 mme --dynamic --max-num 12
```
The result of MME is:
```
total score: 1658.3683473389356
existence score: 190.0
count score: 175.0
position score: 171.66666666666669
color score: 178.33333333333331
posters score: 173.8095238095238
celebrity score: 142.05882352941177
scene score: 156.5
landmark score: 179.5
artwork score: 144.0
OCR score: 147.5
=========== Cognition ===========
total score: 533.5714285714286
commonsense_reasoning score: 133.57142857142858
numerical_calculation score: 117.5
text_translation score: 185.0
code_reasoning score: 97.5
# 1658.3683473389356 + 533.5714285714286 = 2191.939775910364
```
</details>
### RealWorldQA
<details>
<summary>click to expand</summary>
Please use [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) to evaluate RealWorldQA.
The command to test InternVL-Chat-V1-5 on RealWorldQA using VLMEvalKit is:
```
torchrun --nproc-per-node=8 run.py --data RealWorldQA --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 00:35:13,282 - Evaluation - INFO - Score:
2024-04-29 00:35:13,282 - Evaluation - INFO - split Overall
0 none 0.660131
```
</details>
### AI2D test
<details>
<summary>click to expand</summary>
1. Download the AI2D dataset using the following instructions:
```shell
mkdir -p data/ai2diagram && cd data/ai2diagram
# download converted files
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/ai2d_test_vlmevalkit.jsonl -O test_vlmevalkit.jsonl
wget https://huggingface.co/OpenGVLab/InternVL/resolve/main/AI2D_TEST.zip && unzip AI2D_TEST.zip
# download images from Google drive (optional, provided by InternLM-XComposer)
# https://drive.google.com/file/d/1dqqa3MnrxMXaU_K9JA6C83je32ibwdOY/view?usp=sharing
# images should be placed in `data/ai2diagram/ai2d/abc_images` and `data/ai2diagram/ai2d/images`
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── ai2diagram
│ ├── test_vlmevalkit.jsonl
│ ├── ai2d # (optional)
│ │ ├── abc_images
│ │ └── images
│ └── AI2D_TEST
```
3. Test the model with the following commands:
We use a maximum of `6 tiles` to test the AI2D dataset.
```shell
# evaluation on the test set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 vqa-ai2d-test --dynamic
```
The result of AI2D is:
```
ai2diagram_test {'accuracy': 0.8073186528497409}
```
</details>
### MMMU val
<details>
<summary>click to expand</summary>
1. The evaluation code will automatically download the dataset from HuggingFace.
2. Test the model with the following commands:
```
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 mmmu-val --dynamic
```
The result of MMMU val is:
```
{'Overall-Art and Design': {'num': 120, 'acc': 0.608}, 'Art': {'num': 30, 'acc': 0.7}, 'Art_Theory': {'num': 30, 'acc': 0.8}, 'Design': {'num': 30, 'acc': 0.767}, 'Music': {'num': 30, 'acc': 0.167}, 'Overall-Business': {'num': 150, 'acc': 0.413}, 'Accounting': {'num': 30, 'acc': 0.467}, 'Economics': {'num': 30, 'acc': 0.4}, 'Finance': {'num': 30, 'acc': 0.4}, 'Manage': {'num': 30, 'acc': 0.4}, 'Marketing': {'num': 30, 'acc': 0.4}, 'Overall-Science': {'num': 150, 'acc': 0.38}, 'Biology': {'num': 30, 'acc': 0.6}, 'Chemistry': {'num': 30, 'acc': 0.233}, 'Geography': {'num': 30, 'acc': 0.4}, 'Math': {'num': 30, 'acc': 0.333}, 'Physics': {'num': 30, 'acc': 0.333}, 'Overall-Health and Medicine': {'num': 150, 'acc': 0.433}, 'Basic_Medical_Science': {'num': 30, 'acc': 0.5}, 'Clinical_Medicine': {'num': 30, 'acc': 0.5}, 'Diagnostics_and_Laboratory_Medicine': {'num': 30, 'acc': 0.333}, 'Pharmacy': {'num': 30, 'acc': 0.367}, 'Public_Health': {'num': 30, 'acc': 0.467}, 'Overall-Humanities and Social Science': {'num': 120, 'acc': 0.617}, 'History': {'num': 30, 'acc': 0.633}, 'Literature': {'num': 30, 'acc': 0.8}, 'Sociology': {'num': 30, 'acc': 0.567}, 'Psychology': {'num': 30, 'acc': 0.467}, 'Overall-Tech and Engineering': {'num': 210, 'acc': 0.362}, 'Agriculture': {'num': 30, 'acc': 0.567}, 'Architecture_and_Engineering': {'num': 30, 'acc': 0.267}, 'Computer_Science': {'num': 30, 'acc': 0.367}, 'Electronics': {'num': 30, 'acc': 0.3}, 'Energy_and_Power': {'num': 30, 'acc': 0.333}, 'Materials': {'num': 30, 'acc': 0.467}, 'Mechanical_Engineering': {'num': 30, 'acc': 0.233}, 'Overall': {'num': 900, 'acc': 0.452}}
```
</details>
### MMBench-EN & CN test
<details>
<summary>click to expand</summary>
1. Download the MMBench dataset using the following instructions:
```
mkdir -p data/mmbench && cd data/mmbench
# download csv files of mmbench
wget http://opencompass.openxlab.space/utils/MMBench/CCBench_legacy.tsv
wget https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_20230712.tsv
wget https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_cn_20231003.tsv
wget https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_en_20231003.tsv
wget https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_test_cn_20231003.tsv
wget https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_test_en_20231003.tsv
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── mmbench
│ ├── CCBench_legacy.tsv
│ ├── mmbench_dev_20230712.tsv
│ ├── mmbench_dev_cn_20231003.tsv
│ ├── mmbench_dev_en_20231003.tsv
│ ├── mmbench_test_cn_20231003.tsv
│ └── mmbench_test_en_20231003.tsv
```
3. Test the model with the following commands:
We use a maximum of `6 tiles` to test the MMBench dataset.
```shell
# evaluation on the test-en set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 mmbench-test-en --dynamic
# evaluation on the test-cn set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 mmbench-test-cn --dynamic
```
Submit the result to the [test server](https://mmbench.opencompass.org.cn). The result of MMBench is:
```
# result of the test-en set
A_Overall (test) 0.8217488789237668
# result of the test-cn set
A_Overall (test) 0.8195067264573991
```
</details>
### CCBench dev
<details>
<summary>click to expand</summary>
1. See the `MMBench-EN & CN test` section above to prepare the CCBench data.
2. Test the model with the following commands:
We use a maximum of `6 tiles` to test the CCBench dataset.
```shell
# evaluation on the dev set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 ccbench-dev --dynamic
```
Submit the result to the [test server](https://mmbench.opencompass.org.cn). The result of CCBench is:
```
A_Overall (dev) 0.7
```
</details>
### MMVet
<details>
<summary>click to expand</summary>
1. Download the MMVet dataset using the following instructions:
```
mkdir -p data/mm-vet && cd data/mm-vet
wget https://github.com/yuweihao/MM-Vet/releases/download/v1/mm-vet.zip
unzip mm-vet.zip
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/llava-mm-vet.jsonl
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── mm-vet
│ ├── images
│ └── llava-mm-vet.jsonl
```
3. Test the model with the following commands:
We use a maximum of `6 tiles` to test the MMVet dataset.
```shell
# evaluation on MMVet
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 mmvet --dynamic
```
Submit the result to the [test server](https://huggingface.co/spaces/whyu/MM-Vet_Evaluator). The result of MMVet is:
```
total
62.7
```
</details>
### SEED Image
<details>
<summary>click to expand</summary>
1. Download the SEED dataset using the following instructions:
```
mkdir -p data/SEED && cd data/SEED
# 1. Follow the official instructions [Data Preparation for SEED-Bench-1](https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md#data-preparation-for-seed-bench-1)
# to download the images and the videos. Put images under `./data/SEED/SEED-Bench-image`.
# 2. Extract the middle frame from each downloaded video and put the frames under `./data/SEED/SEED-Bench-image`.
# LLaVA provided the script [`extract_video_frames.py`](../internvl_chat/tools/extract_video_frames.py), modified from the official one.
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/seed.jsonl
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── SEED
│ ├── SEED-Bench-image
│ └── seed.jsonl
```
3. Test the model with the following commands:
```shell
sh evaluate.sh pretrained/InternVL-Chat-V1-5 seed --dynamic
```
The result is:
```
Acc@1: 0.6999444135630906
length: 17990
Accuracy for each data type:
Data type Scene Understanding: 80.37%
Data type Instance Identity: 80.45%
Data type Instance Location: 78.03%
Data type Instance Attributes: 72.39%
Data type Instances Counting: 69.19%
Data type Spatial Relation: 59.82%
Data type Instance Interaction: 77.32%
Data type Visual Reasoning: 78.85%
Data type Text Understanding: 55.81%
Data type Action Recognition: 54.08%
Data type Action Prediction: 44.82%
Data type Procedure Understanding: 40.18%
Total accuracy: 69.99%
Image accuracy: 75.99%
Video accuracy: 47.27%
```
</details>
### HallusionBench
<details>
<summary>click to expand</summary>
Please use [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) to evaluate HallusionBench.
The command to test InternVL-Chat-V1-5 on HallusionBench using VLMEvalKit is:
```
torchrun --nproc-per-node=8 run.py --data HallusionBench --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 00:46:23,688 - Evaluation - INFO - Score:
2024-04-29 00:46:23,688 - Evaluation - INFO - split aAcc fAcc qAcc
0 Overall 66.771819 40.173410 40.879121
1 VD 63.620981 40.000000 34.296029
2 VS 71.944444 40.517241 51.123596
3 VD_figure 77.500000 65.853659 53.846154
4 VS_map 56.250000 18.181818 18.750000
5 VD_illusion 66.666667 41.935484 34.722222
6 VS_table 75.892857 46.428571 55.813953
7 VD_ocr 78.651685 58.139535 58.139535
8 VS_ocr 59.259259 38.461538 22.222222
9 VS_chart 81.538462 50.000000 72.368421
10 VD_video 51.176471 10.416667 13.043478
11 VD_math 56.481481 25.000000 27.777778
```
The final score reported in our technical report is the average: (66.771819 + 40.173410 + 40.879121) / 3 = 49.3
</details>
## Math Benchmark
Finally, we use a representative math dataset, MathVista, to test InternVL-Chat-V1-5.
### MathVista testmini
<details>
<summary>click to expand</summary>
1. Download the MathVista dataset using the following instructions:
```bash
mkdir -p data/MathVista && cd data/MathVista
wget https://huggingface.co/datasets/AI4Math/MathVista/raw/main/annot_testmini.json
cd ../..
```
2. Test the model with the following commands:
```shell
export OPENAI_API_KEY='your-openai-key'
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 mathvista-testmini --dynamic
```
The result is:
```
Correct: 535, Total: 1000, Accuracy: 53.5%
1000
Number of test problems: 1000
Type: [question_type]
[free_form]: 47.17% (217/460)
[multi_choice]: 58.89% (318/540)
Type: [answer_type]
[float]: 0.00% (0/40)
[integer]: 51.67% (216/418)
[text]: 58.89% (318/540)
[list]: 50.00% (1/2)
Type: [language]
[english]: 53.31% (499/936)
[chinese]: 56.45% (35/62)
[persian]: 50.00% (1/2)
```
</details>
# How to Evaluate InternVL-Chat-V1-5 using VLMEvalKit?
In this tutorial, we will provide a detailed guide on how to evaluate InternVL-Chat-V1-5 using VLMEvalKit.
First of all, please follow this [guide](https://github.com/open-compass/VLMEvalKit/blob/main/docs/en/Quickstart.md) to install VLMEvalKit.
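For reference, a minimal install sketch (assuming the standard editable pip install; the linked Quickstart is authoritative and may list additional dependencies):
```
git clone https://github.com/open-compass/VLMEvalKit.git
cd VLMEvalKit
pip install -e .
```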
## MMBench_DEV_EN
```
torchrun --nproc-per-node=8 run.py --data MMBench_DEV_EN --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:24:58,395 - Evaluation - INFO - split Overall ... spatial_relationship structuralized_imagetext_understanding
0 dev 0.808419 ... 0.422222 0.628205
```
## MMBench_DEV_CN
```
torchrun --nproc-per-node=8 run.py --data MMBench_DEV_CN --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:26:05,209 - Evaluation - INFO - split Overall ... spatial_relationship structuralized_imagetext_understanding
0 dev 0.803265 ... 0.377778 0.615385
```
## MMStar
```
torchrun --nproc-per-node=8 run.py --data MMStar --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:21:56,491 - Evaluation - INFO - split Overall ... math science & technology
0 none 0.572667 ... 0.564 0.408
```
## MME
```
torchrun --nproc-per-node=8 run.py --data MME --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:42:43,864 - Evaluation - INFO - Score:
2024-04-29 18:42:43,864 - Evaluation - INFO - perception reasoning OCR ... posters scene text_translation
0 1641.915766 519.642857 147.5 ... 171.768707 156.5 185.0
```
## SEEDBench_IMG
```
torchrun --nproc-per-node=8 run.py --data SEEDBench_IMG --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:58:54,973 - Evaluation - INFO - Score:
2024-04-29 18:58:54,973 - Evaluation - INFO - split Overall ... Text Understanding Visual Reasoning
0 none 0.757167 ... 0.440476 0.806647
```
## MMVet
```
torchrun --nproc-per-node=8 run.py --data MMVet --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:32:38,748 - Evaluation - INFO - Score:
2024-04-29 18:32:38,748 - Evaluation - INFO - Category tot acc
0 rec 187 61.818182
1 ocr 108 68.981481
2 know 84 46.428571
3 gen 80 44.875000
4 spat 75 63.600000
5 math 26 62.307692
6 Overall 218 61.513761
```
Note that because the version of GPT used for scoring differs from that of the official server, the scores obtained with VLMEvalKit may differ slightly.
## MMMU_DEV_VAL
```
torchrun --nproc-per-node=8 run.py --data MMMU_DEV_VAL --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:20:04,977 - Evaluation - INFO - split Overall ... Science Tech & Engineering
0 dev 0.48 ... 0.36 0.428571
1 validation 0.45 ... 0.38 0.371429
```
## MathVista_MINI
```
torchrun --nproc-per-node=8 run.py --data MathVista_MINI --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:39:25,736 - Evaluation - INFO - Task&Skill tot prefetch hit prefetch_rate acc
0 Overall 1000 545 521 54.500000 52.100000
1 scientific reasoning 122 89 70 72.950820 57.377049
2 textbook question answering 158 101 86 63.924051 54.430380
3 numeric commonsense 144 39 41 27.083333 28.472222
4 arithmetic reasoning 353 147 198 41.643059 56.090652
5 visual question answering 179 91 88 50.837989 49.162011
6 geometry reasoning 239 144 94 60.251046 39.330544
7 algebraic reasoning 281 170 109 60.498221 38.790036
8 geometry problem solving 208 135 79 64.903846 37.980769
9 math word problem 186 70 118 37.634409 63.440860
10 logical reasoning 37 18 5 48.648649 13.513514
11 figure question answering 269 148 150 55.018587 55.762082
12 statistical reasoning 301 143 196 47.508306 65.116279
```
Note that because the version of GPT used for answer extraction differs from that of the official code, the scores obtained with VLMEvalKit may differ slightly.
## ScienceQA_TEST
```
torchrun --nproc-per-node=8 run.py --data ScienceQA_TEST --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 19:10:03,279 - Evaluation - INFO - Score:
2024-04-29 19:10:03,279 - Evaluation - INFO - split Overall ... Weather and climate World religions
0 test 0.940506 ... 0.948276 1.0
```
## HallusionBench
```
torchrun --nproc-per-node=8 run.py --data HallusionBench --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:21:37,606 - Evaluation - INFO - Score:
2024-04-29 18:21:37,606 - Evaluation - INFO - split aAcc fAcc qAcc
0 Overall 66.771819 40.173410 40.879121
1 VS 71.944444 40.517241 51.123596
2 VD 63.620981 40.000000 34.296029
3 VS_ocr 59.259259 38.461538 22.222222
4 VD_video 51.176471 10.416667 13.043478
5 VS_map 56.250000 18.181818 18.750000
6 VS_chart 81.538462 50.000000 72.368421
7 VS_table 75.892857 46.428571 55.813953
8 VD_figure 77.500000 65.853659 53.846154
9 VD_illusion 66.666667 41.935484 34.722222
10 VD_math 56.481481 25.000000 27.777778
11 VD_ocr 78.651685 58.139535 58.139535
```
## TextVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data TextVQA_VAL --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:41:32,873 - Evaluation - INFO - VQA Eval Finished. Saved to ./InternVL-Chat-V1-5/InternVL-Chat-V1-5_TextVQA_VAL_acc.csv.
2024-04-29 18:41:32,873 - Evaluation - INFO - Overall
0 80.488
```
## ChartQA_TEST
```
torchrun --nproc-per-node=8 run.py --data ChartQA_TEST --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:44:05,458 - Evaluation - INFO - VQA Eval Finished. Saved to ./InternVL-Chat-V1-5/InternVL-Chat-V1-5_ChartQA_TEST_acc.csv.
2024-04-29 18:44:05,458 - Evaluation - INFO - test_human test_augmented Overall
0 73.04 94.32 83.68
```
## AI2D_TEST
```
torchrun --nproc-per-node=8 run.py --data AI2D_TEST --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 19:02:17,402 - Evaluation - INFO - Score:
2024-04-29 19:02:17,402 - Evaluation - INFO - split Overall atomStructure ... typesOf volcano waterCNPCycle
0 none 0.806995 0.75 ... 0.752187 1.0 0.727273
```
## LLaVABench
```
torchrun --nproc-per-node=8 run.py --data LLaVABench --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
Processing ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0:00:00 60/60 100%
split Relative Score (main) VLM Score GPT4 Score
0 overall 82.0 63.7 77.7
1 conv 82.9 74.1 89.4
2 detail 72.0 48.0 66.7
3 complex 86.0 65.7 76.4
```
## DocVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data DocVQA_VAL --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 19:18:54,661 - Evaluation - INFO - VQA Eval Finished. Saved to ./InternVL-Chat-V1-5/InternVL-Chat-V1-5_DocVQA_VAL_acc.csv.
2024-04-29 19:18:54,661 - Evaluation - INFO - val Overall
0 90.500323 90.500323
```
## InfoVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data InfoVQA_VAL --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:44:50,851 - Evaluation - INFO - VQA Eval Finished. Saved to ./InternVL-Chat-V1-5/InternVL-Chat-V1-5_InfoVQA_VAL_acc.csv.
2024-04-29 18:44:50,851 - Evaluation - INFO - val Overall
0 71.920408 71.920408
```
## OCRBench
```
torchrun --nproc-per-node=8 run.py --data OCRBench --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:56:05,437 - Evaluation - INFO - Score:
2024-04-29 18:56:05,437 - Evaluation - INFO - Text Recognition:238
2024-04-29 18:56:05,437 - Evaluation - INFO - Scene Text-centric VQA:178
2024-04-29 18:56:05,437 - Evaluation - INFO - Doc-oriented VQA:151
2024-04-29 18:56:05,438 - Evaluation - INFO - Key Information Extraction:153
2024-04-29 18:56:05,438 - Evaluation - INFO - Handwritten Mathematical Expression Recognition:4
2024-04-29 18:56:05,438 - Evaluation - INFO - Final Score:724
2024-04-29 18:56:05,438 - Evaluation - INFO - Final Score Norm:72.4
```
## RealWorldQA
```
torchrun --nproc-per-node=8 run.py --data RealWorldQA --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:56:43,192 - Evaluation - INFO - Score:
2024-04-29 18:56:43,192 - Evaluation - INFO - split Overall
0 none 0.660131
```
## SEEDBench2_Plus
```
torchrun --nproc-per-node=8 run.py --data SEEDBench2_Plus --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-05-29 12:41:47,313 - Evaluation - INFO - split Overall chart map web
0 none 0.666227 0.650617 0.574969 0.79697
```
# How to Evaluate Mini-InternVL-Chat-2B-V1-5 using VLMEvalKit?
In this tutorial, we will provide a detailed guide on how to evaluate Mini-InternVL-Chat-2B-V1-5 using VLMEvalKit.
First of all, please follow this [guide](https://github.com/open-compass/VLMEvalKit/blob/main/docs/en/Quickstart.md) to install VLMEvalKit.
## MMBench_DEV_EN
```
torchrun --nproc-per-node=8 run.py --data MMBench_DEV_EN --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:38:26,074 - Evaluation - INFO - split Overall ... spatial_relationship structuralized_imagetext_understanding
0 dev 0.706186 ... 0.266667 0.423077
```
## MMBench_DEV_CN
```
torchrun --nproc-per-node=8 run.py --data MMBench_DEV_CN --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:38:10,864 - Evaluation - INFO - split Overall ... spatial_relationship structuralized_imagetext_understanding
0 dev 0.656357 ... 0.222222 0.307692
```
## MMStar
```
torchrun --nproc-per-node=8 run.py --data MMStar --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:38:37,502 - Evaluation - INFO - split Overall ... math science & technology
0 none 0.461333 ... 0.448 0.372
```
## MME
```
torchrun --nproc-per-node=8 run.py --data MME --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:38:42,360 - Evaluation - INFO - perception reasoning OCR ... posters scene text_translation
0 1475.888655 423.928571 147.5 ... 130.952381 151.0 170.0
```
## SEEDBench_IMG
```
torchrun --nproc-per-node=8 run.py --data SEEDBench_IMG --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:39:49,107 - Evaluation - INFO - split Overall ... Text Understanding Visual Reasoning
0 none 0.694491 ... 0.690476 0.731118
```
## MMVet
```
torchrun --nproc-per-node=8 run.py --data MMVet --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:38:53,665 - Evaluation - INFO - Category tot acc
0 rec 187 42.352941
1 ocr 108 42.500000
2 know 84 20.357143
3 gen 80 22.375000
4 spat 75 42.533333
5 math 26 18.461538
6 Overall 218 38.256881
```
Note that because the version of GPT used for scoring differs from that of the official server, the scores obtained with VLMEvalKit may differ slightly.
## MMMU_DEV_VAL
```
torchrun --nproc-per-node=8 run.py --data MMMU_DEV_VAL --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:39:23,721 - Evaluation - INFO - split Overall ... Science Tech & Engineering
0 dev 0.353333 ... 0.240000 0.342857
1 validation 0.376667 ... 0.286667 0.376190
```
## MathVista_MINI
```
torchrun --nproc-per-node=8 run.py --data MathVista_MINI --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
0 Overall 1000 520 411 52.000000 41.100000
1 scientific reasoning 122 91 54 74.590164 44.262295
2 textbook question answering 158 100 72 63.291139 45.569620
3 numeric commonsense 144 41 45 28.472222 31.250000
4 arithmetic reasoning 353 108 129 30.594901 36.543909
5 visual question answering 179 94 69 52.513966 38.547486
6 geometry reasoning 239 158 85 66.108787 35.564854
7 algebraic reasoning 281 180 104 64.056940 37.010676
8 geometry problem solving 208 149 76 71.634615 36.538462
9 math word problem 186 27 68 14.516129 36.559140
10 logical reasoning 37 24 4 64.864865 10.810811
11 figure question answering 269 150 126 55.762082 46.840149
12 statistical reasoning 301 139 159 46.179402 52.823920
```
Note that because the GPT version used for answer extraction differs from the one used in the official code, the scores reported by VLMEvalKit may differ slightly from the official results.
## ScienceQA_TEST
```
torchrun --nproc-per-node=8 run.py --data ScienceQA_TEST --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:42:24,271 - Evaluation - INFO - split Overall ... Weather and climate World religions
0 test 0.852256 ... 0.810345 0.0
```
## HallusionBench
```
torchrun --nproc-per-node=8 run.py --data HallusionBench --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:41:40,703 - Evaluation - INFO - split aAcc fAcc qAcc
0 Overall 59.411146 24.277457 28.791209
1 VS 61.111111 17.241379 34.269663
2 VD 58.375635 27.826087 25.270758
3 VD_illusion 57.638889 22.580645 19.444444
4 VS_ocr 53.703704 15.384615 11.111111
5 VS_map 54.687500 9.090909 15.625000
6 VS_chart 66.153846 15.000000 50.000000
7 VS_table 62.500000 28.571429 34.883721
8 VD_math 58.333333 11.111111 31.481481
9 VD_video 47.058824 12.500000 8.695652
10 VD_figure 65.000000 41.463415 28.205128
11 VD_ocr 75.280899 53.488372 51.162791
```
## TextVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data TextVQA_VAL --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:45:09,563 - Evaluation - INFO - Overall
0 70.452
```
## ChartQA_TEST
```
torchrun --nproc-per-node=8 run.py --data ChartQA_TEST --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:43:24,645 - Evaluation - INFO - test_augmented test_human Overall
0 91.68 54.88 73.28
```
## AI2D_TEST
```
torchrun --nproc-per-node=8 run.py --data AI2D_TEST --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:43:12,535 - Evaluation - INFO - split Overall atomStructure ... typesOf volcano waterCNPCycle
0 none 0.699482 0.625 ... 0.61516 0.6875 0.477273
```
## LLaVABench
```
torchrun --nproc-per-node=8 run.py --data LLaVABench --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
Processing ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0:00:00 60/60 100%
split Relative Score (main) VLM Score GPT4 Score
0 overall 61.0 47.7 78.2
1 complex 68.4 52.5 76.8
2 conv 59.9 53.5 89.4
3 detail 47.1 32.0 68.0
```
## DocVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data DocVQA_VAL --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:47:40,385 - Evaluation - INFO - val Overall
0 83.883006 83.883006
```
## InfoVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data InfoVQA_VAL --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:46:53,303 - Evaluation - INFO - val Overall
0 55.86691 55.86691
```
## OCRBench
```
torchrun --nproc-per-node=8 run.py --data OCRBench --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:45:30,929 - Evaluation - INFO - Score:
2024-05-24 23:45:30,929 - Evaluation - INFO - Text Recognition:222
2024-05-24 23:45:30,929 - Evaluation - INFO - Scene Text-centric VQA:163
2024-05-24 23:45:30,929 - Evaluation - INFO - Doc-oriented VQA:125
2024-05-24 23:45:30,929 - Evaluation - INFO - Key Information Extraction:139
2024-05-24 23:45:30,929 - Evaluation - INFO - Handwritten Mathematical Expression Recognition:5
2024-05-24 23:45:30,929 - Evaluation - INFO - Final Score:654
2024-05-24 23:45:30,929 - Evaluation - INFO - Final Score Norm:65.4
```
## RealWorldQA
```
torchrun --nproc-per-node=8 run.py --data RealWorldQA --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-25 00:13:34,645 - Evaluation - INFO - split Overall
0 none 0.579085
```
## SEEDBench2_Plus
```
torchrun --nproc-per-node=8 run.py --data SEEDBench2_Plus --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-29 12:31:50,587 - Evaluation - INFO - split Overall chart map web
0 none 0.588933 0.562963 0.482032 0.751515
```
# How to Evaluate Mini-InternVL-Chat-4B-V1-5 using VLMEvalKit?
In this tutorial, we will provide a detailed guide on how to evaluate Mini-InternVL-Chat-4B-V1-5 using VLMEvalKit.
First of all, please follow this [guide](https://github.com/open-compass/VLMEvalKit/blob/main/docs/en/Quickstart.md) to install VLMEvalKit.
## MMBench_DEV_EN
```
torchrun --nproc-per-node=8 run.py --data MMBench_DEV_EN --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:01:07,750 - Evaluation - INFO - split Overall ... spatial_relationship structuralized_imagetext_understanding
0 dev 0.764605 ... 0.355556 0.551282
```
## MMBench_DEV_CN
```
torchrun --nproc-per-node=8 run.py --data MMBench_DEV_CN --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:11:49,747 - Evaluation - INFO - split Overall ... spatial_relationship structuralized_imagetext_understanding
0 dev 0.699313 ... 0.244444 0.512821
```
## MMStar
```
torchrun --nproc-per-node=8 run.py --data MMStar --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:02:01,943 - Evaluation - INFO - split Overall ... math science & technology
0 none 0.527333 ... 0.516 0.408
```
## MME
```
torchrun --nproc-per-node=8 run.py --data MME --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:06:31,735 - Evaluation - INFO - perception reasoning OCR ... posters scene text_translation
0 1569.933774 492.857143 147.5 ... 153.061224 153.25 162.5
```
## SEEDBench_IMG
```
torchrun --nproc-per-node=8 run.py --data SEEDBench_IMG --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:17:50,620 - Evaluation - INFO - split Overall ... Text Understanding Visual Reasoning
0 none 0.721684 ... 0.559524 0.779456
```
## MMVet
```
torchrun --nproc-per-node=8 run.py --data MMVet --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 11:04:32,615 - Evaluation - INFO - Category tot acc
0 rec 187 43.636364
1 ocr 108 47.037037
2 know 84 26.904762
3 gen 80 27.625000
4 spat 75 43.066667
5 math 26 34.230769
6 Overall 218 41.972477
```
Note that because the GPT version used for scoring differs from the one used by the official server, the scores reported by VLMEvalKit may differ slightly from the official results.
## MMMU_DEV_VAL
```
torchrun --nproc-per-node=8 run.py --data MMMU_DEV_VAL --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:16:48,300 - Evaluation - INFO - split Overall ... Science Tech & Engineering
0 validation 0.457778 ... 0.4 0.404762
1 dev 0.480000 ... 0.4 0.371429
```
## MathVista_MINI
```
torchrun --nproc-per-node=8 run.py --data MathVista_MINI --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:56:36,722 - Evaluation - INFO - Task&Skill tot prefetch hit prefetch_rate acc
0 Overall 1000 560 537 56.000000 53.700000
1 scientific reasoning 122 86 71 70.491803 58.196721
2 textbook question answering 158 99 89 62.658228 56.329114
3 numeric commonsense 144 41 41 28.472222 28.472222
4 arithmetic reasoning 353 134 180 37.960340 50.991501
5 visual question answering 179 94 88 52.513966 49.162011
6 geometry reasoning 239 161 118 67.364017 49.372385
7 algebraic reasoning 281 189 139 67.259786 49.466192
8 geometry problem solving 208 153 105 73.557692 50.480769
9 math word problem 186 52 96 27.956989 51.612903
10 logical reasoning 37 18 4 48.648649 10.810811
11 figure question answering 269 162 159 60.223048 59.107807
12 statistical reasoning 301 177 203 58.803987 67.441860
```
Note that because the GPT version used for answer extraction differs from the one used in the official code, the scores reported by VLMEvalKit may differ slightly from the official results.
## ScienceQA_TEST
```
torchrun --nproc-per-node=8 run.py --data ScienceQA_TEST --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:14:56,970 - Evaluation - INFO - split Overall ... Weather and climate World religions
0 test 0.927119 ... 0.948276 1.0
```
## HallusionBench
```
torchrun --nproc-per-node=8 run.py --data HallusionBench --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:24:07,079 - Evaluation - INFO - split aAcc fAcc qAcc
0 Overall 62.460568 33.236994 32.747253
1 VD 58.544839 31.739130 24.187726
2 VS 68.888889 36.206897 46.067416
3 VD_illusion 59.027778 30.645161 23.611111
4 VD_math 55.555556 25.000000 27.777778
5 VS_ocr 61.111111 34.615385 25.925926
6 VD_figure 62.500000 41.463415 23.076923
7 VD_ocr 74.157303 51.162791 48.837209
8 VS_map 54.687500 27.272727 15.625000
9 VS_chart 76.153846 40.000000 64.473684
10 VS_table 72.321429 39.285714 48.837209
11 VD_video 50.000000 12.500000 7.246377
```
## TextVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data TextVQA_VAL --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:26:03,441 - Evaluation - INFO - Overall
0 72.886
```
## ChartQA_TEST
```
torchrun --nproc-per-node=8 run.py --data ChartQA_TEST --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:28:40,199 - Evaluation - INFO - test_augmented test_human Overall
0 93.2 68.8 81.0
```
## AI2D_TEST
```
torchrun --nproc-per-node=8 run.py --data AI2D_TEST --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:29:54,674 - Evaluation - INFO - split Overall atomStructure ... typesOf volcano waterCNPCycle
0 none 0.769754 0.875 ... 0.720117 0.875 0.5
```
## LLaVABench
```
torchrun --nproc-per-node=8 run.py --data LLaVABench --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
Processing ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0:00:00 60/60 100%
split Relative Score (main) VLM Score GPT4 Score
0 overall 68.3 49.7 72.7
1 complex 77.5 55.4 71.4
2 detail 67.0 40.7 60.7
3 conv 56.6 48.2 85.3
```
## DocVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data DocVQA_VAL --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:40:18,175 - Evaluation - INFO - val Overall
0 86.635414 86.635414
```
## InfoVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data InfoVQA_VAL --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:49:18,835 - Evaluation - INFO - val Overall
0 64.588708 64.588708
```
## OCRBench
```
torchrun --nproc-per-node=8 run.py --data OCRBench --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:31:09,980 - Evaluation - INFO - Score:
2024-05-29 04:31:09,980 - Evaluation - INFO - Text Recognition:194
2024-05-29 04:31:09,980 - Evaluation - INFO - Scene Text-centric VQA:160
2024-05-29 04:31:09,980 - Evaluation - INFO - Doc-oriented VQA:145
2024-05-29 04:31:09,980 - Evaluation - INFO - Key Information Extraction:133
2024-05-29 04:31:09,980 - Evaluation - INFO - Handwritten Mathematical Expression Recognition:6
2024-05-29 04:31:09,980 - Evaluation - INFO - Final Score:638
2024-05-29 04:31:09,980 - Evaluation - INFO - Final Score Norm:63.8
```
## RealWorldQA
```
torchrun --nproc-per-node=8 run.py --data RealWorldQA --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:34:10,091 - Evaluation - INFO - split Overall
0 none 0.601307
```
## SEEDBench2_Plus
```
torchrun --nproc-per-node=8 run.py --data SEEDBench2_Plus --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 12:33:20,074 - Evaluation - INFO - split Overall chart map web
0 none 0.625823 0.616049 0.537794 0.745455
```
# How to Fine-tune InternVL-Chat-V1-2 on a Custom Dataset
## 1. Prepare the Pre-trained Model
Before starting the second fine-tuning process, download the pre-trained model we provide. Two versions are available: [InternVL-Chat-V1-2](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-2) and [InternVL-Chat-V1-2-Plus](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-2-Plus). We recommend downloading the Plus version.
Use the following commands to download the desired model:
```shell
cd pretrained/
# pip install -U huggingface_hub
# Download OpenGVLab/InternVL-Chat-V1-2
huggingface-cli download --resume-download --local-dir-use-symlinks False OpenGVLab/InternVL-Chat-V1-2 --local-dir InternVL-Chat-V1-2
# Download OpenGVLab/InternVL-Chat-V1-2-Plus
huggingface-cli download --resume-download --local-dir-use-symlinks False OpenGVLab/InternVL-Chat-V1-2-Plus --local-dir InternVL-Chat-V1-2-Plus
```
## 2. Prepare Your Customized Training Data
After downloading the pre-trained model, prepare your customized SFT (Supervised Fine-Tuning) data. Create a JSON file in `internvl_chat/shell/data/` similar to [this example](./shell/data/internvl_1_2_finetune.json).
The format for the JSON file should be:
```json
{
"your-custom-dataset-1": {
"root": "path/to/the/image/",
"annotation": "path/to/the/jsonl/annotation",
"data_augment": false,
"repeat_time": 1,
"length": "number of your data"
},
...
}
```
Example:
```json
{
"sharegpt4v_instruct_gpt4-vision_cap100k": {
"root": "playground/data/",
"annotation": "playground/opensource/sharegpt4v_instruct_gpt4-vision_cap100k.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 102025
}
}
```
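The `length` field should match the number of samples in the corresponding JSONL file. Assuming one JSON object per line, a simple line count gives the value, for example:
```shell
# Each JSONL line is one sample, so the line count is the value for "length"
wc -l < playground/opensource/sharegpt4v_instruct_gpt4-vision_cap100k.jsonl
```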
## 3. Start Fine-tuning
Fine-tune the pre-trained models using either the [script for training the full LLM](../internvl_chat/shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune_continue.sh) or the [script for training the LoRA adapter](../internvl_chat/shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune_continue_lora.sh), depending on your available GPU resources.
Before fine-tuning, set the `--meta_path` to the path of the JSON file you created in the previous step. The default pre-trained model path in these shell scripts is `./pretrained/InternVL-Chat-V1-2`. Update it to `./pretrained/InternVL-Chat-V1-2-Plus` if you are using the Plus version.
> Note: Fine-tuning the full LLM requires 16 A100 80G GPUs, whereas fine-tuning the LoRA requires 2 A100 80G GPUs.
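If you are using the Plus version, one way to update the default pre-trained path is an in-place substitution over the chosen script; this is only a sketch, and editing the script by hand (including `--meta_path`) works just as well:
```shell
# Switch the default pre-trained path to the Plus weights (run once, or edit the script manually)
sed -i 's|\./pretrained/InternVL-Chat-V1-2|./pretrained/InternVL-Chat-V1-2-Plus|g' \
    shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune_continue.sh
```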
Commands for fine-tuning:
```sh
# Using 16 GPUs with SLURM system, fine-tune the full LLM
PARTITION='your partition' GPUS=16 sh shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune_continue.sh
# Using 2 GPUs, fine-tune the LoRA
CUDA_VISIBLE_DEVICES=0,1 sh shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune_continue_lora.sh
```
If you encounter any issues, please let us know, and we will update the training guide to improve its usability.
# How to Fine-tune the Mini-InternVL-Chat Series on a Custom Dataset
## 1. Prepare the Pre-trained Model
Before starting the second fine-tuning process, download the models we released. We provide two models: [Mini-InternVL-Chat-2B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5) and [Mini-InternVL-Chat-4B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-4B-V1-5).
Use the following commands to download the desired model:
```bash
huggingface-cli download --resume-download --local-dir-use-symlinks False OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --local-dir path/to/Mini-InternVL-Chat-2B-V1-5
huggingface-cli download --resume-download --local-dir-use-symlinks False OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --local-dir path/to/Mini-InternVL-Chat-4B-V1-5
```
## 2. Prepare Datasets
### Prepare Released Training Datasets
Refer to [this link](../internvl_chat#prepare-training-datasets) for details on preparing released training datasets.
### Prepare Your Customized Data
Create a JSONL file with annotations for your custom data in the following format:
```json
{"id": 0, "image": "image path relative to dataset path", "conversations": [{"from": "human", "value": "<image>\nyour question"}, {"from": "gpt", "value": "response"}]}
```
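For instance, a minimal two-sample annotation file could be created as follows (the directory name and image paths are placeholders; use your own layout):
```bash
# Hypothetical dataset location; adjust the paths to your own data
mkdir -p playground/data/my_dataset
cat > playground/data/my_dataset/annotation.jsonl << 'EOF'
{"id": 0, "image": "images/0001.jpg", "conversations": [{"from": "human", "value": "<image>\nDescribe the image."}, {"from": "gpt", "value": "A cat is sitting on a sofa."}]}
{"id": 1, "image": "images/0002.jpg", "conversations": [{"from": "human", "value": "<image>\nHow many people are in the image?"}, {"from": "gpt", "value": "Two."}]}
EOF
```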
If you want to train with your customized SFT data, merge your data with our [internvl_1_2_finetune](../internvl_chat/shell/data/internvl_1_2_finetune.json) data by adding your data's metadata to our [JSON file](../internvl_chat/shell/data/internvl_1_2_finetune.json). The format for organizing this JSON file is:
```json
{
"sharegpt4v_instruct_gpt4-vision_cap100k": {
"root": "playground/data/",
"annotation": "playground/opensource/sharegpt4v_instruct_gpt4-vision_cap100k.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 102025
},
"llava_instruct_150k_zh": {
"root": "playground/data/coco/",
"annotation": "playground/opensource/llava_instruct_150k_zh.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 157712
},
"sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k": {
"root": "playground/data/",
"annotation": "playground/opensource/sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 665058
},
"dvqa_train_200k": {
"root": "playground/data/dvqa/",
"annotation": "playground/opensource/dvqa_train_200k.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 200000
},
"chartqa_train_18k": {
"root": "playground/data/chartqa/",
"annotation": "playground/opensource/chartqa_train_18k.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 18317
},
"ai2d_train_12k": {
"root": "playground/data/ai2d/",
"annotation": "playground/opensource/ai2d_train_12k.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 12413
},
"docvqa_train_10k": {
"root": "playground/data/docvqa/",
"annotation": "playground/opensource/docvqa_train_10k.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 10211
},
"geoqa+": {
"root": "playground/data/geoqa+/",
"annotation": "playground/opensource/geoqa+.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 72318
},
"synthdog_en": {
"root": "playground/data/synthdog-en/",
"annotation": "playground/opensource/synthdog_en.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 29765
},
"your_new_dataset": {
"root": "path/to/images",
"annotation": "path/to/annotation_file",
"data_augment": false,
"repeat_time": 1,
"length": 499712
}
}
```
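Before launching training, it can help to verify that every `root` directory and `annotation` file referenced in the meta JSON exists. A possible sanity check, assuming `jq` is installed and using a hypothetical meta file name, is:
```bash
META=shell/data/internvl_1_2_finetune_custom.json  # hypothetical name; point this at your own meta file
jq -r 'to_entries[] | "\(.value.root)\t\(.value.annotation)"' "$META" |
while IFS=$'\t' read -r root ann; do
    [ -d "$root" ] || echo "Missing image root: $root"
    [ -f "$ann" ]  || echo "Missing annotation file: $ann"
done
```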
## 3. Start Fine-tuning
Fine-tune the released models using either the [script for Mini-InternVL-Chat-2B-V1-5](./internvl_chat/shell/internlm2_1_8b_dynamic/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune.sh) or the [script for Mini-InternVL-Chat-4B-V1-5](./internvl_chat/shell/phi3_3_8b_dynamic/internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune.sh). Set the `--meta_path` to the path of the JSON file you created in the last step and update `--model_name_or_path` in these shell scripts to `path/to/Mini-InternVL-Chat-2B-V1-5` or `path/to/Mini-InternVL-Chat-4B-V1-5`.
```bash
# Using 16 GPUs with SLURM system, fine-tune the full LLM
cd internvl_chat/
# Mini-InternVL-Chat-2B-V1-5
PARTITION='your partition' GPUS=16 sh shell/internlm2_1_8b_dynamic/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune.sh
# Mini-InternVL-Chat-4B-V1-5
PARTITION='your partition' GPUS=16 sh shell/phi3_3_8b_dynamic/internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune.sh
```
If you see the following log in the terminal, it means the training has started successfully:
![Training Started Successfully](https://github.com/G-z-w/InternVL/assets/95175307/d66a2c40-be4c-42c8-babf-052621d2995e)
For a complete example training log, refer to [this link](./training_log.txt).
## 4. Evaluate
Refer to [this link](./document/How_to_evaluate_internvl_chat_v1_5.md) for evaluation details.
# How to Use the InternVL API?
## 1. Official API of InternVL2
We encourage everyone to use our API for research. For better management, please submit the [English application form](https://docs.google.com/forms/d/e/1FAIpQLSfMCzhPr1OOEKau_6jwTU0EiZMSFckDo-HMlc_hUudhF_97rw/viewform?usp=sf_link) or the [Chinese application form](https://wj.qq.com/s2/14910502/25a4/) to obtain free API access.
## 2. Community-Hosted API of InternVL 1.5
https://rapidapi.com/adushar1320/api/internvl-chat
## 3. Examples
TBD
set -x
GPUS=${GPUS:-8}
BATCH_SIZE=${BATCH_SIZE:-16}
PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2}
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))
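# With the defaults above, GRADIENT_ACC = 16 / 2 / 8 = 1, so the effective
# global batch size is PER_DEVICE_BATCH_SIZE x GPUS x GRADIENT_ACC = 16.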
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export MASTER_PORT=34229
export TF_CPP_MIN_LOG_LEVEL=3
export LAUNCHER=pytorch
OUTPUT_DIR='/home/wanglch/projects/saves/InternVL/internvl2-26b/finetune_multi_dcu'
if [ ! -d "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
fi
# number of gpus: 8
# batch size per gpu: 2
# gradient accumulation steps: 1
# total batch size: 16
# epoch: 1
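# freeze_llm, freeze_mlp, and freeze_backbone are all True while use_llm_lora is enabled,
# so only the LoRA adapters injected into the LLM are trained; the base LLM, MLP projector,
# and vision backbone stay frozen.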
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
internvl/train/internvl_chat_finetune.py \
--model_name_or_path "/home/wanglch/projects/InternVL/InternVL2-26B" \
--conv_style "internlm2-chat" \
--output_dir ${OUTPUT_DIR} \
--meta_path "/home/wanglch/projects/InternVL/internvl_chat/shell/data/internvl_1_2_finetune_custom.json" \
--overwrite_output_dir True \
--force_image_size 448 \
--max_dynamic_patch 12 \
--down_sample_ratio 0.5 \
--drop_path_rate 0.0 \
--freeze_llm True \
--freeze_mlp True \
--freeze_backbone True \
--use_llm_lora 16 \
--vision_select_layer -1 \
--dataloader_num_workers 8 \
--fp16 True \
--num_train_epochs 1 \
--per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \
--gradient_accumulation_steps ${GRADIENT_ACC} \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 200 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0.05 \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--max_seq_length 4096 \
--do_train True \
--grad_checkpoint True \
--group_by_length True \
--dynamic_image_size True \
--use_thumbnail True \
--ps_version 'v2' \
--deepspeed "/home/wanglch/projects/InternVL/internvl_chat/zero_stage3_config.json" \
--report_to "tensorboard" \
2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"