#!/bin/bash
set -x

PARTITION=${PARTITION:-'INTERN4'}
# Enable alias expansion (off by default in non-interactive shells) so the `s1a` shortcut below works.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "it" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "jp" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "ar" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenetv2" --dataset_root ./data/imagenetv2/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet_sketch" --dataset_root ./data/imagenet-sketch/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet-a" --dataset_root ./data/imagenet-a/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet-r" --dataset_root ./data/imagenet-r/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "objectnet" --dataset_root ./data/objectnet-1.0/ \
--model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
#!/bin/bash
set -x

PARTITION=${PARTITION:-'INTERN4'}
# Enable alias expansion (off by default in non-interactive shells) so the `s1a` shortcut below works.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
#!/bin/bash
set -x

PARTITION=${PARTITION:-'INTERN4'}
# Enable alias expansion (off by default in non-interactive shells) so the `s1a` shortcut below works.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=en
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=es
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=fr
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=zh
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=it
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=ko
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=ru
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
--pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=jp
#!/bin/bash
set -x

PARTITION=${PARTITION:-'INTERN4'}
# Enable alias expansion (off by default in non-interactive shells) so the `s1a` shortcut below works.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "birdsnap" --dataset_root ./data/birdsnap/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "cifar10" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "cifar100" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "food101" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "sun397" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "cars" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "fgvc_aircraft" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "dtd" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "pets" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "caltech101" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "mnist" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "stl10" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "eurosat" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "gtsrb" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "country211" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "pcam" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "renderedsst2" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "fer2013" --dataset_root ./data/fer2013 --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "voc2007" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "vtab/flowers" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
--dataset "vtab/resisc45" --dataset_root ./data/ --model internvl_g_classification_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
#!/bin/bash
set -x

PARTITION=${PARTITION:-'INTERN4'}
# Enable alias expansion (off by default in non-interactive shells) so the `s1a` shortcut below works.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "it" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "jp" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "ar" \
--task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenetv2" --dataset_root ./data/imagenetv2/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet_sketch" --dataset_root ./data/imagenet-sketch/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet-a" --dataset_root ./data/imagenet-a/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "imagenet-r" --dataset_root ./data/imagenet-r/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
--task "zeroshot_classification" --dataset "objectnet" --dataset_root ./data/objectnet-1.0/ \
--model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
#!/bin/bash
set -x

PARTITION=${PARTITION:-'INTERN4'}
# Enable alias expansion (off by default in non-interactive shells) so the `s1a` shortcut below works.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json
#!/bin/bash
set -x

PARTITION=${PARTITION:-'INTERN4'}
# Enable alias expansion (off by default in non-interactive shells) so the `s1a` shortcut below works.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval_hf \
--pretrained ./work_dirs/internvl_stage2_finetune_flickrcn_364_bs1024_ep10 \
--output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_g_retrieval_hf \
--pretrained ./work_dirs/internvl_stage2_finetune_flickrcn_364_bs1024_ep10 \
--output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval_hf \
--pretrained ./work_dirs/internvl_stage2_finetune_flickr_364_bs1024_ep10 \
--output result.json
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_g_retrieval_hf \
--pretrained ./work_dirs/internvl_stage2_finetune_flickr_364_bs1024_ep10 \
--output result.json
#!/bin/bash
set -x

PARTITION=${PARTITION:-'INTERN4'}
# Enable alias expansion (off by default in non-interactive shells) so the `s1a` shortcut below works.
shopt -s expand_aliases
alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=en
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=es
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=fr
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=zh
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=it
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=ko
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=ru
s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
--dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
--pretrained ./pretrained/internvl_14b_224px --output result_g.json --language=jp
#!/usr/bin/env python
"""Tests for `clip_benchmark` package."""
import os

# Hide all GPUs so the test runs on CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

from clip_benchmark.cli import run


class base_args:
    # Namespace-like container that mirrors the arguments of the `eval` CLI command.
    dataset = 'dummy'
    split = 'test'
    model = 'ViT-B-32-quickgelu'
    pretrained = 'laion400m_e32'
    task = 'zeroshot_classification'
    amp = False
    num_workers = 4
    batch_size = 64
    dataset_root = 'root'
    output = 'result.json'
    verbose = True
    root = 'root'
    annotation_file = ''
    seed = 0
    skip_load = False
    language = 'en'
    model_cache_dir = None
    cupl = False
    save_clf = None
    load_clfs = []
    model_type = 'open_clip'
    wds_cache_dir = None
    which = 'eval'
    skip_existing = False


def test_base():
    run(base_args)
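# The test can be run with pytest, e.g. `pytest -k test_base` (CPU-only, since
# CUDA_VISIBLE_DEVICES is cleared above); the exact invocation depends on where
# this file lives in your checkout.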
[tox]
envlist = py36, py37, py38, flake8

[travis]
python =
    3.8: py38
    3.7: py37
    3.6: py36

[testenv:flake8]
basepython = python
deps = flake8
commands = flake8 clip_benchmark tests

[testenv]
setenv =
    PYTHONPATH = {toxinidir}
commands = python setup.py test
# How to Deploy a Local Demo?
## Launch a Controller
```shell
# run the command in the `internvl_chat_llava` folder
python -m llava.serve.controller --host 0.0.0.0 --port 10000
```
## Launch a Gradio Web Server
```shell
# run the command in the `internvl_chat_llava` folder
python -m llava.serve.gradio_web_server --controller http://localhost:10000 --model-list-mode reload
```
## Launch a Model Worker
### Options
- `--host <host_address>`: Specifies the host address on which the model worker will run. Use "0.0.0.0" to allow connections from any IP address.
- `--controller <controller_address>`: Specifies the address of the controller node responsible for managing model deployment and execution.
- `--port <port_number>`: Specifies the port number on which the model worker will listen for incoming requests.
- `--worker <worker_address>`: Specifies the address that this model worker registers with the controller, so the controller can route requests to it.
- `--model-path <model_file_path>`: Specifies the file path to the machine learning model to be deployed and executed.
### Additional Options
#### Multi-GPU Deployment
To enable deployment on multiple GPUs, use the `--device auto` option. This allows the script to utilize all available GPU devices for model execution automatically.
#### Quantization Deployment
To enable quantized deployment, use the `--load-8bit` option. This loads the model with 8-bit quantization, reducing GPU memory usage.
__Note: The `--device auto` and `--load-8bit` options cannot be used simultaneously.__
```shell
# OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-7B
# run the command in the `internvl_chat_llava` folder
python -m llava.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-7B
# OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-13B
# run the command in the `internvl_chat_llava` folder
python -m llava.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40001 --worker http://localhost:40001 --model-path OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-13B
# OpenGVLab/InternVL-Chat-V1-1
# run the command in the `internvl_chat` folder
python -m internvl.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40002 --worker http://localhost:40002 --model-path OpenGVLab/InternVL-Chat-V1-1
# OpenGVLab/InternVL-Chat-V1-2
# run the command in the `internvl_chat` folder
python -m internvl.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40003 --worker http://localhost:40003 --model-path OpenGVLab/InternVL-Chat-V1-2
# OpenGVLab/InternVL-Chat-V1-2-Plus
# run the command in the `internvl_chat` folder
python -m internvl.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40004 --worker http://localhost:40004 --model-path OpenGVLab/InternVL-Chat-V1-2-Plus
# OpenGVLab/InternVL-Chat-V1-5
# run the command in the `internvl_chat` folder
python -m internvl.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40005 --worker http://localhost:40005 --model-path OpenGVLab/InternVL-Chat-V1-5
```
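The `--device auto` and `--load-8bit` options described above can be appended to any of the worker commands (but not combined with each other). As a rough sketch reusing the ViT-6B-Vicuna-7B example from above (adjust the model path and ports to your own deployment):
```shell
# multi-GPU deployment: let the worker use all available GPUs
# run the command in the `internvl_chat_llava` folder
python -m llava.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-7B --device auto
# 8-bit quantized deployment: lower GPU memory usage (do not combine with --device auto)
# run the command in the `internvl_chat_llava` folder
python -m llava.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path OpenGVLab/InternVL-Chat-ViT-6B-Vicuna-7B --load-8bit
```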
# How to Evaluate InternVL-Chat-V1-5?
In this tutorial, we will provide a detailed guide on how to replicate the results presented in the InternVL 1.5 technical report.
The results are shown in the table below.
_If you encounter any difficulties while testing with this guide, please let us know. Thank you._
> Note that if you are aiming for an exact replication, please use this code repository and follow the testing methods outlined below; otherwise, using the [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) would be more convenient.
![image](https://github.com/OpenGVLab/InternVL/assets/23737120/8b62d429-c689-426a-9267-2727b6430b6e)
## Model Preparation
| model name | type | download | #param |
| ------------------ | ---- | ----------------------------------------------------------------- | :----: |
| InternVL-Chat-V1-5 | MLLM | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5) | 25.5B |
Please download the above model weights and place them in the `pretrained/` folder.
```sh
cd pretrained/
# pip install -U huggingface_hub
huggingface-cli download --resume-download --local-dir-use-symlinks False OpenGVLab/InternVL-Chat-V1-5 --local-dir InternVL-Chat-V1-5
```
The directory structure is:
```
pretrained
└── InternVL-Chat-V1-5
```
## OCR-related Benchmarks
Our evaluation is divided into three parts. First, we focus on the OCR-related datasets: DocVQA, ChartQA, InfoVQA, TextVQA, and OCRBench. Let's test each of these datasets in turn.
### DocVQA val & test
<details>
<summary>click to expand</summary>
1. Download the DocVQA dataset using the following instructions:
```shell
mkdir -p data/docvqa && cd data/docvqa
# download images and annotations
wget https://datasets.cvc.uab.es/rrc/DocVQA/train.tar.gz --no-check-certificate # (optional)
wget https://datasets.cvc.uab.es/rrc/DocVQA/val.tar.gz --no-check-certificate
wget https://datasets.cvc.uab.es/rrc/DocVQA/test.tar.gz --no-check-certificate
# unzip files
tar -zxvf train.tar.gz
tar -zxvf val.tar.gz
tar -zxvf test.tar.gz
# download converted jsonl files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/test.jsonl
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── docvqa
│ ├── test
│ ├── test.jsonl
│ ├── train
│ ├── train.jsonl
│ ├── val
│ └── val.jsonl
```
3. Test the model with the following commands:
We use a maximum of `18 tiles` to test the DocVQA dataset.
```shell
# evaluation on the val set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 vqa-docvqa-val --dynamic --max-num 18
# evaluation on the test set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 vqa-docvqa-test --dynamic --max-num 18
```
The result of the validation set is:
```
Overall ANLS: 0.9049
```
For the test set, the test results need to be submitted to the [testing server](https://rrc.cvc.uab.es/?ch=17&com=tasks).
</details>
### ChartQA test
<details>
<summary>click to expand</summary>
1. Download the ChartQA dataset using the following instructions:
```shell
mkdir -p data/chartqa && cd data/chartqa
# download images from https://drive.google.com/file/d/1Lm_w6zeET1Hyl_9ks6w5nEsgpoyPHalV/view
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/train_human.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/train_augmented.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/test_human.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/test_augmented.jsonl
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── chartqa
│ ├── ChartQA Dataset
│ │ ├── test
│ │ ├── train
│ │ └── val
│ ├── test_augmented.jsonl
│ ├── test_human.jsonl
│ ├── train_augmented.jsonl
│ └── train_human.jsonl
```
3. Test the model with the following commands:
We use a maximum of `12 tiles` to test the ChartQA dataset.
```shell
# evaluation on the test set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 vqa-chartqa-test --dynamic --max-num 12
```
The result of the test set is:
```
['chartqa_test_human', {'relaxed_accuracy': 0.736}]
['chartqa_test_augmented', {'relaxed_accuracy': 0.9408}]
# the average score = (73.6 + 94.08) / 2 = 83.8
```
</details>
### InfoVQA val & test
<details>
<summary>click to expand</summary>
1. Download the InfoVQA dataset using the following instructions:
```shell
mkdir -p data/infographicsvqa && cd data/infographicsvqa
# download images and annotations from https://rrc.cvc.uab.es/?ch=17&com=downloads
# infographicsVQA_test_v1.0.json, infographicsVQA_val_v1.0_withQT.json, infographicVQA_train_v1.0.json
# download converted files
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/infographicsvqa_val.jsonl -O val.jsonl
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/infographicsvqa_test.jsonl -O test.jsonl
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── infographicsvqa
│ ├── infographicsvqa_images
│ ├── infographicsVQA_test_v1.0.json
│ ├── infographicsVQA_val_v1.0_withQT.json
│ ├── infographicVQA_train_v1.0.json
│ ├── test.jsonl
│ └── val.jsonl
```
3. Test the model with the following commands:
We use a maximum of `24 tiles` to test the InfoVQA dataset.
```shell
# evaluation on the val set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 vqa-infovqa-val --dynamic --max-num 24
# evaluation on the test set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 vqa-infovqa-test --dynamic --max-num 24
```
The result of the val set is:
```
Overall ANLS: 0.7235
```
For the test set, the test results need to be submitted to the [testing server](https://rrc.cvc.uab.es/?ch=17&com=tasks).
</details>
### TextVQA val
<details>
<summary>click to expand</summary>
1. Download the TextVQA dataset using the following instructions:
```shell
mkdir -p data/textvqa && cd data/textvqa
# download images
wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip && unzip train_val_images.zip
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train_questions.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val_questions.json
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/textvqa_val.jsonl
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/textvqa_val_llava.jsonl
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── textvqa
│ ├── textvqa_train_annotations.json
│ ├── textvqa_train.jsonl
│ ├── textvqa_train_questions.json
│ ├── textvqa_val_annotations.json
│ ├── textvqa_val.jsonl
│ ├── textvqa_val_llava.jsonl
│ ├── textvqa_val_questions.json
│ └── train_images
```
3. Test the model with the following commands:
We use a maximum of `24 tiles` to test the TextVQA dataset.
```shell
# evaluation on the val set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 vqa-textvqa-val --dynamic --max-num 24
```
The result of the val set is:
```
['pretrained/InternVL-Chat-V1-5', 'textvqa_val', 0.8061000000000043]
```
</details>
### OCRBench
<details>
<summary>click to expand</summary>
Please use [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) to evaluate OCRBench.
The command to test InternVL-Chat-V1-5 on OCRBench using VLMEvalKit is:
```
torchrun --nproc-per-node=8 run.py --data OCRBench --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 00:28:29,681 - Evaluation - INFO - Score:
2024-04-29 00:28:29,681 - Evaluation - INFO - Text Recognition:238
2024-04-29 00:28:29,681 - Evaluation - INFO - Scene Text-centric VQA:178
2024-04-29 00:28:29,681 - Evaluation - INFO - Doc-oriented VQA:151
2024-04-29 00:28:29,681 - Evaluation - INFO - Key Information Extraction:153
2024-04-29 00:28:29,681 - Evaluation - INFO - Handwritten Mathematical Expression Recognition:4
2024-04-29 00:28:29,681 - Evaluation - INFO - Final Score:724
2024-04-29 00:28:29,681 - Evaluation - INFO - Final Score Norm:72.4
```
</details>
## General Multimodal Benchmarks
Next, we will test InternVL-Chat-V1-5 using 10 general multimodal benchmarks, which include MME, RealWorldQA, AI2D, MMMU, MMBench-EN, MMBench-CN, CCBench, MMVet, SEED, and HallusionBench.
### MME
<details>
<summary>click to expand</summary>
1. Download the MME dataset using the following instructions:
```shell
mkdir -p data/mme && cd data/mme
# 1. Download the data following the official instructions [here](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation).
# 2. Put the downloaded images under `MME_Benchmark_release_version`.
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── mme
│ └── MME_Benchmark_release_version
```
3. Run single-GPU inference and evaluation:
We use a maximum of `12 tiles` to test the MME dataset.
```shell
# evaluation on MME
GPUS=1 sh evaluate.sh pretrained/InternVL-Chat-V1-5 mme --dynamic --max-num 12
```
The result of MME is:
```
total score: 1658.3683473389356
existence score: 190.0
count score: 175.0
position score: 171.66666666666669
color score: 178.33333333333331
posters score: 173.8095238095238
celebrity score: 142.05882352941177
scene score: 156.5
landmark score: 179.5
artwork score: 144.0
OCR score: 147.5
=========== Cognition ===========
total score: 533.5714285714286
commonsense_reasoning score: 133.57142857142858
numerical_calculation score: 117.5
text_translation score: 185.0
code_reasoning score: 97.5
# 1658.3683473389356 + 533.5714285714286 = 2191.939775910364
```
</details>
### RealWorldQA
<details>
<summary>click to expand</summary>
Please use [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) to evaluate RealWorldQA.
The command to test InternVL-Chat-V1-5 on RealWorldQA using VLMEvalKit is:
```
torchrun --nproc-per-node=8 run.py --data RealWorldQA --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 00:35:13,282 - Evaluation - INFO - Score:
2024-04-29 00:35:13,282 - Evaluation - INFO - split Overall
0 none 0.660131
```
</details>
### AI2D test
<details>
<summary>click to expand</summary>
1. Download the AI2D dataset using the following instructions:
```shell
mkdir -p data/ai2diagram && cd data/ai2diagram
# download converted files
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/ai2d_test_vlmevalkit.jsonl -O test_vlmevalkit.jsonl
wget https://huggingface.co/OpenGVLab/InternVL/resolve/main/AI2D_TEST.zip && unzip AI2D_TEST.zip
# download images from Google drive (optional, provided by InternLM-XComposer)
# https://drive.google.com/file/d/1dqqa3MnrxMXaU_K9JA6C83je32ibwdOY/view?usp=sharing
# images should be placed in `data/ai2diagram/ai2d/abc_images` and `data/ai2diagram/ai2d/images`
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── ai2diagram
│ ├── test_vlmevalkit.jsonl
│ ├── ai2d # (optional)
│ │ ├── abc_images
│ │ └── images
│ └── AI2D_TEST
```
3. Test the model with the following commands:
We use a maximum of `6 tiles` to test the AI2D dataset.
```shell
# evaluation on the test set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 vqa-ai2d-test --dynamic
```
The result of AI2D is:
```
ai2diagram_test {'accuracy': 0.8073186528497409}
```
</details>
### MMMU val
<details>
<summary>click to expand</summary>
1. The evaluation code will automatically download the dataset from HuggingFace.
2. Test the model with the following commands:
```
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 mmmu-val --dynamic
```
The result of MMMU val is:
```
{'Overall-Art and Design': {'num': 120, 'acc': 0.608}, 'Art': {'num': 30, 'acc': 0.7}, 'Art_Theory': {'num': 30, 'acc': 0.8}, 'Design': {'num': 30, 'acc': 0.767}, 'Music': {'num': 30, 'acc': 0.167}, 'Overall-Business': {'num': 150, 'acc': 0.413}, 'Accounting': {'num': 30, 'acc': 0.467}, 'Economics': {'num': 30, 'acc': 0.4}, 'Finance': {'num': 30, 'acc': 0.4}, 'Manage': {'num': 30, 'acc': 0.4}, 'Marketing': {'num': 30, 'acc': 0.4}, 'Overall-Science': {'num': 150, 'acc': 0.38}, 'Biology': {'num': 30, 'acc': 0.6}, 'Chemistry': {'num': 30, 'acc': 0.233}, 'Geography': {'num': 30, 'acc': 0.4}, 'Math': {'num': 30, 'acc': 0.333}, 'Physics': {'num': 30, 'acc': 0.333}, 'Overall-Health and Medicine': {'num': 150, 'acc': 0.433}, 'Basic_Medical_Science': {'num': 30, 'acc': 0.5}, 'Clinical_Medicine': {'num': 30, 'acc': 0.5}, 'Diagnostics_and_Laboratory_Medicine': {'num': 30, 'acc': 0.333}, 'Pharmacy': {'num': 30, 'acc': 0.367}, 'Public_Health': {'num': 30, 'acc': 0.467}, 'Overall-Humanities and Social Science': {'num': 120, 'acc': 0.617}, 'History': {'num': 30, 'acc': 0.633}, 'Literature': {'num': 30, 'acc': 0.8}, 'Sociology': {'num': 30, 'acc': 0.567}, 'Psychology': {'num': 30, 'acc': 0.467}, 'Overall-Tech and Engineering': {'num': 210, 'acc': 0.362}, 'Agriculture': {'num': 30, 'acc': 0.567}, 'Architecture_and_Engineering': {'num': 30, 'acc': 0.267}, 'Computer_Science': {'num': 30, 'acc': 0.367}, 'Electronics': {'num': 30, 'acc': 0.3}, 'Energy_and_Power': {'num': 30, 'acc': 0.333}, 'Materials': {'num': 30, 'acc': 0.467}, 'Mechanical_Engineering': {'num': 30, 'acc': 0.233}, 'Overall': {'num': 900, 'acc': 0.452}}
```
</details>
### MMBench-EN & CN test
<details>
<summary>click to expand</summary>
1. Download the MMBench dataset using the following instructions:
```
mkdir -p data/mmbench && cd data/mmbench
# download csv files of mmbench
wget http://opencompass.openxlab.space/utils/MMBench/CCBench_legacy.tsv
wget https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_20230712.tsv
wget https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_cn_20231003.tsv
wget https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_en_20231003.tsv
wget https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_test_cn_20231003.tsv
wget https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_test_en_20231003.tsv
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── mmbench
│ ├── CCBench_legacy.tsv
│ ├── mmbench_dev_20230712.tsv
│ ├── mmbench_dev_cn_20231003.tsv
│ ├── mmbench_dev_en_20231003.tsv
│ ├── mmbench_test_cn_20231003.tsv
│ └── mmbench_test_en_20231003.tsv
```
3. Test the model with the following commands:
We use a maximum of `6 tiles` to test the MMBench dataset.
```shell
# evaluation on the test-en set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 mmbench-test-en --dynamic
# evaluation on the test-cn set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 mmbench-test-cn --dynamic
```
Submit the result to the [test server](https://mmbench.opencompass.org.cn). The result of MMBench is:
```
# result of the test-en set
A_Overall (test) 0.8217488789237668
# result of the test-cn set
A_Overall (test) 0.8195067264573991
```
</details>
### CCBench dev
<details>
<summary>click to expand</summary>
1. See the `MMBench-EN & CN test` section above to prepare the CCBench data.
2. Test the model with the following commands:
We use a maximum of `6 tiles` to test the CCBench dataset.
```shell
# evaluation on the dev set
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 ccbench-dev --dynamic
```
Submit the result to the [test server](https://mmbench.opencompass.org.cn). The result of CCBench is:
```
A_Overall (dev) 0.7
```
</details>
### MMVet
<details>
<summary>click to expand</summary>
1. Download the MMVet dataset using the following instructions:
```
mkdir -p data/mm-vet && cd data/mm-vet
wget https://github.com/yuweihao/MM-Vet/releases/download/v1/mm-vet.zip
unzip mm-vet.zip
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/llava-mm-vet.jsonl
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── mm-vet
│ ├── images
│ └── llava-mm-vet.jsonl
```
3. Test the model with the following commands:
We use a maximum of `6 tiles` to test the MMVet dataset.
```shell
# evaluation on MMVet
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 mmvet --dynamic
```
Submit the result to the [test server](https://huggingface.co/spaces/whyu/MM-Vet_Evaluator). The result of MMVet is:
```
total
62.7
```
</details>
### SEED Image
<details>
<summary>click to expand</summary>
1. Download the SEED dataset using the following instructions:
```
mkdir -p data/SEED && cd data/SEED
# 1. Follow the official instructions [Data Preparation for SEED-Bench-1](https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md#data-preparation-for-seed-bench-1)
# to download the images and the videos. Put images under `./data/SEED/SEED-Bench-image`.
# 2. Extract the middle frame from each downloaded video and put the frames under `./data/SEED/SEED-Bench-image`.
# LLaVA provided the script [`extract_video_frames.py`](../internvl_chat/tools/extract_video_frames.py), modified from the official one.
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/seed.jsonl
cd ../..
```
2. After preparation is complete, the directory structure is:
```
data
├── SEED
│ ├── SEED-Bench-image
│ └── seed.jsonl
```
3. Test the model with the following commands:
```shell
sh evaluate.sh pretrained/InternVL-Chat-V1-5 seed --dynamic
```
The result is:
```
Acc@1: 0.6999444135630906
length: 17990
Accuracy for each data type:
Data type Scene Understanding: 80.37%
Data type Instance Identity: 80.45%
Data type Instance Location: 78.03%
Data type Instance Attributes: 72.39%
Data type Instances Counting: 69.19%
Data type Spatial Relation: 59.82%
Data type Instance Interaction: 77.32%
Data type Visual Reasoning: 78.85%
Data type Text Understanding: 55.81%
Data type Action Recognition: 54.08%
Data type Action Prediction: 44.82%
Data type Procedure Understanding: 40.18%
Total accuracy: 69.99%
Image accuracy: 75.99%
Video accuracy: 47.27%
```
</details>
### HallusionBench
<details>
<summary>click to expand</summary>
Please use [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) to evaluate HallusionBench.
The command to test InternVL-Chat-V1-5 on HallusionBench using VLMEvalKit is:
```
torchrun --nproc-per-node=8 run.py --data HallusionBench --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 00:46:23,688 - Evaluation - INFO - Score:
2024-04-29 00:46:23,688 - Evaluation - INFO - split aAcc fAcc qAcc
0 Overall 66.771819 40.173410 40.879121
1 VD 63.620981 40.000000 34.296029
2 VS 71.944444 40.517241 51.123596
3 VD_figure 77.500000 65.853659 53.846154
4 VS_map 56.250000 18.181818 18.750000
5 VD_illusion 66.666667 41.935484 34.722222
6 VS_table 75.892857 46.428571 55.813953
7 VD_ocr 78.651685 58.139535 58.139535
8 VS_ocr 59.259259 38.461538 22.222222
9 VS_chart 81.538462 50.000000 72.368421
10 VD_video 51.176471 10.416667 13.043478
11 VD_math 56.481481 25.000000 27.777778
```
The final score reported in our technical report is the average: (66.771819 + 40.173410 + 40.879121) / 3 = 49.3
</details>
## Math Benchmark
Finally, we use a representative math dataset, MathVista, to test InternVL-Chat-V1-5.
### MathVista testmini
<details>
<summary>click to expand</summary>
1. Download the MathVista dataset using the following instructions:
```bash
mkdir -p data/MathVista && cd data/MathVista
wget https://huggingface.co/datasets/AI4Math/MathVista/raw/main/annot_testmini.json
cd ../..
```
2. Test the model with the following commands:
```shell
export OPENAI_API_KEY='your-openai-key'
GPUS=8 sh evaluate.sh pretrained/InternVL-Chat-V1-5 mathvista-testmini --dynamic
```
The result is:
```
Correct: 535, Total: 1000, Accuracy: 53.5%
1000
Number of test problems: 1000
Type: [question_type]
[free_form]: 47.17% (217/460)
[multi_choice]: 58.89% (318/540)
Type: [answer_type]
[float]: 0.00% (0/40)
[integer]: 51.67% (216/418)
[text]: 58.89% (318/540)
[list]: 50.00% (1/2)
Type: [language]
[english]: 53.31% (499/936)
[chinese]: 56.45% (35/62)
[persian]: 50.00% (1/2)
```
</details>
# How to Evaluate InternVL-Chat-V1-5 using VLMEvalKit?
In this tutorial, we will provide a detailed guide on how to evaluate InternVL-Chat-V1-5 using VLMEvalKit.
First of all, please follow this [guide](https://github.com/open-compass/VLMEvalKit/blob/main/docs/en/Quickstart.md) to install VLMEvalKit.
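For reference, a minimal install sketch (assuming the standard editable pip install; the linked Quickstart is authoritative and may list additional dependencies):
```
git clone https://github.com/open-compass/VLMEvalKit.git
cd VLMEvalKit
pip install -e .
```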
## MMBench_DEV_EN
```
torchrun --nproc-per-node=8 run.py --data MMBench_DEV_EN --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:24:58,395 - Evaluation - INFO - split Overall ... spatial_relationship structuralized_imagetext_understanding
0 dev 0.808419 ... 0.422222 0.628205
```
## MMBench_DEV_CN
```
torchrun --nproc-per-node=8 run.py --data MMBench_DEV_CN --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:26:05,209 - Evaluation - INFO - split Overall ... spatial_relationship structuralized_imagetext_understanding
0 dev 0.803265 ... 0.377778 0.615385
```
## MMStar
```
torchrun --nproc-per-node=8 run.py --data MMStar --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:21:56,491 - Evaluation - INFO - split Overall ... math science & technology
0 none 0.572667 ... 0.564 0.408
```
## MME
```
torchrun --nproc-per-node=8 run.py --data MME --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:42:43,864 - Evaluation - INFO - Score:
2024-04-29 18:42:43,864 - Evaluation - INFO - perception reasoning OCR ... posters scene text_translation
0 1641.915766 519.642857 147.5 ... 171.768707 156.5 185.0
```
## SEEDBench_IMG
```
torchrun --nproc-per-node=8 run.py --data SEEDBench_IMG --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:58:54,973 - Evaluation - INFO - Score:
2024-04-29 18:58:54,973 - Evaluation - INFO - split Overall ... Text Understanding Visual Reasoning
0 none 0.757167 ... 0.440476 0.806647
```
## MMVet
```
torchrun --nproc-per-node=8 run.py --data MMVet --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:32:38,748 - Evaluation - INFO - Score:
2024-04-29 18:32:38,748 - Evaluation - INFO - Category tot acc
0 rec 187 61.818182
1 ocr 108 68.981481
2 know 84 46.428571
3 gen 80 44.875000
4 spat 75 63.600000
5 math 26 62.307692
6 Overall 218 61.513761
```
Note that because the version of GPT used for scoring differs from that of the official server, the scores obtained with VLMEvalKit may differ slightly.
## MMMU_DEV_VAL
```
torchrun --nproc-per-node=8 run.py --data MMMU_DEV_VAL --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:20:04,977 - Evaluation - INFO - split Overall ... Science Tech & Engineering
0 dev 0.48 ... 0.36 0.428571
1 validation 0.45 ... 0.38 0.371429
```
## MathVista_MINI
```
torchrun --nproc-per-node=8 run.py --data MathVista_MINI --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:39:25,736 - Evaluation - INFO - Task&Skill tot prefetch hit prefetch_rate acc
0 Overall 1000 545 521 54.500000 52.100000
1 scientific reasoning 122 89 70 72.950820 57.377049
2 textbook question answering 158 101 86 63.924051 54.430380
3 numeric commonsense 144 39 41 27.083333 28.472222
4 arithmetic reasoning 353 147 198 41.643059 56.090652
5 visual question answering 179 91 88 50.837989 49.162011
6 geometry reasoning 239 144 94 60.251046 39.330544
7 algebraic reasoning 281 170 109 60.498221 38.790036
8 geometry problem solving 208 135 79 64.903846 37.980769
9 math word problem 186 70 118 37.634409 63.440860
10 logical reasoning 37 18 5 48.648649 13.513514
11 figure question answering 269 148 150 55.018587 55.762082
12 statistical reasoning 301 143 196 47.508306 65.116279
```
Note that because the version of GPT used for answer extraction differs from that of the official code, the scores obtained with VLMEvalKit may differ slightly.
## ScienceQA_TEST
```
torchrun --nproc-per-node=8 run.py --data ScienceQA_TEST --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 19:10:03,279 - Evaluation - INFO - Score:
2024-04-29 19:10:03,279 - Evaluation - INFO - split Overall ... Weather and climate World religions
0 test 0.940506 ... 0.948276 1.0
```
## HallusionBench
```
torchrun --nproc-per-node=8 run.py --data HallusionBench --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:21:37,606 - Evaluation - INFO - Score:
2024-04-29 18:21:37,606 - Evaluation - INFO - split aAcc fAcc qAcc
0 Overall 66.771819 40.173410 40.879121
1 VS 71.944444 40.517241 51.123596
2 VD 63.620981 40.000000 34.296029
3 VS_ocr 59.259259 38.461538 22.222222
4 VD_video 51.176471 10.416667 13.043478
5 VS_map 56.250000 18.181818 18.750000
6 VS_chart 81.538462 50.000000 72.368421
7 VS_table 75.892857 46.428571 55.813953
8 VD_figure 77.500000 65.853659 53.846154
9 VD_illusion 66.666667 41.935484 34.722222
10 VD_math 56.481481 25.000000 27.777778
11 VD_ocr 78.651685 58.139535 58.139535
```
## TextVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data TextVQA_VAL --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:41:32,873 - Evaluation - INFO - VQA Eval Finished. Saved to ./InternVL-Chat-V1-5/InternVL-Chat-V1-5_TextVQA_VAL_acc.csv.
2024-04-29 18:41:32,873 - Evaluation - INFO - Overall
0 80.488
```
## ChartQA_TEST
```
torchrun --nproc-per-node=8 run.py --data ChartQA_TEST --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:44:05,458 - Evaluation - INFO - VQA Eval Finished. Saved to ./InternVL-Chat-V1-5/InternVL-Chat-V1-5_ChartQA_TEST_acc.csv.
2024-04-29 18:44:05,458 - Evaluation - INFO - test_human test_augmented Overall
0 73.04 94.32 83.68
```
## AI2D_TEST
```
torchrun --nproc-per-node=8 run.py --data AI2D_TEST --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 19:02:17,402 - Evaluation - INFO - Score:
2024-04-29 19:02:17,402 - Evaluation - INFO - split Overall atomStructure ... typesOf volcano waterCNPCycle
0 none 0.806995 0.75 ... 0.752187 1.0 0.727273
```
## LLaVABench
```
torchrun --nproc-per-node=8 run.py --data LLaVABench --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
Processing ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0:00:00 60/60 100%
split Relative Score (main) VLM Score GPT4 Score
0 overall 82.0 63.7 77.7
1 conv 82.9 74.1 89.4
2 detail 72.0 48.0 66.7
3 complex 86.0 65.7 76.4
```
## DocVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data DocVQA_VAL --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 19:18:54,661 - Evaluation - INFO - VQA Eval Finished. Saved to ./InternVL-Chat-V1-5/InternVL-Chat-V1-5_DocVQA_VAL_acc.csv.
2024-04-29 19:18:54,661 - Evaluation - INFO - val Overall
0 90.500323 90.500323
```
## InfoVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data InfoVQA_VAL --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:44:50,851 - Evaluation - INFO - VQA Eval Finished. Saved to ./InternVL-Chat-V1-5/InternVL-Chat-V1-5_InfoVQA_VAL_acc.csv.
2024-04-29 18:44:50,851 - Evaluation - INFO - val Overall
0 71.920408 71.920408
```
## OCRBench
```
torchrun --nproc-per-node=8 run.py --data OCRBench --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:56:05,437 - Evaluation - INFO - Score:
2024-04-29 18:56:05,437 - Evaluation - INFO - Text Recognition:238
2024-04-29 18:56:05,437 - Evaluation - INFO - Scene Text-centric VQA:178
2024-04-29 18:56:05,437 - Evaluation - INFO - Doc-oriented VQA:151
2024-04-29 18:56:05,438 - Evaluation - INFO - Key Information Extraction:153
2024-04-29 18:56:05,438 - Evaluation - INFO - Handwritten Mathematical Expression Recognition:4
2024-04-29 18:56:05,438 - Evaluation - INFO - Final Score:724
2024-04-29 18:56:05,438 - Evaluation - INFO - Final Score Norm:72.4
```
## RealWorldQA
```
torchrun --nproc-per-node=8 run.py --data RealWorldQA --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-04-29 18:56:43,192 - Evaluation - INFO - Score:
2024-04-29 18:56:43,192 - Evaluation - INFO - split Overall
0 none 0.660131
```
## SEEDBench2_Plus
```
torchrun --nproc-per-node=8 run.py --data SEEDBench2_Plus --model InternVL-Chat-V1-5 --verbose
```
The result is:
```
2024-05-29 12:41:47,313 - Evaluation - INFO - split Overall chart map web
0 none 0.666227 0.650617 0.574969 0.79697
```
# How to Evaluate Mini-InternVL-Chat-2B-V1-5 using VLMEvalKit?
In this tutorial, we will provide a detailed guide on how to evaluate Mini-InternVL-Chat-2B-V1-5 using VLMEvalKit.
First of all, please follow this [guide](https://github.com/open-compass/VLMEvalKit/blob/main/docs/en/Quickstart.md) to install VLMEvalKit.
## MMBench_DEV_EN
```
torchrun --nproc-per-node=8 run.py --data MMBench_DEV_EN --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:38:26,074 - Evaluation - INFO - split Overall ... spatial_relationship structuralized_imagetext_understanding
0 dev 0.706186 ... 0.266667 0.423077
```
## MMBench_DEV_CN
```
torchrun --nproc-per-node=8 run.py --data MMBench_DEV_CN --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:38:10,864 - Evaluation - INFO - split Overall ... spatial_relationship structuralized_imagetext_understanding
0 dev 0.656357 ... 0.222222 0.307692
```
## MMStar
```
torchrun --nproc-per-node=8 run.py --data MMStar --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:38:37,502 - Evaluation - INFO - split Overall ... math science & technology
0 none 0.461333 ... 0.448 0.372
```
## MME
```
torchrun --nproc-per-node=8 run.py --data MME --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:38:42,360 - Evaluation - INFO - perception reasoning OCR ... posters scene text_translation
0 1475.888655 423.928571 147.5 ... 130.952381 151.0 170.0
```
## SEEDBench_IMG
```
torchrun --nproc-per-node=8 run.py --data SEEDBench_IMG --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:39:49,107 - Evaluation - INFO - split Overall ... Text Understanding Visual Reasoning
0 none 0.694491 ... 0.690476 0.731118
```
## MMVet
```
torchrun --nproc-per-node=8 run.py --data MMVet --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:38:53,665 - Evaluation - INFO - Category tot acc
0 rec 187 42.352941
1 ocr 108 42.500000
2 know 84 20.357143
3 gen 80 22.375000
4 spat 75 42.533333
5 math 26 18.461538
6 Overall 218 38.256881
```
Note that because the version of GPT used for scoring differs from that of the official server, the scores obtained with VLMEvalKit may differ slightly.
## MMMU_DEV_VAL
```
torchrun --nproc-per-node=8 run.py --data MMMU_DEV_VAL --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:39:23,721 - Evaluation - INFO - split Overall ... Science Tech & Engineering
0 dev 0.353333 ... 0.240000 0.342857
1 validation 0.376667 ... 0.286667 0.376190
```
## MathVista_MINI
```
torchrun --nproc-per-node=8 run.py --data MathVista_MINI --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
0 Overall 1000 520 411 52.000000 41.100000
1 scientific reasoning 122 91 54 74.590164 44.262295
2 textbook question answering 158 100 72 63.291139 45.569620
3 numeric commonsense 144 41 45 28.472222 31.250000
4 arithmetic reasoning 353 108 129 30.594901 36.543909
5 visual question answering 179 94 69 52.513966 38.547486
6 geometry reasoning 239 158 85 66.108787 35.564854
7 algebraic reasoning 281 180 104 64.056940 37.010676
8 geometry problem solving 208 149 76 71.634615 36.538462
9 math word problem 186 27 68 14.516129 36.559140
10 logical reasoning 37 24 4 64.864865 10.810811
11 figure question answering 269 150 126 55.762082 46.840149
12 statistical reasoning 301 139 159 46.179402 52.823920
```
Note that because the GPT version used for answer extraction differs from the one used in the official code, the scores reported by VLMEvalKit may differ slightly from the official results.
## ScienceQA_TEST
```
torchrun --nproc-per-node=8 run.py --data ScienceQA_TEST --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:42:24,271 - Evaluation - INFO - split Overall ... Weather and climate World religions
0 test 0.852256 ... 0.810345 0.0
```
## HallusionBench
```
torchrun --nproc-per-node=8 run.py --data HallusionBench --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:41:40,703 - Evaluation - INFO - split aAcc fAcc qAcc
0 Overall 59.411146 24.277457 28.791209
1 VS 61.111111 17.241379 34.269663
2 VD 58.375635 27.826087 25.270758
3 VD_illusion 57.638889 22.580645 19.444444
4 VS_ocr 53.703704 15.384615 11.111111
5 VS_map 54.687500 9.090909 15.625000
6 VS_chart 66.153846 15.000000 50.000000
7 VS_table 62.500000 28.571429 34.883721
8 VD_math 58.333333 11.111111 31.481481
9 VD_video 47.058824 12.500000 8.695652
10 VD_figure 65.000000 41.463415 28.205128
11 VD_ocr 75.280899 53.488372 51.162791
```
## TextVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data TextVQA_VAL --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:45:09,563 - Evaluation - INFO - Overall
0 70.452
```
## ChartQA_TEST
```
torchrun --nproc-per-node=8 run.py --data ChartQA_TEST --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:43:24,645 - Evaluation - INFO - test_augmented test_human Overall
0 91.68 54.88 73.28
```
## AI2D_TEST
```
torchrun --nproc-per-node=8 run.py --data AI2D_TEST --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:43:12,535 - Evaluation - INFO - split Overall atomStructure ... typesOf volcano waterCNPCycle
0 none 0.699482 0.625 ... 0.61516 0.6875 0.477273
```
## LLaVABench
```
torchrun --nproc-per-node=8 run.py --data LLaVABench --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
Processing ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0:00:00 60/60 100%
split Relative Score (main) VLM Score GPT4 Score
0 overall 61.0 47.7 78.2
1 complex 68.4 52.5 76.8
2 conv 59.9 53.5 89.4
3 detail 47.1 32.0 68.0
```
## DocVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data DocVQA_VAL --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:47:40,385 - Evaluation - INFO - val Overall
0 83.883006 83.883006
```
## InfoVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data InfoVQA_VAL --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:46:53,303 - Evaluation - INFO - val Overall
0 55.86691 55.86691
```
## OCRBench
```
torchrun --nproc-per-node=8 run.py --data OCRBench --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-24 23:45:30,929 - Evaluation - INFO - Score:
2024-05-24 23:45:30,929 - Evaluation - INFO - Text Recognition:222
2024-05-24 23:45:30,929 - Evaluation - INFO - Scene Text-centric VQA:163
2024-05-24 23:45:30,929 - Evaluation - INFO - Doc-oriented VQA:125
2024-05-24 23:45:30,929 - Evaluation - INFO - Key Information Extraction:139
2024-05-24 23:45:30,929 - Evaluation - INFO - Handwritten Mathematical Expression Recognition:5
2024-05-24 23:45:30,929 - Evaluation - INFO - Final Score:654
2024-05-24 23:45:30,929 - Evaluation - INFO - Final Score Norm:65.4
```
## RealWorldQA
```
torchrun --nproc-per-node=8 run.py --data RealWorldQA --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-25 00:13:34,645 - Evaluation - INFO - split Overall
0 none 0.579085
```
## SEEDBench2_Plus
```
torchrun --nproc-per-node=8 run.py --data SEEDBench2_Plus --model Mini-InternVL-Chat-2B-V1-5 --verbose
```
The result is:
```
2024-05-29 12:31:50,587 - Evaluation - INFO - split Overall chart map web
0 none 0.588933 0.562963 0.482032 0.751515
```
# How to Evaluate Mini-InternVL-Chat-4B-V1-5 using VLMEvalKit?
In this tutorial, we will provide a detailed guide on how to evaluate Mini-InternVL-Chat-4B-V1-5 using VLMEvalKit.
First of all, please follow this [guide](https://github.com/open-compass/VLMEvalKit/blob/main/docs/en/Quickstart.md) to install VLMEvalKit.
## MMBench_DEV_EN
```
torchrun --nproc-per-node=8 run.py --data MMBench_DEV_EN --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:01:07,750 - Evaluation - INFO - split Overall ... spatial_relationship structuralized_imagetext_understanding
0 dev 0.764605 ... 0.355556 0.551282
```
## MMBench_DEV_CN
```
torchrun --nproc-per-node=8 run.py --data MMBench_DEV_CN --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:11:49,747 - Evaluation - INFO - split Overall ... spatial_relationship structuralized_imagetext_understanding
0 dev 0.699313 ... 0.244444 0.512821
```
## MMStar
```
torchrun --nproc-per-node=8 run.py --data MMStar --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:02:01,943 - Evaluation - INFO - split Overall ... math science & technology
0 none 0.527333 ... 0.516 0.408
```
## MME
```
torchrun --nproc-per-node=8 run.py --data MME --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:06:31,735 - Evaluation - INFO - perception reasoning OCR ... posters scene text_translation
0 1569.933774 492.857143 147.5 ... 153.061224 153.25 162.5
```
## SEEDBench_IMG
```
torchrun --nproc-per-node=8 run.py --data SEEDBench_IMG --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:17:50,620 - Evaluation - INFO - split Overall ... Text Understanding Visual Reasoning
0 none 0.721684 ... 0.559524 0.779456
```
## MMVet
```
torchrun --nproc-per-node=8 run.py --data MMVet --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 11:04:32,615 - Evaluation - INFO - Category tot acc
0 rec 187 43.636364
1 ocr 108 47.037037
2 know 84 26.904762
3 gen 80 27.625000
4 spat 75 43.066667
5 math 26 34.230769
6 Overall 218 41.972477
```
Note that because the GPT version used for scoring differs from the one used by the official server, the scores reported by VLMEvalKit may differ slightly from the official results.
## MMMU_DEV_VAL
```
torchrun --nproc-per-node=8 run.py --data MMMU_DEV_VAL --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:16:48,300 - Evaluation - INFO - split Overall ... Science Tech & Engineering
0 validation 0.457778 ... 0.4 0.404762
1 dev 0.480000 ... 0.4 0.371429
```
## MathVista_MINI
```
torchrun --nproc-per-node=8 run.py --data MathVista_MINI --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:56:36,722 - Evaluation - INFO - Task&Skill tot prefetch hit prefetch_rate acc
0 Overall 1000 560 537 56.000000 53.700000
1 scientific reasoning 122 86 71 70.491803 58.196721
2 textbook question answering 158 99 89 62.658228 56.329114
3 numeric commonsense 144 41 41 28.472222 28.472222
4 arithmetic reasoning 353 134 180 37.960340 50.991501
5 visual question answering 179 94 88 52.513966 49.162011
6 geometry reasoning 239 161 118 67.364017 49.372385
7 algebraic reasoning 281 189 139 67.259786 49.466192
8 geometry problem solving 208 153 105 73.557692 50.480769
9 math word problem 186 52 96 27.956989 51.612903
10 logical reasoning 37 18 4 48.648649 10.810811
11 figure question answering 269 162 159 60.223048 59.107807
12 statistical reasoning 301 177 203 58.803987 67.441860
```
Note that because the GPT version used for answer extraction differs from the one used in the official code, the scores reported by VLMEvalKit may differ slightly from the official results.
## ScienceQA_TEST
```
torchrun --nproc-per-node=8 run.py --data ScienceQA_TEST --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:14:56,970 - Evaluation - INFO - split Overall ... Weather and climate World religions
0 test 0.927119 ... 0.948276 1.0
```
## HallusionBench
```
torchrun --nproc-per-node=8 run.py --data HallusionBench --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:24:07,079 - Evaluation - INFO - split aAcc fAcc qAcc
0 Overall 62.460568 33.236994 32.747253
1 VD 58.544839 31.739130 24.187726
2 VS 68.888889 36.206897 46.067416
3 VD_illusion 59.027778 30.645161 23.611111
4 VD_math 55.555556 25.000000 27.777778
5 VS_ocr 61.111111 34.615385 25.925926
6 VD_figure 62.500000 41.463415 23.076923
7 VD_ocr 74.157303 51.162791 48.837209
8 VS_map 54.687500 27.272727 15.625000
9 VS_chart 76.153846 40.000000 64.473684
10 VS_table 72.321429 39.285714 48.837209
11 VD_video 50.000000 12.500000 7.246377
```
## TextVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data TextVQA_VAL --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:26:03,441 - Evaluation - INFO - Overall
0 72.886
```
## ChartQA_TEST
```
torchrun --nproc-per-node=8 run.py --data ChartQA_TEST --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:28:40,199 - Evaluation - INFO - test_augmented test_human Overall
0 93.2 68.8 81.0
```
## AI2D_TEST
```
torchrun --nproc-per-node=8 run.py --data AI2D_TEST --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:29:54,674 - Evaluation - INFO - split Overall atomStructure ... typesOf volcano waterCNPCycle
0 none 0.769754 0.875 ... 0.720117 0.875 0.5
```
## LLaVABench
```
torchrun --nproc-per-node=8 run.py --data LLaVABench --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
Processing ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0:00:00 60/60 100%
split Relative Score (main) VLM Score GPT4 Score
0 overall 68.3 49.7 72.7
1 complex 77.5 55.4 71.4
2 detail 67.0 40.7 60.7
3 conv 56.6 48.2 85.3
```
## DocVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data DocVQA_VAL --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:40:18,175 - Evaluation - INFO - val Overall
0 86.635414 86.635414
```
## InfoVQA_VAL
```
torchrun --nproc-per-node=8 run.py --data InfoVQA_VAL --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:49:18,835 - Evaluation - INFO - val Overall
0 64.588708 64.588708
```
## OCRBench
```
torchrun --nproc-per-node=8 run.py --data OCRBench --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:31:09,980 - Evaluation - INFO - Score:
2024-05-29 04:31:09,980 - Evaluation - INFO - Text Recognition:194
2024-05-29 04:31:09,980 - Evaluation - INFO - Scene Text-centric VQA:160
2024-05-29 04:31:09,980 - Evaluation - INFO - Doc-oriented VQA:145
2024-05-29 04:31:09,980 - Evaluation - INFO - Key Information Extraction:133
2024-05-29 04:31:09,980 - Evaluation - INFO - Handwritten Mathematical Expression Recognition:6
2024-05-29 04:31:09,980 - Evaluation - INFO - Final Score:638
2024-05-29 04:31:09,980 - Evaluation - INFO - Final Score Norm:63.8
```
## RealWorldQA
```
torchrun --nproc-per-node=8 run.py --data RealWorldQA --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 04:34:10,091 - Evaluation - INFO - split Overall
0 none 0.601307
```
## SEEDBench2_Plus
```
torchrun --nproc-per-node=8 run.py --data SEEDBench2_Plus --model Mini-InternVL-Chat-4B-V1-5 --verbose
```
The result is:
```
2024-05-29 12:33:20,074 - Evaluation - INFO - split Overall chart map web
0 none 0.625823 0.616049 0.537794 0.745455
```
# How to Fine-tune InternVL-Chat-V1-2 on a Custom Dataset
## 1. Prepare the Pre-trained Model
Before starting the second fine-tuning process, download the pre-trained model we provide. Two versions are available: [InternVL-Chat-V1-2](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-2) and [InternVL-Chat-V1-2-Plus](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-2-Plus). We recommend downloading the Plus version.
Use the following commands to download the desired model:
```shell
cd pretrained/
# pip install -U huggingface_hub
# Download OpenGVLab/InternVL-Chat-V1-2
huggingface-cli download --resume-download --local-dir-use-symlinks False OpenGVLab/InternVL-Chat-V1-2 --local-dir InternVL-Chat-V1-2
# Download OpenGVLab/InternVL-Chat-V1-2-Plus
huggingface-cli download --resume-download --local-dir-use-symlinks False OpenGVLab/InternVL-Chat-V1-2-Plus --local-dir InternVL-Chat-V1-2-Plus
```
## 2. Prepare Your Customized Training Data
After downloading the pre-trained model, prepare your customized SFT (Supervised Fine-Tuning) data. Create a JSON file in `internvl_chat/shell/data/` similar to [this example](./shell/data/internvl_1_2_finetune.json).
The format for the JSON file should be:
```json
{
"your-custom-dataset-1": {
"root": "path/to/the/image/",
"annotation": "path/to/the/jsonl/annotation",
"data_augment": false,
"repeat_time": 1,
"length": "number of your data"
},
...
}
```
Example:
```json
{
"sharegpt4v_instruct_gpt4-vision_cap100k": {
"root": "playground/data/",
"annotation": "playground/opensource/sharegpt4v_instruct_gpt4-vision_cap100k.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 102025
}
}
```
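The `length` field should match the number of samples in the corresponding JSONL file. Assuming one JSON object per line, a simple line count gives the value, for example:
```shell
# Each JSONL line is one sample, so the line count is the value for "length"
wc -l < playground/opensource/sharegpt4v_instruct_gpt4-vision_cap100k.jsonl
```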
## 3. Start Fine-tuning
Fine-tune the pre-trained models using either the [script for training the full LLM](../internvl_chat/shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune_continue.sh) or the [script for training the LoRA adapter](../internvl_chat/shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune_continue_lora.sh), depending on your available GPU resources.
Before fine-tuning, set the `--meta_path` to the path of the JSON file you created in the previous step. The default pre-trained model path in these shell scripts is `./pretrained/InternVL-Chat-V1-2`. Update it to `./pretrained/InternVL-Chat-V1-2-Plus` if you are using the Plus version.
> Note: Fine-tuning the full LLM requires 16 A100 80G GPUs, whereas fine-tuning the LoRA requires 2 A100 80G GPUs.
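If you are using the Plus version, one way to update the default pre-trained path is an in-place substitution over the chosen script; this is only a sketch, and editing the script by hand (including `--meta_path`) works just as well:
```shell
# Switch the default pre-trained path to the Plus weights (run once, or edit the script manually)
sed -i 's|\./pretrained/InternVL-Chat-V1-2|./pretrained/InternVL-Chat-V1-2-Plus|g' \
    shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune_continue.sh
```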
Commands for fine-tuning:
```sh
# Using 16 GPUs with SLURM system, fine-tune the full LLM
PARTITION='your partition' GPUS=16 sh shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune_continue.sh
# Using 2 GPUs, fine-tune the LoRA
CUDA_VISIBLE_DEVICES=0,1 sh shell/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune_continue_lora.sh
```
If you encounter any issues, please let us know, and we will update the training guide to improve its usability.
# How to Fine-tune the Mini-InternVL-Chat Series on a Custom Dataset
## 1. Prepare the Pre-trained Model
Before starting the second fine-tuning process, download the models we released. We provide two models: [Mini-InternVL-Chat-2B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5) and [Mini-InternVL-Chat-4B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-4B-V1-5).
Use the following commands to download the desired model:
```bash
huggingface-cli download --resume-download --local-dir-use-symlinks False OpenGVLab/Mini-InternVL-Chat-2B-V1-5 --local-dir path/to/Mini-InternVL-Chat-2B-V1-5
huggingface-cli download --resume-download --local-dir-use-symlinks False OpenGVLab/Mini-InternVL-Chat-4B-V1-5 --local-dir path/to/Mini-InternVL-Chat-4B-V1-5
```
## 2. Prepare Datasets
### Prepare Released Training Datasets
Refer to [this link](../internvl_chat#prepare-training-datasets) for details on preparing released training datasets.
### Prepare Your Customized Data
Create a JSONL file with annotations for your custom data in the following format:
```json
{"id": 0, "image": "image path relative to dataset path", "conversations": [{"from": "human", "value": "<image>\nyour question"}, {"from": "gpt", "value": "response"}]}
```
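For instance, a minimal two-sample annotation file could be created as follows (the directory name and image paths are placeholders; use your own layout):
```bash
# Hypothetical dataset location; adjust the paths to your own data
mkdir -p playground/data/my_dataset
cat > playground/data/my_dataset/annotation.jsonl << 'EOF'
{"id": 0, "image": "images/0001.jpg", "conversations": [{"from": "human", "value": "<image>\nDescribe the image."}, {"from": "gpt", "value": "A cat is sitting on a sofa."}]}
{"id": 1, "image": "images/0002.jpg", "conversations": [{"from": "human", "value": "<image>\nHow many people are in the image?"}, {"from": "gpt", "value": "Two."}]}
EOF
```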
If you want to train with your customized SFT data, merge your data with our [internvl_1_2_finetune](../internvl_chat/shell/data/internvl_1_2_finetune.json) data by adding your data's metadata to our [JSON file](../internvl_chat/shell/data/internvl_1_2_finetune.json). The format for organizing this JSON file is:
```json
{
"sharegpt4v_instruct_gpt4-vision_cap100k": {
"root": "playground/data/",
"annotation": "playground/opensource/sharegpt4v_instruct_gpt4-vision_cap100k.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 102025
},
"llava_instruct_150k_zh": {
"root": "playground/data/coco/",
"annotation": "playground/opensource/llava_instruct_150k_zh.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 157712
},
"sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k": {
"root": "playground/data/",
"annotation": "playground/opensource/sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 665058
},
"dvqa_train_200k": {
"root": "playground/data/dvqa/",
"annotation": "playground/opensource/dvqa_train_200k.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 200000
},
"chartqa_train_18k": {
"root": "playground/data/chartqa/",
"annotation": "playground/opensource/chartqa_train_18k.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 18317
},
"ai2d_train_12k": {
"root": "playground/data/ai2d/",
"annotation": "playground/opensource/ai2d_train_12k.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 12413
},
"docvqa_train_10k": {
"root": "playground/data/docvqa/",
"annotation": "playground/opensource/docvqa_train_10k.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 10211
},
"geoqa+": {
"root": "playground/data/geoqa+/",
"annotation": "playground/opensource/geoqa+.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 72318
},
"synthdog_en": {
"root": "playground/data/synthdog-en/",
"annotation": "playground/opensource/synthdog_en.jsonl",
"data_augment": false,
"repeat_time": 1,
"length": 29765
},
"your_new_dataset": {
"root": "path/to/images",
"annotation": "path/to/annotation_file",
"data_augment": false,
"repeat_time": 1,
"length": 499712
}
}
```
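Before launching training, it can help to verify that every `root` directory and `annotation` file referenced in the meta JSON exists. A possible sanity check, assuming `jq` is installed and using a hypothetical meta file name, is:
```bash
META=shell/data/internvl_1_2_finetune_custom.json  # hypothetical name; point this at your own meta file
jq -r 'to_entries[] | "\(.value.root)\t\(.value.annotation)"' "$META" |
while IFS=$'\t' read -r root ann; do
    [ -d "$root" ] || echo "Missing image root: $root"
    [ -f "$ann" ]  || echo "Missing annotation file: $ann"
done
```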
## 3. Start Fine-tuning
Fine-tune the released models using either the [script for Mini-InternVL-Chat-2B-V1-5](./internvl_chat/shell/internlm2_1_8b_dynamic/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune.sh) or the [script for Mini-InternVL-Chat-4B-V1-5](./internvl_chat/shell/phi3_3_8b_dynamic/internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune.sh). Set the `--meta_path` to the path of the JSON file you created in the last step and update `--model_name_or_path` in these shell scripts to `path/to/Mini-InternVL-Chat-2B-V1-5` or `path/to/Mini-InternVL-Chat-4B-V1-5`.
```bash
# Using 16 GPUs with SLURM system, fine-tune the full LLM
cd internvl_chat/
# Mini-InternVL-Chat-2B-V1-5
PARTITION='your partition' GPUS=16 sh shell/internlm2_1_8b_dynamic/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune.sh
# Mini-InternVL-Chat-4B-V1-5
PARTITION='your partition' GPUS=16 sh shell/phi3_3_8b_dynamic/internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune.sh
```
If you see the following log in the terminal, it means the training has started successfully:
![Training Started Successfully](https://github.com/G-z-w/InternVL/assets/95175307/d66a2c40-be4c-42c8-babf-052621d2995e)
For a complete example training log, refer to [this link](./training_log.txt).
## 4. Evaluate
Refer to [this link](./document/How_to_evaluate_internvl_chat_v1_5.md) for evaluation details.
# How to Use the InternVL API?
## 1. Official API of InternVL2
We encourage everyone to use our API for research. For better management, please submit the [English application form](https://docs.google.com/forms/d/e/1FAIpQLSfMCzhPr1OOEKau_6jwTU0EiZMSFckDo-HMlc_hUudhF_97rw/viewform?usp=sf_link) or the [Chinese application form](https://wj.qq.com/s2/14910502/25a4/) to obtain free API access.
## 2. Community-Hosted API of InternVL 1.5
https://rapidapi.com/adushar1320/api/internvl-chat
## 3. Examples
TBD
set -x
GPUS=${GPUS:-8}
BATCH_SIZE=${BATCH_SIZE:-16}
PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2}
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))
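# With the defaults above, GRADIENT_ACC = 16 / 2 / 8 = 1, so the effective
# global batch size is PER_DEVICE_BATCH_SIZE x GPUS x GRADIENT_ACC = 16.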
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export MASTER_PORT=34229
export TF_CPP_MIN_LOG_LEVEL=3
export LAUNCHER=pytorch
OUTPUT_DIR='/home/wanglch/projects/saves/InternVL/internvl2-26b/finetune_multi_dcu'
if [ ! -d "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
fi
# number of gpus: 8
# batch size per gpu: 2
# gradient accumulation steps: 1
# total batch size: 16
# epoch: 1
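# freeze_llm, freeze_mlp, and freeze_backbone are all True while use_llm_lora is enabled,
# so only the LoRA adapters injected into the LLM are trained; the base LLM, MLP projector,
# and vision backbone stay frozen.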
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
internvl/train/internvl_chat_finetune.py \
--model_name_or_path "/home/wanglch/projects/InternVL/InternVL2-26B" \
--conv_style "internlm2-chat" \
--output_dir ${OUTPUT_DIR} \
--meta_path "/home/wanglch/projects/InternVL/internvl_chat/shell/data/internvl_1_2_finetune_custom.json" \
--overwrite_output_dir True \
--force_image_size 448 \
--max_dynamic_patch 12 \
--down_sample_ratio 0.5 \
--drop_path_rate 0.0 \
--freeze_llm True \
--freeze_mlp True \
--freeze_backbone True \
--use_llm_lora 16 \
--vision_select_layer -1 \
--dataloader_num_workers 8 \
--fp16 True \
--num_train_epochs 1 \
--per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \
--gradient_accumulation_steps ${GRADIENT_ACC} \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 200 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0.05 \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--max_seq_length 4096 \
--do_train True \
--grad_checkpoint True \
--group_by_length True \
--dynamic_image_size True \
--use_thumbnail True \
--ps_version 'v2' \
--deepspeed "/home/wanglch/projects/InternVL/internvl_chat/zero_stage3_config.json" \
--report_to "tensorboard" \
2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"