import argparse
import itertools
import json
import os
import random
import re
import time
from functools import partial
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from torchvision.ops.boxes import box_area
from tqdm import tqdm
ds_collections = {
'refcoco_val': 'data/refcoco/refcoco_val.jsonl',
'refcoco_testA': 'data/refcoco/refcoco_testA.jsonl',
'refcoco_testB': 'data/refcoco/refcoco_testB.jsonl',
'refcoco+_val': 'data/refcoco/refcoco+_val.jsonl',
'refcoco+_testA': 'data/refcoco/refcoco+_testA.jsonl',
'refcoco+_testB': 'data/refcoco/refcoco+_testB.jsonl',
'refcocog_val': 'data/refcoco/refcocog_val.jsonl',
'refcocog_test': 'data/refcoco/refcocog_test.jsonl',
}
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
area2 = box_area(boxes2)
lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
wh = (rb - lt).clamp(min=0) # [N,M,2]
inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
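# Illustrative usage of box_iou (not part of the evaluation flow): pairwise IoU
# between one predicted box and one ground-truth box, both in absolute xyxy pixels.
#   pred = torch.tensor([[10., 10., 110., 110.]])
#   gt = torch.tensor([[20., 20., 120., 120.]])
#   iou, union = box_iou(pred, gt)  # iou.shape == (1, 1), iou.item() ~ 0.68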
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
texts = [_['text'] for _ in batches]
bboxes = [_['bbox'] for _ in batches]
hws = [_['hw'] for _ in batches]
return pixel_values, texts, bboxes, hws
class RefCOCODataset(torch.utils.data.Dataset):
def __init__(self, test, prompt, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
self.datas = open(test).readlines()
self.prompt = prompt
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.datas)
def __getitem__(self, idx):
data = json.loads(self.datas[idx].strip())
image = data['image']
text = data['sent']
bbox = data['bbox']
w, h = data['width'], data['height']
image = Image.open(image).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return {
'text': self.prompt.format(text),
'pixel_values': pixel_values,
'bbox': bbox,
'hw': (h, w),
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
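# Sharding example (illustrative): with total_size=10 and world_size=4, the shard sizes
# are [3, 3, 2, 2], so rank 0 gets indices 0-2, rank 1 gets 3-5, rank 2 gets 6-7, and
# rank 3 gets 8-9; every sample is evaluated exactly once across ranks.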
def evaluate_chat_model():
print('prompt:', prompt)
random.seed(args.seed)
summaries = []
for ds_name in args.datasets:
dataset = RefCOCODataset(
test=ds_collections[ds_name],
prompt=prompt,
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for _, (pixel_values, questions, bboxes, hws) in enumerate(tqdm(dataloader)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=100,
min_new_tokens=1,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config,
verbose=True
)
answers = [pred]
for bbox, hw, answer in zip(bboxes, hws, answers):
outputs.append({
'answer': answer,
'gt_bbox': bbox,
'hw': hw,
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, outputs)
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.json'
results_file = os.path.join(args.out_dir, results_file)
json.dump(merged_outputs, open(results_file, 'w'))
correct = total_cnt = 0
for i, output in enumerate(merged_outputs):
predict_bbox = re.findall(PATTERN, output['answer'])
try:
predict_bbox = (float(predict_bbox[0][0]), float(predict_bbox[0][1]), float(predict_bbox[0][2]),
float(predict_bbox[0][3]))
                except Exception:
predict_bbox = (0., 0., 0., 0.)
target_bbox = torch.tensor(output['gt_bbox'],
dtype=torch.float32).view(-1, 4)
predict_bbox = torch.tensor(predict_bbox,
dtype=torch.float32).view(-1, 4)
if predict_bbox.sum() >= 4:
predict_bbox = predict_bbox / 1000
predict_bbox[:, 0::2] *= output['hw'][1]
predict_bbox[:, 1::2] *= output['hw'][0]
iou, _ = box_iou(predict_bbox, target_bbox)
iou = iou.item()
total_cnt += 1
if iou >= 0.5:
correct += 1
print(f'Evaluating {ds_name} ...')
print(f'Precision @ 1: {correct / total_cnt} \n')
summaries.append([args.checkpoint, ds_name, f'Precision @ 1: {correct / total_cnt} \n'])
torch.distributed.barrier()
out_path = '_'.join(args.checkpoint.split('/')[-2:])
writer = open(os.path.join(args.out_dir, f'{out_path}.txt'), 'a')
print(f"write results to file {os.path.join(args.out_dir, f'{out_path}.txt')}")
for summary in summaries:
print(summary)
writer.write(f'{summary}\n')
writer.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='refcoco_val,refcoco_testA,refcoco_testB,'
'refcoco+_val,refcoco+_testA,refcoco+_testB,'
'refcocog_val,refcocog_test')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--out-dir', type=str, default='results')
    parser.add_argument('--sample', action='store_true')
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
PATTERN = re.compile(r'\[*\[(.*?),(.*?),(.*?),(.*?)\]\]*')
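    # The model is expected to answer with a bracketed box such as '[[120, 340, 560, 780]]';
    # PATTERN captures the four numbers, which the evaluation loop treats as 0-1000-normalized
    # coordinates and rescales to pixel units using the stored image height/width.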
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
prompt = 'Please provide the bounding box coordinate of the region this sentence describes: <ref>{}</ref>'
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
print(f'[test] max_num: {args.max_num}')
evaluate_chat_model()
# README for Evaluation
## 🌟 Overview
This script provides an evaluation pipeline for `ScienceQA`.
## 🗂️ Data Preparation
Before starting to download the data, please create the `InternVL/internvl_chat/data` folder.
### ScienceQA
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/scienceqa/images && cd data/scienceqa/images
# Step 2: Download images
wget https://scienceqa.s3.us-west-1.amazonaws.com/images/test.zip && unzip test.zip
cd ..
# Step 3: Download original questions
wget https://raw.githubusercontent.com/lupantech/ScienceQA/main/data/scienceqa/problems.json
# Step 4: Download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/scienceqa/scienceqa_test_img.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/scienceqa
├── images
├── problems.json
└── scienceqa_test_img.jsonl
```
## 🏃 Evaluation Execution
> ⚠️ Note: For testing InternVL (1.5, 2.0, 2.5, and later versions), always enable `--dynamic` to perform dynamic resolution testing.
To run the evaluation, execute the following command on an 8-GPU setup:
```shell
torchrun --nproc_per_node=8 eval/scienceqa/evaluate_scienceqa.py --checkpoint ${CHECKPOINT} --dynamic
```
Alternatively, you can run the following simplified command:
```shell
GPUS=8 sh evaluate.sh ${CHECKPOINT} scienceqa --dynamic
```
### Arguments
The following arguments can be configured for the evaluation script:
| Argument | Type | Default | Description |
| ---------------- | ------ | ------------ | ----------------------------------------------------------------------------------------------------------------- |
| `--checkpoint` | `str` | `''` | Path to the model checkpoint. |
| `--datasets` | `str` | `'sqa_test'` | Comma-separated list of datasets to evaluate. |
| `--dynamic` | `flag` | `False` | Enables dynamic high resolution preprocessing. |
| `--max-num` | `int` | `6` | Maximum tile number for dynamic high resolution. |
| `--load-in-8bit` | `flag` | `False` | Loads the model weights in 8-bit precision. |
| `--auto` | `flag` | `False` | Automatically splits a large model across 8 GPUs when needed, useful for models too large to fit on a single GPU. |
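In addition to the arguments listed above, the evaluation script defines a `--cot` flag that switches to chain-of-thought prompting and extracts the letter following `Final answer:` from the response. An illustrative invocation (using the same checkpoint variable as above):
```shell
# Illustrative: ScienceQA evaluation with chain-of-thought prompting
torchrun --nproc_per_node=8 eval/scienceqa/evaluate_scienceqa.py --checkpoint ${CHECKPOINT} --dynamic --cot
```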
import argparse
import itertools
import json
import os
import random
import re
import time
from functools import partial
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from tqdm import tqdm
ds_collections = {
'sqa_test': {
'root': 'data/scienceqa/scienceqa_test_img.jsonl',
'max_new_tokens': 100,
'min_new_tokens': 1,
},
'm3cot_test': {
'root': 'data/M3CoT/test.jsonl',
'max_new_tokens': 100,
'min_new_tokens': 1,
},
}
COT_INSTRUCTION = (
'Your task is to answer the question below. '
"Give step by step reasoning before you answer, and when you're ready to answer, "
"please use the format \"Final answer: ..\""
'\n\n'
'Question:'
'\n\n'
'{question}'
)
def extract_answer(text):
match = re.search(r'(Final answer:|Answer:)\s*(.*)', text, re.IGNORECASE)
if match:
return match.group(2).strip()
return text
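# Illustrative: extract_answer('The area is largest in panel 2.\nFinal answer: B') returns 'B';
# when no 'Final answer:'/'Answer:' marker is present, the full text is returned unchanged.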
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
questions = [_['question'] for _ in batches]
answers = [_['answer'] for _ in batches]
image_paths = [_['image_path'] for _ in batches]
options = [_['option'] for _ in batches]
return pixel_values, questions, answers, image_paths, options
class ScienceQADataset(torch.utils.data.Dataset):
def __init__(self, root, prompt, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
f = open(root, 'r', encoding='utf-8')
self.data = [json.loads(line) for line in f.readlines()]
self.prompt = prompt
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
data = self.data[idx]
image_path = data['image']
hint = data['hint'] if data['hint'] else None
question = data['question']
choices = data['choices']
answer = data['answer']
choice_list = []
options = {}
multiple_choices = ['A', 'B', 'C', 'D', 'E']
for i, c in enumerate(choices):
choice_list.append('{}. {}'.format(multiple_choices[i], c))
options[multiple_choices[i]] = c
choice_txt = '\n'.join(choice_list)
image = Image.open(image_path).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
if hint is not None:
question = hint + '\n' + question
question += '\n' + choice_txt
question += '\n' + self.prompt
return {
'question': question.strip(),
'pixel_values': pixel_values,
'answer': multiple_choices[answer],
'image_path': image_path,
'option': options
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
def post_process(pred, option):
pred = pred.strip()
option_candidate = list(option.keys())
if len(pred) == 1:
return pred
elif len(pred) > 1 and pred[0] in option_candidate:
return pred[0]
elif len(pred) > 1 and pred[0] not in option_candidate:
for k, v in option.items():
if v in pred:
return k
if len(pred) > 1 and pred[1] == '.':
pred = pred[0]
if len(pred) > 1 and pred[0] == '(' and pred[2] == ')':
pred = pred[1]
return pred
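# Illustrative: with option = {'A': 'magnet', 'B': 'copper'}, post_process('B. copper', option)
# returns 'B' (leading option letter), and post_process('B', option) returns 'B' directly.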
def evaluate_chat_model():
prompt = '' if args.cot else "Answer with the option's letter from the given choices directly."
random.seed(args.seed)
for ds_name in args.datasets:
dataset = ScienceQADataset(
root=ds_collections[ds_name]['root'],
prompt=prompt,
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
        for _, (pixel_values, questions, answers, image_paths, options) in enumerate(tqdm(dataloader)):
if args.cot:
questions = [COT_INSTRUCTION.format(question=q) for q in questions]
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'] if not args.cot else 4096,
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config,
verbose=True
)
pred_orig = pred
if args.cot:
pred = extract_answer(pred).strip()
preds = [post_process(pred, options[0])]
for question, pred, answer, image_path in zip(questions, preds, answers, image_paths):
outputs.append({
'question': question,
'answer': pred,
'answer_orig': pred_orig,
'gt_answers': answer,
'image_path': image_path
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.jsonl'
output_path = os.path.join(args.out_dir, results_file)
with open(output_path, 'w') as f:
for output in merged_outputs:
f.write(json.dumps(output) + '\n')
print('Results saved to {}'.format(output_path))
cnt = 0
for item in merged_outputs:
if item['answer'] == item['gt_answers']:
cnt += 1
print(f'Acc@1: {cnt / len(merged_outputs)}')
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='sqa_test')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
parser.add_argument('--cot', action='store_true')
args = parser.parse_args()
model_name = '_'.join(args.checkpoint.split('/')[-2:])
model_name = f'{model_name}_cot' if args.cot else model_name
args.out_dir = os.path.join(args.out_dir, model_name)
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
evaluate_chat_model()
# README for Evaluation
## 🌟 Overview
This script provides an evaluation pipeline for `SEED-Image`.
## 🗂️ Data Preparation
Before starting to download the data, please create the `InternVL/internvl_chat/data` folder.
### SEED-Image
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/SEED && cd data/SEED
# Step 2: Download the dataset
wget https://huggingface.co/OpenGVLab/InternVL/resolve/main/SEED-Bench-image.zip
unzip SEED-Bench-image.zip
wget https://huggingface.co/OpenGVLab/InternVL/resolve/main/seed.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/SEED
├── SEED-Bench-image
└── seed.jsonl
```
## 🏃 Evaluation Execution
> ⚠️ Note: For testing InternVL (1.5, 2.0, 2.5, and later versions), always enable `--dynamic` to perform dynamic resolution testing.
To run the evaluation, execute the following command on an 8-GPU setup:
```shell
torchrun --nproc_per_node=8 eval/seed/evaluate_seed.py --checkpoint ${CHECKPOINT} --dynamic
```
Alternatively, you can run the following simplified command:
```shell
GPUS=8 sh evaluate.sh ${CHECKPOINT} seed --dynamic
```
### Arguments
The following arguments can be configured for the evaluation script:
| Argument | Type | Default | Description |
| ---------------- | ------ | ---------- | ----------------------------------------------------------------------------------------------------------------- |
| `--checkpoint` | `str` | `''` | Path to the model checkpoint. |
| `--datasets` | `str` | `'SEEDv1'` | Comma-separated list of datasets to evaluate. |
| `--dynamic` | `flag` | `False` | Enables dynamic high resolution preprocessing. |
| `--max-num` | `int` | `6` | Maximum tile number for dynamic high resolution. |
| `--load-in-8bit` | `flag` | `False` | Loads the model weights in 8-bit precision. |
| `--auto` | `flag` | `False` | Automatically splits a large model across 8 GPUs when needed, useful for models too large to fit on a single GPU. |
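The memory-related flags can be combined with the standard command above; for example, an illustrative run that loads the weights in 8-bit precision:
```shell
# Illustrative: SEED-Image evaluation with 8-bit weights to reduce GPU memory usage
torchrun --nproc_per_node=8 eval/seed/evaluate_seed.py --checkpoint ${CHECKPOINT} --dynamic --load-in-8bit
```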
import argparse
import json
import os
parser = argparse.ArgumentParser()
parser.add_argument('--image_result_file', type=str, default='')
parser.add_argument('--anno_path', type=str, default='data/SEED/SEED-Bench.json')
args = parser.parse_args()
image_result_file = args.image_result_file
anno_path = args.anno_path
assert image_result_file.endswith('.jsonl')
def is_integer_string(s):
try:
int(s)
return True
except ValueError:
return False
def filter_questions(data, task='all'):
if task == 'image':
return [q for q in data if 1 <= q['question_type_id'] <= 9]
elif task == 'video':
return [q for q in data if 10 <= q['question_type_id'] <= 12]
elif task == 'all':
return data
elif is_integer_string(task):
return [q for q in data if q['question_type_id'] == int(task)]
else:
raise ValueError(f'Invalid task: {task}')
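# Illustrative: filter_questions(data, 'image') keeps entries with question_type_id 1-9,
# filter_questions(data, 'video') keeps 10-12, and filter_questions(data, '3') keeps only type 3.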
if __name__ == '__main__':
qa_anno = json.load(open(anno_path, 'rb'))
if 'questions' in qa_anno.keys():
question_type = qa_anno['question_type']
question_id_type = {v: k for k, v in question_type.items()}
qa_anno = qa_anno['questions']
qa_anno = filter_questions(qa_anno, 'all')
print(f'length: {len(qa_anno)}')
with open(image_result_file, 'r') as f:
image_result = [json.loads(line) for line in f.readlines()]
results = []
results.extend(image_result)
qa_id_anno = {}
for item in qa_anno:
question_id = str(item['question_id'])
qa_id_anno[question_id] = item
type_counts = {k: [] for k, v in question_id_type.items()}
for item in results:
pred, gt, question_id = item['prediction'], item['answer'], item['question_id']
question_id = str(question_id)
question_type = qa_id_anno[question_id]['question_type_id']
data_type = qa_id_anno[question_id]['data_type']
gt = qa_id_anno[question_id]['answer']
if len(pred) != 1:
pred = pred[0]
if pred == gt:
type_counts[question_type].append(1)
else:
type_counts[question_type].append(0)
print('Accuracy for each data type:')
total_count, image_count, video_count = 0, 0, 0
total_correct, image_correct, video_correct = 0, 0, 0
for data_type_id, result in type_counts.items():
accuracy = sum(result) / len(result) * 100
data_type = question_id_type[data_type_id]
print(f'Data type {data_type}: {accuracy:.2f}%')
total_count += len(result)
total_correct += sum(result)
if data_type_id >= 1 and data_type_id <= 9:
image_count += len(result)
image_correct += sum(result)
else:
video_count += len(result)
video_correct += sum(result)
total_accuracy = total_correct / total_count * 100
image_accuracy = image_correct / image_count * 100
video_accuracy = video_correct / video_count * 100
print(f'Total accuracy: {total_accuracy:.2f}%')
print(f'Image accuracy: {image_accuracy:.2f}%')
print(f'Video accuracy: {video_accuracy:.2f}%')
import argparse
import itertools
import json
import os
import random
import time
from functools import partial
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from tqdm import tqdm
ds_collections = {
'SEEDv1': {
'root': 'data/SEED/',
'annotation': 'data/SEED/seed.jsonl',
'max_new_tokens': 100,
'min_new_tokens': 1,
},
}
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
questions = [_['question'] for _ in batches]
answers = [_['answer'] for _ in batches]
indexes = [_['index'] for _ in batches]
return pixel_values, questions, answers, indexes
class MultipleChoiceDataset(torch.utils.data.Dataset):
def __init__(self, root, annotation, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
f = open(annotation, 'r', encoding='utf-8')
self.data = [json.loads(line) for line in f.readlines()]
self.root = root
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
data = self.data[idx]
question = data['text']
image_path = os.path.join(self.root, data['image'])
image = Image.open(image_path).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
answer = data['answer'] if 'answer' in data else None
return {
'question': question,
'pixel_values': pixel_values,
'answer': answer,
'index': data['question_id'],
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
def post_process(pred, option):
pred = pred.strip()
option_candidate = list(option.keys())
if len(pred) == 1:
return pred
elif len(pred) != 1 and pred[0] in option_candidate:
return pred[0]
elif len(pred) != 1 and pred[0] not in option_candidate:
for k, v in option.items():
if v in pred:
return k
return pred
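# Illustrative: with option = {'A': 'a cat', 'B': 'a dog'}, post_process('a dog', option)
# returns 'B' by matching the option text, since 'a' is not one of the option letters.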
def evaluate_chat_model():
random.seed(args.seed)
for ds_name in args.datasets:
dataset = MultipleChoiceDataset(
root=ds_collections[ds_name]['root'],
annotation=ds_collections[ds_name]['annotation'],
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for _, (pixel_values, questions, answers, indexes) in enumerate(tqdm(dataloader)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config,
verbose=True
)
preds = [pred]
for question, pred, answer, index in zip(questions, preds, answers, indexes):
outputs.append({
'question_id': index,
'question': question,
'prediction': pred,
'answer': answer,
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.jsonl'
output_path = os.path.join(args.out_dir, results_file)
writer = open(output_path, 'w')
results = []
for item in merged_outputs:
writer.write(json.dumps(item) + '\n')
answer = item['answer']
prediction = item['prediction']
if prediction == answer:
results.append(1)
else:
results.append(0)
writer.close()
print('Results saved to {}'.format(output_path))
print(f'Acc@1: {sum(results) / len(results)}')
cmd = f'python eval/seed/calculation.py --image_result_file {output_path}'
os.system(cmd)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='SEEDv1')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
evaluate_chat_model()
# README for Evaluation
## 🌟 Overview
This script provides an evaluation pipeline for `Tiny-LVLM-eHub`.
## 🗂️ Data Preparation
Before starting to download the data, please create the `InternVL/internvl_chat/data` folder.
### Tiny-LVLM-eHub
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/tiny_lvlm && cd data/tiny_lvlm
# Step 2: Download the dataset
wget https://huggingface.co/OpenGVLab/InternVL/resolve/main/updated_datasets.zip
unzip updated_datasets.zip
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/tiny_lvlm
└── updated_datasets
```
## 🏃 Evaluation Execution
> ⚠️ Note: For testing InternVL (1.5, 2.0, 2.5, and later versions), always enable `--dynamic` to perform dynamic resolution testing.
To run the evaluation, execute the following command on an 8-GPU setup:
```shell
torchrun --nproc_per_node=8 eval/tiny_lvlm/evaluate_lvlm.py --checkpoint ${CHECKPOINT} --dynamic
```
Alternatively, you can run the following simplified command:
```shell
GPUS=8 sh evaluate.sh ${CHECKPOINT} tiny_lvlm --dynamic
```
### Arguments
The following arguments can be configured for the evaluation script:
| Argument | Type | Default | Description |
| ---------------- | ------ | -------------------- | ----------------------------------------------------------------------------------------------------------------- |
| `--checkpoint` | `str` | `''` | Path to the model checkpoint. |
| `--datasets` | `str` | `'updated_datasets'` | Comma-separated list of datasets to evaluate. |
| `--dynamic` | `flag` | `False` | Enables dynamic high resolution preprocessing. |
| `--max-num` | `int` | `6` | Maximum tile number for dynamic high resolution. |
| `--load-in-8bit` | `flag` | `False` | Loads the model weights in 8-bit precision. |
| `--auto` | `flag` | `False` | Automatically splits a large model across 8 GPUs when needed, useful for models too large to fit on a single GPU. |
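The dynamic-resolution tile budget can be raised beyond the default of 6 when needed; an illustrative invocation:
```shell
# Illustrative: raise the dynamic high-resolution tile limit from 6 to 12
torchrun --nproc_per_node=8 eval/tiny_lvlm/evaluate_lvlm.py --checkpoint ${CHECKPOINT} --dynamic --max-num 12
```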
import argparse
import json
try:
from .tools import VQAEval
except ImportError:
from tools import VQAEval
def parse_args():
parser = argparse.ArgumentParser(description='Demo')
parser.add_argument('--file-path', type=str, default='results/updated_datasets_231221114523.json')
args = parser.parse_args()
return args
def main(args):
data = json.loads(open(args.file_path).read())
overall_score = 0
results = {}
dataset_names = ['Visual_Reasoning', 'Visual_Perception', 'Visual_Knowledge_Acquisition',
'Visual_Commonsense', 'Object_Hallucination']
for item in data:
task_type = item['image_path'].split('/')[-2]
assert task_type in dataset_names
if task_type in results:
results[task_type].append(item)
else:
results[task_type] = [item]
for k, v in results.items():
eval = VQAEval()
correct = 0
num = 0
for i in range(len(v)):
gt_answers = v[i]['gt_answers']
answer = v[i]['answer']
if eval.evaluate(answer, gt_answers) == 1:
correct += 1
num += 1
overall_score += float(correct) / num
print(f'{k}: {float(correct) / num}')
print(f'Overall: {overall_score}')
if __name__ == '__main__':
args = parse_args()
main(args)
import argparse
import itertools
import json
import os
import random
import time
from functools import partial
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from tqdm import tqdm
ds_collections = {
'updated_datasets': {
'root': 'data/tiny_lvlm/updated_datasets/',
'max_new_tokens': 30,
'min_new_tokens': 1,
}
}
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
questions = [_['question'] for _ in batches]
annotations = [_['annotation'] for _ in batches]
image_paths = [_['image_path'] for _ in batches]
return pixel_values, questions, annotations, image_paths
class VQADataset(torch.utils.data.Dataset):
def __init__(self, root, prompt, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
dirnames = [os.path.join(root, item) for item in os.listdir(root)]
dirnames = [item for item in dirnames if os.path.exists(os.path.join(item, 'dataset.json'))]
        dirnames = sorted(dirnames)
self.roots = []
self.items = []
for item in dirnames:
data_path = os.path.join(item, 'dataset.json')
data = json.loads(open(data_path).read())
for data_line in data:
self.roots.append(item)
self.items.append(data_line)
self.prompt = prompt
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.items)
def __getitem__(self, idx):
root = self.roots[idx]
item = self.items[idx]
image_path, question, annotation = item['image_path'], item['question'], item['gt_answers']
image_path = os.path.join(root, image_path)
image = Image.open(image_path).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
question = question + ' ' + self.prompt
return {
'question': question,
'pixel_values': pixel_values,
'annotation': annotation,
'image_path': image_path,
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
def evaluate_chat_model():
prompt = 'Answer the question using a single word or phrase.'
random.seed(args.seed)
for ds_name in args.datasets:
dataset = VQADataset(
root=ds_collections[ds_name]['root'],
prompt=prompt,
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
        for _, (pixel_values, questions, annotations, image_paths) in enumerate(tqdm(dataloader)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config,
verbose=True
)
answers = [pred]
for question, answer, annotation, image_path in zip(questions, answers, annotations, image_paths):
task_type = image_path.split('/')[-2]
outputs.append({
'question': question,
'answer': answer,
'gt_answers': annotation,
'image_path': image_path,
'task_type': task_type
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.json'
results_file = os.path.join(args.out_dir, results_file)
json.dump(merged_outputs, open(results_file, 'w'))
print('Results saved to {}'.format(results_file))
cmd = 'python eval/tiny_lvlm/calculate_score.py ' \
'--file-path ' + results_file
print(cmd)
os.system(cmd)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='updated_datasets')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
evaluate_chat_model()
# https://github.com/OpenGVLab/Multi-Modality-Arena/blob/main/tiny_lvlm_evaluation/utils/tools.py
import re
def remove_special_chars(s):
pattern = r'[^a-zA-Z0-9\s]'
s = re.sub(pattern, '', s)
return s
def has_word(sentence, word):
pattern = r'\b' + re.escape(word) + r'\b'
match = re.search(pattern, sentence)
if match:
return True
else:
return False
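# Illustrative: has_word('a red car', 'red') is True, while has_word('a redder car', 'red')
# is False because the word-boundary pattern requires 'red' to appear as a whole word.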
class VQAEval:
def __init__(self):
self.contractions = {
'aint': "ain't",
'arent': "aren't",
'cant': "can't",
'couldve': "could've",
'couldnt': "couldn't",
"couldn'tve": "couldn't've",
"couldnt've": "couldn't've",
'didnt': "didn't",
'doesnt': "doesn't",
'dont': "don't",
'hadnt': "hadn't",
"hadnt've": "hadn't've",
"hadn'tve": "hadn't've",
'hasnt': "hasn't",
'havent': "haven't",
'hed': "he'd",
"hed've": "he'd've",
"he'dve": "he'd've",
'hes': "he's",
'howd': "how'd",
'howll': "how'll",
'hows': "how's",
"Id've": "I'd've",
"I'dve": "I'd've",
'Im': "I'm",
'Ive': "I've",
'isnt': "isn't",
'itd': "it'd",
"itd've": "it'd've",
"it'dve": "it'd've",
'itll': "it'll",
"let's": "let's",
'maam': "ma'am",
'mightnt': "mightn't",
"mightnt've": "mightn't've",
"mightn'tve": "mightn't've",
'mightve': "might've",
'mustnt': "mustn't",
'mustve': "must've",
'neednt': "needn't",
'notve': "not've",
'oclock': "o'clock",
'oughtnt': "oughtn't",
"ow's'at": "'ow's'at",
"'ows'at": "'ow's'at",
"'ow'sat": "'ow's'at",
'shant': "shan't",
"shed've": "she'd've",
"she'dve": "she'd've",
"she's": "she's",
'shouldve': "should've",
'shouldnt': "shouldn't",
"shouldnt've": "shouldn't've",
"shouldn'tve": "shouldn't've",
"somebody'd": 'somebodyd',
"somebodyd've": "somebody'd've",
"somebody'dve": "somebody'd've",
'somebodyll': "somebody'll",
'somebodys': "somebody's",
'someoned': "someone'd",
"someoned've": "someone'd've",
"someone'dve": "someone'd've",
'someonell': "someone'll",
'someones': "someone's",
'somethingd': "something'd",
"somethingd've": "something'd've",
"something'dve": "something'd've",
'somethingll': "something'll",
'thats': "that's",
'thered': "there'd",
"thered've": "there'd've",
"there'dve": "there'd've",
'therere': "there're",
'theres': "there's",
'theyd': "they'd",
"theyd've": "they'd've",
"they'dve": "they'd've",
'theyll': "they'll",
'theyre': "they're",
'theyve': "they've",
'twas': "'twas",
'wasnt': "wasn't",
"wed've": "we'd've",
"we'dve": "we'd've",
'weve': "we've",
'werent': "weren't",
'whatll': "what'll",
'whatre': "what're",
'whats': "what's",
'whatve': "what've",
'whens': "when's",
'whered': "where'd",
'wheres': "where's",
'whereve': "where've",
'whod': "who'd",
"whod've": "who'd've",
"who'dve": "who'd've",
'wholl': "who'll",
'whos': "who's",
'whove': "who've",
'whyll': "why'll",
'whyre': "why're",
'whys': "why's",
'wont': "won't",
'wouldve': "would've",
'wouldnt': "wouldn't",
"wouldnt've": "wouldn't've",
"wouldn'tve": "wouldn't've",
'yall': "y'all",
"yall'll": "y'all'll",
"y'allll": "y'all'll",
"yall'd've": "y'all'd've",
"y'alld've": "y'all'd've",
"y'all'dve": "y'all'd've",
'youd': "you'd",
"youd've": "you'd've",
"you'dve": "you'd've",
'youll': "you'll",
'youre': "you're",
'youve': "you've",
}
self.manualMap = {
'none': '0',
'zero': '0',
'one': '1',
'two': '2',
'three': '3',
'four': '4',
'five': '5',
'six': '6',
'seven': '7',
'eight': '8',
'nine': '9',
'ten': '10',
}
self.articles = ['a', 'an', 'the']
self.periodStrip = re.compile('(?!<=\d)(\.)(?!\d)')
self.commaStrip = re.compile('(\d)(\,)(\d)')
self.punct = [
';',
r'/',
'[',
']',
'"',
'{',
'}',
'(',
')',
'=',
'+',
'\\',
'_',
'-',
'>',
'<',
'@',
'`',
',',
'?',
'!',
]
def evaluate(self, answer, gt_answers):
answer = answer.replace('\n', ' ')
answer = answer.replace('\t', ' ')
answer = answer.strip()
answer = self.processPunctuation(answer)
answer = self.processDigitArticle(answer)
if type(gt_answers) == list:
for i in range(len(gt_answers)):
gt_answers[i] = gt_answers[i].replace('\n', ' ')
gt_answers[i] = gt_answers[i].replace('\t', ' ')
gt_answers[i] = gt_answers[i].strip()
gt_answers[i] = self.processPunctuation(gt_answers[i])
gt_answers[i] = self.processDigitArticle(gt_answers[i])
if has_word(answer, gt_answers[i]):
return 1
return 0
else:
gt_answers = gt_answers.replace('\n', ' ')
gt_answers = gt_answers.replace('\t', ' ')
gt_answers = gt_answers.strip()
gt_answers = self.processPunctuation(gt_answers)
gt_answers = self.processDigitArticle(gt_answers)
if has_word(answer, gt_answers):
return 1
else:
return 0
def evaluate_MRR(self, answer, gt_answers):
answer = answer.replace('\n', ' ')
answer = answer.replace('\t', ' ')
answer = answer.strip()
answer = self.processPunctuation(answer)
answer = self.processDigitArticle(answer)
if type(gt_answers) is str:
gt_answers = [gt_answers]
for i in range(len(gt_answers)):
gt_answers[i] = gt_answers[i].replace('\n', ' ')
gt_answers[i] = gt_answers[i].replace('\t', ' ')
gt_answers[i] = gt_answers[i].strip()
gt_answers[i] = self.processPunctuation(gt_answers[i])
gt_answers[i] = self.processDigitArticle(gt_answers[i])
if has_word(answer, gt_answers[i]):
return 1 / (i + 1)
return 0.0
def processPunctuation(self, inText):
outText = inText
for p in self.punct:
if (p + ' ' in inText or ' ' + p in inText) or (
re.search(self.commaStrip, inText) is not None
):
outText = outText.replace(p, '')
else:
outText = outText.replace(p, ' ')
outText = self.periodStrip.sub('', outText, re.UNICODE)
return outText
def processDigitArticle(self, inText):
outText = []
tempText = inText.lower().split()
for word in tempText:
word = self.manualMap.setdefault(word, word)
if word not in self.articles:
outText.append(word)
else:
pass
for wordId, word in enumerate(outText):
if word in self.contractions:
outText[wordId] = self.contractions[word]
outText = ' '.join(outText)
return outText
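# Illustrative: VQAEval().evaluate('Two dogs.', 'two') returns 1, because both strings are
# normalized ('Two dogs.' -> '2 dogs', 'two' -> '2') and '2' then matches as a whole word.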
# README for Evaluation
## 🌟 Overview
This script provides an evaluation pipeline for visual question answering across 9 datasets: `VQAv2`, `OKVQA`, `TextVQA`, `Vizwiz`, `DocVQA`, `ChartQA`, `AI2D`, `InfoVQA`, and `GQA`.
## 🗂️ Data Preparation
Before starting to download the data, please create the `InternVL/internvl_chat/data` folder.
### VQAv2
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/vqav2 && cd data/vqav2
# Step 2: Make sure you have downloaded COCO images
ln -s ../coco/train2014 ./
ln -s ../coco/val2014 ./
ln -s ../coco/test2015 ./
# Step 3: Download questions and annotations
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip && unzip v2_Annotations_Train_mscoco.zip
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip && unzip v2_Questions_Train_mscoco.zip
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip && unzip v2_Annotations_Val_mscoco.zip
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip && unzip v2_Questions_Val_mscoco.zip
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Test_mscoco.zip && unzip v2_Questions_Test_mscoco.zip
# Step 4: Download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vqav2/vqav2_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vqav2/vqav2_val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vqav2/vqav2_testdev.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/vqav2
├── train2014 -> ../coco/train2014
├── val2014 -> ../coco/val2014
├── test2015 -> ../coco/test2015
├── v2_mscoco_train2014_annotations.json
├── v2_mscoco_train2014_complementary_pairs.json
├── v2_mscoco_val2014_annotations.json
├── v2_OpenEnded_mscoco_test2015_questions.json
├── v2_OpenEnded_mscoco_test-dev2015_questions.json
├── v2_OpenEnded_mscoco_train2014_questions.json
├── v2_OpenEnded_mscoco_val2014_questions.json
├── vqav2_testdev.jsonl
├── vqav2_train.jsonl
└── vqav2_val.jsonl
```
### OKVQA
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/okvqa && cd data/okvqa
# Step 2: Make sure you have downloaded COCO images
ln -s ../coco/train2014 ./
ln -s ../coco/val2014 ./
# Step 3: Download annotations and questions
wget https://okvqa.allenai.org/static/data/mscoco_train2014_annotations.json.zip && unzip mscoco_train2014_annotations.json.zip
wget https://okvqa.allenai.org/static/data/OpenEnded_mscoco_train2014_questions.json.zip && unzip OpenEnded_mscoco_train2014_questions.json.zip
wget https://okvqa.allenai.org/static/data/mscoco_val2014_annotations.json.zip && unzip mscoco_val2014_annotations.json.zip
wget https://okvqa.allenai.org/static/data/OpenEnded_mscoco_val2014_questions.json.zip && unzip OpenEnded_mscoco_val2014_questions.json.zip
# Step 4: Download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/okvqa/okvqa_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/okvqa/okvqa_val.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/okvqa
├── mscoco_train2014_annotations.json
├── mscoco_val2014_annotations.json
├── okvqa_train.jsonl
├── okvqa_val.jsonl
├── OpenEnded_mscoco_train2014_questions.json
├── OpenEnded_mscoco_val2014_questions.json
├── train2014 -> ../coco/train2014
└── val2014 -> ../coco/val2014
```
### TextVQA
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/textvqa && cd data/textvqa
# Step 2: Download images
wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip && unzip train_val_images.zip
# Step 3: Download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train_questions.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val_questions.json
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/textvqa_val.jsonl
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/textvqa_val_llava.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/textvqa
├── TextVQA_Rosetta_OCR_v0.2_test.json
├── TextVQA_Rosetta_OCR_v0.2_train.json
├── TextVQA_Rosetta_OCR_v0.2_val.json
├── textvqa_train_annotations.json
├── textvqa_train.jsonl
├── textvqa_train_questions.json
├── textvqa_val_annotations.json
├── textvqa_val.jsonl
├── textvqa_val_llava.jsonl
├── textvqa_val_questions.json
└── train_images
```
### VizWiz
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/vizwiz && cd data/vizwiz
# Step 2: Download images
wget https://vizwiz.cs.colorado.edu/VizWiz_final/images/train.zip && unzip train.zip
wget https://vizwiz.cs.colorado.edu/VizWiz_final/images/val.zip && unzip val.zip
wget https://vizwiz.cs.colorado.edu/VizWiz_final/images/test.zip && unzip test.zip
# Step 3: Download annotations
wget https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip && unzip Annotations.zip
# Step 4: Download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_train_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_train_questions.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_val_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_val_questions.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_test.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/vizwiz
├── annotations
├── test
├── train
├── val
├── vizwiz_test.jsonl
├── vizwiz_train_annotations.json
├── vizwiz_train.jsonl
├── vizwiz_train_questions.json
├── vizwiz_val_annotations.json
├── vizwiz_val.jsonl
└── vizwiz_val_questions.json
```
### DocVQA
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/docvqa && cd data/docvqa
# Step 2: Download images and annotations
wget https://datasets.cvc.uab.es/rrc/DocVQA/train.tar.gz --no-check-certificate # (optional)
wget https://datasets.cvc.uab.es/rrc/DocVQA/val.tar.gz --no-check-certificate
wget https://datasets.cvc.uab.es/rrc/DocVQA/test.tar.gz --no-check-certificate
# Step 3: Unzip files
tar -zxvf train.tar.gz
tar -zxvf val.tar.gz
tar -zxvf test.tar.gz
# Step 4: Download converted jsonl files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/test.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/docvqa
├── test
├── test.jsonl
├── train
├── train.jsonl
├── val
└── val.jsonl
```
### AI2D
Follow the instructions below to prepare the data:
```bash
# Step 1: Create the data directory
mkdir -p data/ai2diagram && cd data/ai2diagram
# Step 2: Download converted files
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/ai2d_test_vlmevalkit.jsonl -O test_vlmevalkit.jsonl
wget https://huggingface.co/OpenGVLab/InternVL/resolve/main/AI2D_TEST.zip && unzip AI2D_TEST.zip
# Step 3: Download images from Google Drive (optional, provided by InternLM-XComposer)
# https://drive.google.com/file/d/1dqqa3MnrxMXaU_K9JA6C83je32ibwdOY/view?usp=sharing
# images should be placed in `data/ai2diagram/ai2d/abc_images` and `data/ai2diagram/ai2d/images`
cd ../..
```
After preparation is complete, the directory structure is:
```
data/ai2diagram
├── test_vlmevalkit.jsonl
├── ai2d # (optional)
│ ├── abc_images
│ └── images
└── AI2D_TEST
```
### InfoVQA
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/infographicsvqa && cd data/infographicsvqa
# Step 2: Download images and annotations from https://rrc.cvc.uab.es/?ch=17&com=downloads
# infographicsVQA_test_v1.0.json, infographicsVQA_val_v1.0_withQT.json, infographicVQA_train_v1.0.json
# Step 3: Download converted files
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/infographicsvqa_val.jsonl -O val.jsonl
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/infographicsvqa_test.jsonl -O test.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/infographicsvqa
├── infographicsvqa_images
├── infographicsVQA_test_v1.0.json
├── infographicsVQA_val_v1.0_withQT.json
├── infographicVQA_train_v1.0.json
├── test.jsonl
└── val.jsonl
```
### ChartQA
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/chartqa && cd data/chartqa
# Step 2: download images from
# https://drive.google.com/file/d/1Lm_w6zeET1Hyl_9ks6w5nEsgpoyPHalV/view
# Step 3: Download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/train_human.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/train_augmented.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/test_human.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/test_augmented.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/chartqa
├── ChartQA Dataset
│ ├── test
│ ├── train
│ └── val
├── test_augmented.jsonl
├── test_human.jsonl
├── train_augmented.jsonl
└── train_human.jsonl
```
### GQA
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/gqa && cd data/gqa
# Step 2: Download the official evaluation script
wget https://nlp.stanford.edu/data/gqa/eval.zip
unzip eval.zip
# Step 3: Download images
wget https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip
unzip images.zip
# Step 4: Download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/gqa/testdev_balanced.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/gqa/train_balanced.jsonl
wget https://github.com/OpenGVLab/InternVL/releases/download/data/llava_gqa_testdev_balanced_qwen_format.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/gqa
├── challenge_all_questions.json
├── challenge_balanced_questions.json
├── eval.py
├── images
├── llava_gqa_testdev_balanced_qwen_format.jsonl
├── readme.txt
├── submission_all_questions.json
├── test_all_questions.json
├── test_balanced.jsonl
├── test_balanced_questions.json
├── testdev_all_questions.json
├── testdev_balanced_all_questions.json
├── testdev_balanced_predictions.json
├── testdev_balanced_questions.json
├── train_all_questions
├── train_balanced.jsonl
├── train_balanced_questions.json
├── val_all_questions.json
└── val_balanced_questions.json
```
## 🏃 Evaluation Execution
> ⚠️ Note: For testing InternVL (1.5, 2.0, 2.5, and later versions), always enable `--dynamic` to perform dynamic resolution testing.
To run the evaluation, execute the following command on an 8-GPU setup:
```shell
torchrun --nproc_per_node=8 eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets ${DATASETS} --dynamic
```
Alternatively, you can run the following simplified commands:
```shell
# Test VQAv2 val
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-vqav2-val --dynamic
# Test VQAv2 testdev
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-vqav2-testdev --dynamic
# Test OKVQA val
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-okvqa-val --dynamic
# Test Vizwiz val
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-vizwiz-val --dynamic
# Test Vizwiz test
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-vizwiz-test --dynamic
# Test GQA testdev
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-gqa-testdev --dynamic
# Test AI2D test
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-ai2d-test --dynamic
# Test TextVQA val
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-textvqa-val --dynamic
# Test ChartQA test-human & test-augmented
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-chartqa-test --dynamic --max-num 12
# Test DocVQA val
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-docvqa-val --dynamic --max-num 18
# Test DocVQA test
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-docvqa-test --dynamic --max-num 18
# Test InfoVQA val
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-infovqa-val --dynamic --max-num 24
# Test InfoVQA test
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-infovqa-test --dynamic --max-num 24
```
### Arguments
The following arguments can be configured for the evaluation script:
| Argument | Type | Default | Description |
| ---------------- | ------ | ----------- | ----------------------------------------------------------------------------------------------------------------- |
| `--checkpoint` | `str` | `''` | Path to the model checkpoint. |
| `--datasets` | `str` | `okvqa_val` | Comma-separated list of datasets to evaluate. |
| `--dynamic` | `flag` | `False` | Enables dynamic high resolution preprocessing. |
| `--max-num` | `int` | `6` | Maximum tile number for dynamic high resolution. |
| `--load-in-8bit` | `flag` | `False` | Loads the model weights in 8-bit precision. |
| `--auto` | `flag` | `False` | Automatically splits a large model across 8 GPUs when needed, useful for models too large to fit on a single GPU. |
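These flags can be combined. For example, the following illustrative command evaluates the DocVQA val split with dynamic tiling, at most 18 tiles, and 8-bit weights:
```shell
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-docvqa-val --dynamic --max-num 18 --load-in-8bit
```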
import argparse
import json
parser = argparse.ArgumentParser()
parser.add_argument('--src', type=str)
parser.add_argument('--dst', type=str)
args = parser.parse_args()
all_answers = []
data = json.load(open(args.src))
for res in data:
question_id = res['questionId']
answer = res['answer'].rstrip('.').lower()
all_answers.append({'questionId': question_id, 'prediction': answer})
with open(args.dst, 'w') as f:
json.dump(all_answers, f)
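# Note: this conversion script appears to match the `convert_gqa_for_eval.py` usage in the
# evaluation code below: each record in the source JSON is expected to carry 'questionId'
# and 'answer', and the output is a list of {'questionId': ..., 'prediction': ...} entries
# consumable by GQA's official eval.py.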
import argparse
import itertools
import json
import os
import random
import subprocess
import time
from functools import partial
from typing import Optional
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from textvqa_eval import TextVQAAccuracyEvaluator
from tqdm import tqdm
ds_collections = {
'vqav2_val': {
'train': 'data/vqav2/vqav2_train.jsonl',
'test': 'data/vqav2/vqav2_val.jsonl',
'question': 'data/vqav2/v2_OpenEnded_mscoco_val2014_questions.json',
'annotation': 'data/vqav2/v2_mscoco_val2014_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'vqav2_testdev': {
'train': 'data/vqav2/vqav2_train.jsonl',
'test': 'data/vqav2/vqav2_testdev.jsonl',
'metric': None,
'max_new_tokens': 10,
},
'okvqa_val': {
'train': 'data/okvqa/okvqa_train.jsonl',
'test': 'data/okvqa/okvqa_val.jsonl',
'question': 'data/okvqa/OpenEnded_mscoco_val2014_questions.json',
'annotation': 'data/okvqa/mscoco_val2014_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'textvqa_val': {
'train': 'data/textvqa/textvqa_train.jsonl',
'test': 'data/textvqa/textvqa_val.jsonl',
'question': 'data/textvqa/textvqa_val_questions.json',
'annotation': 'data/textvqa/textvqa_val_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'textvqa_val_ocr': {
'train': 'data/textvqa/textvqa_train.jsonl',
'test': 'data/textvqa/textvqa_val_llava.jsonl',
'question': 'data/textvqa/textvqa_val_questions.json',
'annotation': 'data/textvqa/textvqa_val_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'vizwiz_val': {
'train': 'data/vizwiz/vizwiz_train.jsonl',
'test': 'data/vizwiz/vizwiz_val.jsonl',
'question': 'data/vizwiz/vizwiz_val_questions.json',
'annotation': 'data/vizwiz/vizwiz_val_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'vizwiz_test': {
'train': 'data/vizwiz/vizwiz_train.jsonl',
'test': 'data/vizwiz/vizwiz_test.jsonl',
'metric': None,
'max_new_tokens': 10,
},
'docvqa_val': {
'train': 'data/docvqa/train.jsonl',
'test': 'data/docvqa/val.jsonl',
'annotation': 'data/docvqa/val/val_v1.0.json',
'metric': 'anls',
'max_new_tokens': 100,
},
'docvqa_test': {
'train': 'data/docvqa/train.jsonl',
'test': 'data/docvqa/test.jsonl',
'metric': None,
'max_new_tokens': 100,
},
'chartqa_test_human': {
'train': 'data/chartqa/train_human.jsonl',
'test': 'data/chartqa/test_human.jsonl',
'metric': 'relaxed_accuracy',
'max_new_tokens': 100,
},
'chartqa_test_augmented': {
'train': 'data/chartqa/train_augmented.jsonl',
'test': 'data/chartqa/test_augmented.jsonl',
'metric': 'relaxed_accuracy',
'max_new_tokens': 100,
},
'gqa_testdev': {
'train': 'data/gqa/train.jsonl',
'test': 'data/gqa/test_balanced.jsonl',
'metric': 'accuracy',
'max_new_tokens': 10,
},
'gqa_testdev_llava': {
'train': 'data/gqa/train.jsonl',
'test': 'data/gqa/llava_gqa_testdev_balanced_qwen_format.jsonl',
'metric': 'accuracy',
'max_new_tokens': 10,
},
'ocrvqa_val': {
'train': 'data/ocrvqa/ocrvqa_train.jsonl',
'test': 'data/ocrvqa/ocrvqa_val.jsonl',
'metric': 'accuracy',
'max_new_tokens': 100,
},
'ocrvqa_test': {
'train': 'data/ocrvqa/ocrvqa_train.jsonl',
'test': 'data/ocrvqa/ocrvqa_test.jsonl',
'metric': 'accuracy',
'max_new_tokens': 100,
},
'ai2diagram_test': {
'train': 'data/ai2diagram/train.jsonl',
'test': 'data/ai2diagram/test_vlmevalkit.jsonl',
'metric': 'accuracy',
'max_new_tokens': 10,
},
'infographicsvqa_val': {
'train': 'data/infographicsvqa/train.jsonl',
'test': 'data/infographicsvqa/val.jsonl',
'annotation': 'data/infographicsvqa/infographicsVQA_val_v1.0_withQT.json',
'metric': 'anls',
'max_new_tokens': 100,
},
'infographicsvqa_test': {
'train': 'data/infographicsvqa/train.jsonl',
'test': 'data/infographicsvqa/test.jsonl',
'annotation': 'data/infographicsvqa/infographicsVQA_test_v1.0.json',
'metric': None,
'max_new_tokens': 100,
}
}
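# Each entry above lists the train/test jsonl paths (the train split is only used as a
# few-shot pool), optional question/annotation files for scoring, the metric applied
# locally ('vqa_score', 'anls', 'relaxed_accuracy', 'accuracy', or None when predictions
# are only saved to disk, e.g. for server submission), and the max_new_tokens budget.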
# https://github.com/google-research/pix2struct/blob/main/pix2struct/metrics.py#L81
def relaxed_correctness(target: str,
prediction: str,
max_relative_change: float = 0.05) -> bool:
"""Calculates relaxed correctness.
The correctness tolerates certain error ratio defined by max_relative_change.
See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
“Following Methani et al. (2020), we use a relaxed accuracy measure for the
numeric answers to allow a minor inaccuracy that may result from the automatic
data extraction process. We consider an answer to be correct if it is within
5% of the gold answer. For non-numeric answers, we still need an exact match
to consider an answer to be correct.”
Args:
target: Target string.
prediction: Predicted string.
max_relative_change: Maximum relative change.
Returns:
Whether the prediction was correct given the specified tolerance.
"""
def _to_float(text: str) -> Optional[float]:
try:
if text.endswith('%'):
# Convert percentages to floats.
return float(text.rstrip('%')) / 100.0
else:
return float(text)
except ValueError:
return None
prediction_float = _to_float(prediction)
target_float = _to_float(target)
if prediction_float is not None and target_float:
relative_change = abs(prediction_float -
target_float) / abs(target_float)
return relative_change <= max_relative_change
else:
return prediction.lower() == target.lower()
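# Illustrative behaviour of relaxed_correctness (comments added for clarity):
#   relaxed_correctness('100', '104') -> True   (|104 - 100| / 100 = 0.04 <= 0.05)
#   relaxed_correctness('100', '106') -> False  (relative change 0.06 > 0.05)
#   relaxed_correctness('cat', 'Cat') -> True   (non-numeric answers fall back to a case-insensitive exact match)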
def evaluate_relaxed_accuracy(entries):
scores = []
for elem in entries:
if isinstance(elem['annotation'], str):
elem['annotation'] = [elem['annotation']]
score = max([
relaxed_correctness(elem['answer'].strip(), ann)
for ann in elem['annotation']
])
scores.append(score)
return sum(scores) / len(scores)
def evaluate_exact_match_accuracy(entries):
scores = []
for elem in entries:
if isinstance(elem['annotation'], str):
elem['annotation'] = [elem['annotation']]
score = max([
(1.0 if
(elem['answer'].strip().lower() == ann.strip().lower()) else 0.0)
for ann in elem['annotation']
])
scores.append(score)
return sum(scores) / len(scores)
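# Both helpers above expect entries shaped like {'answer': ..., 'annotation': [...]};
# e.g. {'answer': '4', 'annotation': ['4', 'four']} scores 1.0 under exact match, while
# relaxed accuracy additionally tolerates numeric answers within 5% of an annotation.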
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
questions = [_['question'] for _ in batches]
question_ids = [_['question_id'] for _ in batches]
annotations = [_['annotation'] for _ in batches]
return pixel_values, questions, question_ids, annotations
class VQADataset(torch.utils.data.Dataset):
def __init__(self, train, test, prompt, few_shot, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
self.test = open(test).readlines()
self.prompt = prompt
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.few_shot = few_shot
self.max_num = max_num
if few_shot > 0:
self.train = open(train).readlines()
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.test)
def __getitem__(self, idx):
data = json.loads(self.test[idx].strip())
image, question, question_id, annotation = data['image'], data[
'question'], data['question_id'], data.get('answer', None)
few_shot_prompt = ''
if self.few_shot > 0:
few_shot_samples = random.sample(self.train, self.few_shot)
for sample in few_shot_samples:
sample = json.loads(sample.strip())
few_shot_prompt += self.prompt.format(
sample['image'],
sample['question']) + f" {sample['answer']}"
image = Image.open(image).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
if len(self.prompt) != 0:
question = question + ' ' + self.prompt
return {
'question_id': question_id,
'question': question,
'pixel_values': pixel_values,
'annotation': annotation
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
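# Sharding example (illustrative): with total_size=10 and world_size=4 the shard sizes are
# [3, 3, 2, 2], so rank 0 evaluates indices 0-2, rank 1 indices 3-5, rank 2 indices 6-7,
# and rank 3 indices 8-9; the per-rank outputs are later merged with all_gather_object.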
def post_process(response):
response = response.strip().split('.')[0].split(
',')[0].split('!')[0].lower()
if 'is ' in response:
response = response.split('is ')[1]
if 'are ' in response:
response = response.split('are ')[1]
if 'a ' in response:
response = response.split('a ')[1]
if 'an ' in response:
response = response.split('an ')[1]
if 'the ' in response:
response = response.split('the ')[1]
if ' of' in response:
response = response.split(' of')[0]
response = response.strip()
return response
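# Illustrative examples of the normalization above:
#   post_process('It is a red car.')  -> 'red car'
#   post_process('The answer is 2.')  -> '2'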
def evaluate_chat_model():
base_prompt = 'Answer the question using a single word or phrase.'
vizwiz_prompt = "When the provided information is insufficient, respond with 'Unanswerable'. "
infovqa_prompt = 'Answer the question using a single word or phrase.'
ai2d_prompt = ''
random.seed(args.seed)
summaries = []
for ds_name in args.datasets:
if 'vizwiz' in ds_name:
input_prompt = vizwiz_prompt + base_prompt
elif 'ai2d' in ds_name:
input_prompt = ai2d_prompt
elif 'infographicsvqa' in ds_name:
input_prompt = infovqa_prompt
else:
input_prompt = base_prompt
dataset = VQADataset(
train=ds_collections[ds_name]['train'],
test=ds_collections[ds_name]['test'],
prompt=input_prompt,
few_shot=args.few_shot,
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for _, (pixel_values, questions, question_ids, annotations) in tqdm(enumerate(dataloader)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=1,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config,
verbose=True
)
answers = [pred]
for question, question_id, answer, annotation in zip(questions, question_ids, answers, annotations):
if ds_name in ['vqav2_val', 'vqav2_testdev', 'okvqa_val', 'textvqa_val',
'vizwiz_val', 'textvqa_val_ocr']:
outputs.append({
'question': question,
'question_id': question_id,
'answer': answer,
})
elif ds_name in ['docvqa_val', 'infographicsvqa_val', 'gqa_testdev', 'ocrvqa_val',
'ocrvqa_test', 'gqa_testdev_llava', 'infographicsvqa_test',]:
outputs.append({
'question': question,
'questionId': question_id,
'answer': answer,
'annotation': annotation,
})
elif ds_name in ['ai2diagram_test']:
outputs.append({
'question': question,
'image': question_id,
'answer': answer,
'annotation': annotation,
})
elif ds_name in ['chartqa_test_human', 'chartqa_test_augmented']:
outputs.append({
'question': question,
'answer': answer,
'annotation': annotation,
})
elif ds_name in ['docvqa_test']:
outputs.append({
'questionId': question_id,
'answer': answer,
})
elif ds_name in ['vizwiz_test']:
outputs.append({
'image': question_id.replace('data/vizwiz/test/', ''),
'answer': answer,
})
else:
raise NotImplementedError
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.json'
results_file = os.path.join(args.out_dir, results_file)
json.dump(merged_outputs, open(results_file, 'w'))
print('Results saved to {}'.format(results_file))
if ds_collections[ds_name]['metric'] == 'vqa_score':
evaluator = TextVQAAccuracyEvaluator()
annotation = json.load(open(ds_collections[ds_name]['annotation'], 'r'))['annotations']
question_id2answers = {}
for item in annotation:
question_id = item['question_id']
answers = [answer['answer'] for answer in item['answers']]
question_id2answers[question_id] = answers
for item in merged_outputs:
item['pred_answer'] = item['answer']
item['gt_answers'] = question_id2answers[item['question_id']]
accuracy = evaluator.eval_pred_list(merged_outputs)
print(ds_name, accuracy)
summaries.append([args.checkpoint, ds_name, accuracy])
elif ds_collections[ds_name]['metric'] == 'anls':
json.dump(merged_outputs,
open(results_file, 'w'),
ensure_ascii=False)
print('python eval/vqa/infographicsvqa_eval.py -g ' +
ds_collections[ds_name]['annotation'] + ' -s ' +
results_file)
os.system('python eval/vqa/infographicsvqa_eval.py -g ' +
ds_collections[ds_name]['annotation'] + ' -s ' +
results_file)
elif ds_collections[ds_name]['metric'] == 'relaxed_accuracy':
relaxed_accuracy = evaluate_relaxed_accuracy(merged_outputs)
print(ds_name, {'relaxed_accuracy': relaxed_accuracy})
summaries.append([ds_name, {'relaxed_accuracy': relaxed_accuracy}])
elif ds_collections[ds_name]['metric'] == 'accuracy':
if 'gqa' in ds_name:
dst_file = './data/gqa/testdev_balanced_predictions.json'
print('python eval/vqa/convert_gqa_for_eval.py --src ' +
results_file + ' --dst ' + dst_file)
python_path = 'python'
os.system(python_path + ' eval/vqa/convert_gqa_for_eval.py --src ' +
results_file + ' --dst ' + dst_file)
command = f'cd ./data/gqa/ && {python_path} eval.py --tier testdev_balanced && cd ../../'
print(command)
accuracy = subprocess.check_output(command, shell=True, universal_newlines=True)
else:
accuracy = {'accuracy': evaluate_exact_match_accuracy(merged_outputs)}
print(ds_name, accuracy)
summaries.append([args.checkpoint, ds_name, accuracy])
torch.distributed.barrier()
out_path = '_'.join(args.checkpoint.split('/')[-2:])
writer = open(os.path.join(args.out_dir, f'{out_path}.txt'), 'a')
print(f"write results to file {os.path.join(args.out_dir, f'{out_path}.txt')}")
for summary in summaries:
print(summary)
writer.write(f'{summary}\n')
writer.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str,
default='okvqa_val,textvqa_val,vizwiz_val,ai2diagram_test,gqa_testdev_llava')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--few-shot', type=int, default=0)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
evaluate_chat_model()
# This file can be downloaded from: https://www.docvqa.org/datasets/infographicvqa and https://rrc.cvc.uab.es/?ch=17&com=introduction
import argparse
import json
import os
question_ids_to_exclude = []
# answer_types = {'image span': 'Image-Span', 'question span': 'Question-Span', 'multiple spans': 'Multi-Span', 'non span': 'None span', 'list': 'List'}
answer_types = {'image span': 'Image-Span', 'question span': 'Question-Span', 'multiple spans': 'Multi-Span',
'non span': 'None span'}
evidence_types = {'table/list': 'Table/list', 'textual': 'Text', 'photo/pciture/visual_objects': 'Visual/Layout',
'figure': 'Figure', 'map': 'Map'}
reasoning_requirements = {'comparison': 'Sorting', 'arithmetic': 'Arithmetic', 'counting': 'Counting'}
def save_json(file_path, data):
with open(file_path, 'w+') as json_file:
json.dump(data, json_file)
def levenshtein_distance(s1, s2):
if len(s1) > len(s2):
s1, s2 = s2, s1
distances = range(len(s1) + 1)
for i2, c2 in enumerate(s2):
distances_ = [i2 + 1]
for i1, c1 in enumerate(s1):
if c1 == c2:
distances_.append(distances[i1])
else:
distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
distances = distances_
return distances[-1]
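# Illustrative: levenshtein_distance('kitten', 'sitting') == 3 (two substitutions and one insertion).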
def validate_data(gtFilePath, submFilePath):
"""
    Method validate_data: validates that all files in the results folder are correct (have the correct name and contents).
    It also validates that there are no missing files in the folder.
    If an error is detected, the method raises it.
"""
gtJson = json.load(open(gtFilePath, 'rb'))
submJson = json.load(open(submFilePath, 'rb'))
if 'data' not in gtJson:
raise Exception('The GT file is not valid (no data key)')
if 'dataset_name' not in gtJson:
raise Exception('The GT file is not valid (no dataset_name key)')
if isinstance(submJson, list) is False:
raise Exception('The Det file is not valid (root item must be an array)')
if len(submJson) != len(gtJson['data']):
raise Exception('The Det file is not valid (invalid number of answers. Expected:' + str(
len(gtJson['data'])) + ' Found:' + str(len(submJson)) + ')')
gtQuestions = sorted([r['questionId'] for r in gtJson['data']])
res_id_to_index = {int(r['questionId']): ix for ix, r in enumerate(submJson)}
detQuestions = sorted([r['questionId'] for r in submJson])
if ((gtQuestions == detQuestions) is False):
        raise Exception('The Det file is not valid. Question IDs must match the GT')
for gtObject in gtJson['data']:
try:
q_id = int(gtObject['questionId'])
res_ix = res_id_to_index[q_id]
except:
raise Exception('The Det file is not valid. Question ' + str(gtObject['questionId']) + ' not present')
else:
detObject = submJson[res_ix]
# if detObject['questionId'] != gtObject['questionId'] :
# raise Exception("Answer #" + str(i) + " not valid (invalid question ID. Expected:" + str(gtObject['questionId']) + "Found:" + detObject['questionId'] + ")")
if 'answer' not in detObject:
raise Exception('Question ' + str(gtObject['questionId']) + ' not valid (no answer key)')
if isinstance(detObject['answer'], list) is True:
raise Exception(
'Question ' + str(gtObject['questionId']) + ' not valid (answer key has to be a single string)')
def evaluate_method(gtFilePath, submFilePath, evaluationParams):
"""
Method evaluate_method: evaluate method and returns the results
Results. Dictionary with the following values:
- method (required) Global method metrics. Ex: { 'Precision':0.8,'Recall':0.9 }
    - samples (optional) Per sample metrics. Ex: {'sample1' : { 'Precision':0.8,'Recall':0.9 } , 'sample2' : { 'Precision':0.8,'Recall':0.9 } }
"""
show_scores_per_answer_type = evaluationParams.answer_types
gtJson = json.load(open(gtFilePath, 'rb'))
submJson = json.load(open(submFilePath, 'rb'))
res_id_to_index = {int(r['questionId']): ix for ix, r in enumerate(submJson)}
perSampleMetrics = {}
totalScore = 0
row = 0
if show_scores_per_answer_type:
answerTypeTotalScore = {x: 0 for x in answer_types.keys()}
answerTypeNumQuestions = {x: 0 for x in answer_types.keys()}
evidenceTypeTotalScore = {x: 0 for x in evidence_types.keys()}
evidenceTypeNumQuestions = {x: 0 for x in evidence_types.keys()}
reasoningTypeTotalScore = {x: 0 for x in reasoning_requirements.keys()}
reasoningTypeNumQuestions = {x: 0 for x in reasoning_requirements.keys()}
for gtObject in gtJson['data']:
q_id = int(gtObject['questionId'])
res_ix = res_id_to_index[q_id]
detObject = submJson[res_ix]
if q_id in question_ids_to_exclude:
question_result = 0
info = 'Question EXCLUDED from the result'
else:
info = ''
values = []
for answer in gtObject['answers']:
# preprocess both the answers - gt and prediction
gt_answer = ' '.join(answer.strip().lower().split())
det_answer = ' '.join(detObject['answer'].strip().lower().split())
# dist = levenshtein_distance(answer.lower(), detObject['answer'].lower())
dist = levenshtein_distance(gt_answer, det_answer)
length = max(len(answer.upper()), len(detObject['answer'].upper()))
values.append(0.0 if length == 0 else float(dist) / float(length))
question_result = 1 - min(values)
if (question_result < evaluationParams.anls_threshold):
question_result = 0
totalScore += question_result
if show_scores_per_answer_type:
for q_type in gtObject['answer_type']:
answerTypeTotalScore[q_type] += question_result
answerTypeNumQuestions[q_type] += 1
for q_type in gtObject['evidence']:
evidenceTypeTotalScore[q_type] += question_result
evidenceTypeNumQuestions[q_type] += 1
for q_type in gtObject['operation/reasoning']:
reasoningTypeTotalScore[q_type] += question_result
reasoningTypeNumQuestions[q_type] += 1
perSampleMetrics[str(gtObject['questionId'])] = {
'score': question_result,
'question': gtObject['question'],
'gt': gtObject['answers'],
'det': detObject['answer'],
'info': info
}
row = row + 1
methodMetrics = {
'score': 0 if len(gtJson['data']) == 0 else totalScore / (len(gtJson['data']) - len(question_ids_to_exclude))
}
answer_types_scores = {}
evidence_types_scores = {}
operation_types_scores = {}
if show_scores_per_answer_type:
for a_type, ref in answer_types.items():
answer_types_scores[ref] = 0 if len(gtJson['data']) == 0 else answerTypeTotalScore[a_type] / (
answerTypeNumQuestions[a_type])
for e_type, ref in evidence_types.items():
evidence_types_scores[ref] = 0 if len(gtJson['data']) == 0 else evidenceTypeTotalScore[e_type] / (
evidenceTypeNumQuestions[e_type])
for r_type, ref in reasoning_requirements.items():
operation_types_scores[ref] = 0 if len(gtJson['data']) == 0 else reasoningTypeTotalScore[r_type] / (
reasoningTypeNumQuestions[r_type])
resDict = {
'result': methodMetrics,
'scores_by_types': {'answer_types': answer_types_scores, 'evidence_types': evidence_types_scores,
'operation_types': operation_types_scores},
'per_sample_result': perSampleMetrics
}
return resDict
def display_results(results, show_answer_types):
print('\nOverall ANLS: {:2.4f}'.format(results['result']['score']))
if show_answer_types:
print('\nAnswer types:')
for a_type in answer_types.values():
print('\t{:12s} {:2.4f}'.format(a_type, results['scores_by_types']['answer_types'][a_type]))
print('\nEvidence types:')
for e_type in evidence_types.values():
print('\t{:12s} {:2.4f}'.format(e_type, results['scores_by_types']['evidence_types'][e_type]))
print('\nOperation required:')
for r_type in reasoning_requirements.values():
print('\t{:12s} {:2.4f}'.format(r_type, results['scores_by_types']['operation_types'][r_type]))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='InfographicVQA evaluation script.')
parser.add_argument('-g', '--ground_truth', type=str, help='Path of the Ground Truth file.', required=True)
parser.add_argument('-s', '--submission_file', type=str, help="Path of your method's results file.", required=True)
parser.add_argument('-t', '--anls_threshold', type=float, default=0.5,
help='ANLS threshold to use (See Scene-Text VQA paper for more info.).', required=False)
parser.add_argument('-a', '--answer_types', type=bool, default=False,
help='Score break down by answer types (special gt file required).', required=False)
parser.add_argument('-o', '--output', type=str,
help="Path to a directory where to copy the file 'results.json' that contains per-sample results.",
required=False)
args = parser.parse_args()
# Validate the format of ground truth and submission files.
validate_data(args.ground_truth, args.submission_file)
# Evaluate method
results = evaluate_method(args.ground_truth, args.submission_file, args)
display_results(results, args.answer_types)
if args.output:
output_dir = args.output
if not os.path.exists(output_dir):
os.makedirs(output_dir)
resultsOutputname = os.path.join(output_dir, 'results.json')
save_json(resultsOutputname, results)
        print('All results, including per-sample results, have been correctly saved!')
# Copyright (c) Facebook, Inc. and its affiliates.
# copied from https://github.com/haotian-liu/LLaVA/blob/main/llava/eval/m4c_evaluator.py
import re
from tqdm import tqdm
class EvalAIAnswerProcessor:
"""
Processes an answer similar to Eval AI
copied from
https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897
"""
CONTRACTIONS = {
'aint': "ain't",
'arent': "aren't",
'cant': "can't",
'couldve': "could've",
'couldnt': "couldn't",
"couldn'tve": "couldn't've",
"couldnt've": "couldn't've",
'didnt': "didn't",
'doesnt': "doesn't",
'dont': "don't",
'hadnt': "hadn't",
"hadnt've": "hadn't've",
"hadn'tve": "hadn't've",
'hasnt': "hasn't",
'havent': "haven't",
'hed': "he'd",
"hed've": "he'd've",
"he'dve": "he'd've",
'hes': "he's",
'howd': "how'd",
'howll': "how'll",
'hows': "how's",
"Id've": "I'd've",
"I'dve": "I'd've",
'Im': "I'm",
'Ive': "I've",
'isnt': "isn't",
'itd': "it'd",
"itd've": "it'd've",
"it'dve": "it'd've",
'itll': "it'll",
"let's": "let's",
'maam': "ma'am",
'mightnt': "mightn't",
"mightnt've": "mightn't've",
"mightn'tve": "mightn't've",
'mightve': "might've",
'mustnt': "mustn't",
'mustve': "must've",
'neednt': "needn't",
'notve': "not've",
'oclock': "o'clock",
'oughtnt': "oughtn't",
"ow's'at": "'ow's'at",
"'ows'at": "'ow's'at",
"'ow'sat": "'ow's'at",
'shant': "shan't",
"shed've": "she'd've",
"she'dve": "she'd've",
"she's": "she's",
'shouldve': "should've",
'shouldnt': "shouldn't",
"shouldnt've": "shouldn't've",
"shouldn'tve": "shouldn't've",
"somebody'd": 'somebodyd',
"somebodyd've": "somebody'd've",
"somebody'dve": "somebody'd've",
'somebodyll': "somebody'll",
'somebodys': "somebody's",
'someoned': "someone'd",
"someoned've": "someone'd've",
"someone'dve": "someone'd've",
'someonell': "someone'll",
'someones': "someone's",
'somethingd': "something'd",
"somethingd've": "something'd've",
"something'dve": "something'd've",
'somethingll': "something'll",
'thats': "that's",
'thered': "there'd",
"thered've": "there'd've",
"there'dve": "there'd've",
'therere': "there're",
'theres': "there's",
'theyd': "they'd",
"theyd've": "they'd've",
"they'dve": "they'd've",
'theyll': "they'll",
'theyre': "they're",
'theyve': "they've",
'twas': "'twas",
'wasnt': "wasn't",
"wed've": "we'd've",
"we'dve": "we'd've",
'weve': "we've",
'werent': "weren't",
'whatll': "what'll",
'whatre': "what're",
'whats': "what's",
'whatve': "what've",
'whens': "when's",
'whered': "where'd",
'wheres': "where's",
'whereve': "where've",
'whod': "who'd",
"whod've": "who'd've",
"who'dve": "who'd've",
'wholl': "who'll",
'whos': "who's",
'whove': "who've",
'whyll': "why'll",
'whyre': "why're",
'whys': "why's",
'wont': "won't",
'wouldve': "would've",
'wouldnt': "wouldn't",
"wouldnt've": "wouldn't've",
"wouldn'tve": "wouldn't've",
'yall': "y'all",
"yall'll": "y'all'll",
"y'allll": "y'all'll",
"yall'd've": "y'all'd've",
"y'alld've": "y'all'd've",
"y'all'dve": "y'all'd've",
'youd': "you'd",
"youd've": "you'd've",
"you'dve": "you'd've",
'youll': "you'll",
'youre': "you're",
'youve': "you've",
}
NUMBER_MAP = {
'none': '0',
'zero': '0',
'one': '1',
'two': '2',
'three': '3',
'four': '4',
'five': '5',
'six': '6',
'seven': '7',
'eight': '8',
'nine': '9',
'ten': '10',
}
ARTICLES = ['a', 'an', 'the']
PERIOD_STRIP = re.compile(r'(?!<=\d)(\.)(?!\d)')
COMMA_STRIP = re.compile(r'(?<=\d)(\,)+(?=\d)')
PUNCTUATIONS = [
';',
r'/',
'[',
']',
'"',
'{',
'}',
'(',
')',
'=',
'+',
'\\',
'_',
'-',
'>',
'<',
'@',
'`',
',',
'?',
'!',
]
def __init__(self, *args, **kwargs):
pass
def word_tokenize(self, word):
word = word.lower()
word = word.replace(',', '').replace('?', '').replace("'s", " 's")
return word.strip()
def process_punctuation(self, in_text):
out_text = in_text
for p in self.PUNCTUATIONS:
if (p + ' ' in in_text or ' ' + p in in_text) or (
re.search(self.COMMA_STRIP, in_text) is not None
):
out_text = out_text.replace(p, '')
else:
out_text = out_text.replace(p, ' ')
out_text = self.PERIOD_STRIP.sub('', out_text, re.UNICODE)
return out_text
def process_digit_article(self, in_text):
out_text = []
temp_text = in_text.lower().split()
for word in temp_text:
word = self.NUMBER_MAP.setdefault(word, word)
if word not in self.ARTICLES:
out_text.append(word)
else:
pass
for word_id, word in enumerate(out_text):
if word in self.CONTRACTIONS:
out_text[word_id] = self.CONTRACTIONS[word]
out_text = ' '.join(out_text)
return out_text
def __call__(self, item):
item = self.word_tokenize(item)
item = item.replace('\n', ' ').replace('\t', ' ').strip()
item = self.process_punctuation(item)
item = self.process_digit_article(item)
return item
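    # Illustrative: EvalAIAnswerProcessor()('Two Dogs!') -> '2 dogs'
    # (punctuation is stripped, number words are mapped to digits, and articles are dropped).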
class TextVQAAccuracyEvaluator:
def __init__(self):
self.answer_processor = EvalAIAnswerProcessor()
def _compute_answer_scores(self, raw_answers):
"""
compute the accuracy (soft score) of human answers
"""
answers = [self.answer_processor(a) for a in raw_answers]
assert len(answers) == 10
gt_answers = list(enumerate(answers))
unique_answers = set(answers)
unique_answer_scores = {}
for unique_answer in unique_answers:
accs = []
for gt_answer in gt_answers:
other_answers = [item for item in gt_answers if item != gt_answer]
matching_answers = [
item for item in other_answers if item[1] == unique_answer
]
acc = min(1, float(len(matching_answers)) / 3)
accs.append(acc)
unique_answer_scores[unique_answer] = sum(accs) / len(accs)
return unique_answer_scores
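    # Soft-score example (illustrative): with 10 human answers, an answer given by exactly 3
    # annotators averages min(1, matches/3) over the ten leave-one-out subsets to 0.9, while
    # an answer given by a single annotator scores 0.3.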
def eval_pred_list(self, pred_list, disable_tqdm=False):
pred_scores = []
for entry in tqdm(pred_list, disable=disable_tqdm):
pred_answer = self.answer_processor(entry['pred_answer'])
unique_answer_scores = self._compute_answer_scores(entry['gt_answers'])
score = unique_answer_scores.get(pred_answer, 0.0)
pred_scores.append(score)
accuracy = sum(pred_scores) / len(pred_scores)
return accuracy
class STVQAAccuracyEvaluator:
def __init__(self):
self.answer_processor = EvalAIAnswerProcessor()
def eval_pred_list(self, pred_list):
pred_scores = []
for entry in pred_list:
pred_answer = self.answer_processor(entry['pred_answer'])
gts = [self.answer_processor(a) for a in entry['gt_answers']]
score = 1.0 if pred_answer in gts else 0.0
pred_scores.append(score)
accuracy = sum(pred_scores) / len(pred_scores)
return accuracy
class STVQAANLSEvaluator:
def __init__(self):
import editdistance # install with `pip install editdistance`
self.get_edit_distance = editdistance.eval
def get_anls(self, s1, s2):
s1 = s1.lower().strip()
s2 = s2.lower().strip()
iou = 1 - self.get_edit_distance(s1, s2) / max(len(s1), len(s2))
anls = iou if iou >= 0.5 else 0.0
return anls
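    # Illustrative: get_anls('hello', 'hallo') = 1 - 1/5 = 0.8 (>= 0.5, so it is kept),
    # while get_anls('12', '21') = 1 - 2/2 = 0.0 and is zeroed out by the threshold.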
def eval_pred_list(self, pred_list):
pred_scores = []
for entry in pred_list:
anls = max(
self.get_anls(entry['pred_answer'], gt) for gt in entry['gt_answers']
)
pred_scores.append(anls)
accuracy = sum(pred_scores) / len(pred_scores)
return accuracy
class TextCapsBleu4Evaluator:
def __init__(self):
# The following script requires Java 1.8.0 and pycocotools installed.
# The pycocoevalcap can be installed with pip as
# pip install git+https://github.com/ronghanghu/coco-caption.git@python23
# Original pycocoevalcap code is at https://github.com/tylin/coco-caption
# but has no python3 support yet.
try:
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
except ModuleNotFoundError:
print(
'Please install pycocoevalcap module using '
'pip install git+https://github.com/ronghanghu/coco-caption.git@python23' # noqa
)
raise
self.tokenizer = PTBTokenizer()
self.scorer = Bleu(4)
def eval_pred_list(self, pred_list):
# Create reference and hypotheses captions.
gts = {}
res = {}
for idx, entry in enumerate(pred_list):
gts[idx] = [{'caption': a} for a in entry['gt_answers']]
res[idx] = [{'caption': entry['pred_answer']}]
gts = self.tokenizer.tokenize(gts)
res = self.tokenizer.tokenize(res)
score, _ = self.scorer.compute_score(gts, res)
bleu4 = score[3] # score is (Bleu-1, Bleu-2, Bleu-3, Bleu-4)
return bleu4
set -x
CHECKPOINT=${1}
DATASET=${2}
CHECKPOINT="$(pwd)/${CHECKPOINT}"
export PYTHONPATH="$(pwd):${PYTHONPATH}"
echo "CHECKPOINT: ${CHECKPOINT}"
MASTER_PORT=${MASTER_PORT:-63669}
PORT=${PORT:-63665}
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
NODES=$((GPUS / GPUS_PER_NODE))
export MASTER_PORT=${MASTER_PORT}
export PORT=${PORT}
# Save original arguments
ARGS=("$@")
# Parse options
while [[ $# -gt 0 ]]; do
case "$1" in
--auto)
GPUS=1
shift
;;
*)
shift
;;
esac
done
echo "GPUS: ${GPUS}"
if [ ${DATASET} == "mme" ]; then
cd eval/mme/
DIRNAME=`basename ${CHECKPOINT}`
python eval.py --checkpoint ${CHECKPOINT} "${ARGS[@]:2}"
python calculation.py --results_dir ${DIRNAME}
cd ../../
fi
if [ ${DATASET} == "caption" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/caption/evaluate_caption.py --checkpoint ${CHECKPOINT} "${ARGS[@]:2}"
fi
if [ ${DATASET} == "caption-coco" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/caption/evaluate_caption.py --checkpoint ${CHECKPOINT} --datasets coco "${ARGS[@]:2}"
fi
if [ ${DATASET} == "caption-flickr30k" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/caption/evaluate_caption.py --checkpoint ${CHECKPOINT} --datasets flickr30k "${ARGS[@]:2}"
fi
if [ ${DATASET} == "caption-nocaps" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/caption/evaluate_caption.py --checkpoint ${CHECKPOINT} --datasets nocaps "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-okvqa-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets okvqa_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-textvqa-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets textvqa_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-textvqa-val-ocr" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets textvqa_val_ocr "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-vizwiz-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets vizwiz_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-vizwiz-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets vizwiz_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-vqav2-testdev" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets vqav2_testdev "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-ai2d-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets ai2diagram_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-vqav2-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets vqav2_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-gqa-testdev" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets gqa_testdev_llava "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-docvqa-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets docvqa_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-docvqa-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets docvqa_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-mpdocvqa-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mpdocvqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets mpdocvqa_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-mpdocvqa-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mpdocvqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets mpdocvqa_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-chartqa-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets chartqa_test_human,chartqa_test_augmented "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-infovqa-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets infographicsvqa_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-infovqa-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets infographicsvqa_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-chartqa-test-human" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets chartqa_test_human "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-chartqa-test-augmented" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets chartqa_test_augmented "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-ocrvqa-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets ocrvqa_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-ocrvqa-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets ocrvqa_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcoco" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcoco-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcoco_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcoco-testA" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcoco_testA "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcoco-testB" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcoco_testB "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcoco+-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcoco+_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcoco+-testA" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcoco+_testA "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcoco+-testB" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcoco+_testB "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcocog-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcocog_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcocog-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcocog_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "llava-bench" ]; then
rm -rf results/llava_bench_results_review.jsonl
python eval/llava_bench/evaluate_llava_bench.py --checkpoint ${CHECKPOINT} "${ARGS[@]:2}"
python -u eval/llava_bench/eval_gpt_review_bench.py \
--question data/llava-bench-in-the-wild/questions.jsonl \
--context data/llava-bench-in-the-wild/context.jsonl \
--rule eval/llava_bench/rule.json \
--answer-list \
data/llava-bench-in-the-wild/answers_gpt4.jsonl \
results/llava_bench_results.jsonl \
--output \
results/llava_bench_results_review.jsonl
python -u eval/llava_bench/summarize_gpt_review.py -f results/llava_bench_results_review.jsonl
fi
if [ ${DATASET} == "pope" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/pope/evaluate_pope.py --checkpoint ${CHECKPOINT} --datasets pope "${ARGS[@]:2}"
fi
if [ ${DATASET} == "tiny_lvlm" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/tiny_lvlm/evaluate_lvlm.py --checkpoint ${CHECKPOINT} --datasets updated_datasets "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmvet" ]; then
python eval/mmvet/evaluate_mmvet.py --checkpoint ${CHECKPOINT} --datasets mmvet "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmvetv2" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmvetv2/evaluate_mmvet_v2.py --checkpoint ${CHECKPOINT} --datasets mmvet-v2 "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmbench-dev-en" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmbench/evaluate_mmbench.py --checkpoint ${CHECKPOINT} --datasets mmbench_dev_20230712 "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmbench-dev-cn" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmbench/evaluate_mmbench.py --checkpoint ${CHECKPOINT} --datasets mmbench_dev_cn_20231003 "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmbench-test-en" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmbench/evaluate_mmbench.py --checkpoint ${CHECKPOINT} --datasets mmbench_test_en_20231003 "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmbench-test-cn" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmbench/evaluate_mmbench.py --checkpoint ${CHECKPOINT} --datasets mmbench_test_cn_20231003 "${ARGS[@]:2}"
fi
if [ ${DATASET} == "ccbench-dev" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmbench/evaluate_mmbench.py --checkpoint ${CHECKPOINT} --datasets ccbench_dev_cn "${ARGS[@]:2}"
fi
if [ ${DATASET} == "scienceqa" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/scienceqa/evaluate_scienceqa.py --checkpoint ${CHECKPOINT} --datasets sqa_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mantis" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mantis_eval/evaluate_mantis.py --checkpoint ${CHECKPOINT} --datasets Mantis-Eval "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mirb" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mirb/evaluate_mirb.py --checkpoint ${CHECKPOINT} --datasets MIRB "${ARGS[@]:2}"
fi
if [ ${DATASET} == "m3cot" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/scienceqa/evaluate_scienceqa.py --checkpoint ${CHECKPOINT} --datasets m3cot_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-dev" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmmu/evaluate_mmmu.py --checkpoint ${CHECKPOINT} --datasets MMMU_dev "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmmu/evaluate_mmmu.py --checkpoint ${CHECKPOINT} --datasets MMMU_validation "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmmu/evaluate_mmmu.py --checkpoint ${CHECKPOINT} --datasets MMMU_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-dev-cot" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmmu/evaluate_mmmu_cot.py --checkpoint ${CHECKPOINT} --datasets MMMU_dev "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-val-cot" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmmu/evaluate_mmmu_cot.py --checkpoint ${CHECKPOINT} --datasets MMMU_validation "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-test-cot" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmmu/evaluate_mmmu_cot.py --checkpoint ${CHECKPOINT} --datasets MMMU_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmvp" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmvp/evaluate_mmvp.py --checkpoint ${CHECKPOINT} --datasets MMVP "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mathvista-testmini" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mathvista/evaluate_mathvista.py --checkpoint ${CHECKPOINT} --datasets MathVista_testmini "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mathvista-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mathvista/evaluate_mathvista.py --checkpoint ${CHECKPOINT} --datasets MathVista_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "seed" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/seed/evaluate_seed.py --checkpoint ${CHECKPOINT} --datasets SEEDv1 "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mvbench" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mvbench/evaluate_mvbench.py --checkpoint ${CHECKPOINT} --num_segments 16 "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmiu" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmiu/evaluate_mmiu.py --checkpoint ${CHECKPOINT} "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmhal" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmhal/evaluate_mmhal.py --checkpoint ${CHECKPOINT} "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-pro" ]; then
python -u eval/mmmu_pro/evaluate_mmmu_pro.py --model ${CHECKPOINT} --mode direct --setting "standard (10 options)" "${ARGS[@]:2}"
python -u eval/mmmu_pro/evaluate_mmmu_pro.py --model ${CHECKPOINT} --mode cot --setting "standard (10 options)" "${ARGS[@]:2}"
python -u eval/mmmu_pro/evaluate_mmmu_pro.py --model ${CHECKPOINT} --mode direct --setting vision "${ARGS[@]:2}"
python -u eval/mmmu_pro/evaluate_mmmu_pro.py --model ${CHECKPOINT} --mode cot --setting vision "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-pro-std10" ]; then
python -u eval/mmmu_pro/evaluate_mmmu_pro.py --model ${CHECKPOINT} --mode direct --setting "standard (10 options)" "${ARGS[@]:2}"
python -u eval/mmmu_pro/evaluate_mmmu_pro.py --model ${CHECKPOINT} --mode cot --setting "standard (10 options)" "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-pro-vision" ]; then
python -u eval/mmmu_pro/evaluate_mmmu_pro.py --model ${CHECKPOINT} --mode direct --setting vision "${ARGS[@]:2}"
python -u eval/mmmu_pro/evaluate_mmmu_pro.py --model ${CHECKPOINT} --mode cot --setting vision "${ARGS[@]:2}"
fi
if [ ${DATASET} == "drivelm" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/domain_specific/drivelm/evaluate.py --checkpoint ${CHECKPOINT} --datasets DriveLM_val --dynamic --max-num 12
fi
if [ ${DATASET} == "mme—realworld" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/domain_specific/mme_rw/evaluate.py --checkpoint ${CHECKPOINT} --datasets MME_RealWorld "${ARGS[@]:2}"
fi
if [ ${DATASET} == "dior-rsvg" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/domain_specific/rs_det/evaluate.py --checkpoint ${CHECKPOINT} --datasets DIOR_RSVG "${ARGS[@]:2}"
fi
if [ ${DATASET} == "rsvqa-lr" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
    eval/domain_specific/rs_vqa/evaluate.py --checkpoint ${CHECKPOINT} --datasets RSVQA_L "${ARGS[@]:2}"
fi
if [ ${DATASET} == "rsvqa-hr-test1" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/domain_specific/rs_vqa/evaluate.py --checkpoint ${CHECKPOINT} --datasets RSVQA_H_TEST1 "${ARGS[@]:2}"
fi
if [ ${DATASET} == "rsvqa-hr-test2" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
    eval/domain_specific/rs_vqa/evaluate.py --checkpoint ${CHECKPOINT} --datasets RSVQA_H_TEST2 "${ARGS[@]:2}"
fi