import argparse
import itertools
import json
import os
import random
import re
import time
from functools import partial
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from torchvision.ops.boxes import box_area
from tqdm import tqdm
ds_collections = {
'refcoco_val': 'data/refcoco/refcoco_val.jsonl',
'refcoco_testA': 'data/refcoco/refcoco_testA.jsonl',
'refcoco_testB': 'data/refcoco/refcoco_testB.jsonl',
'refcoco+_val': 'data/refcoco/refcoco+_val.jsonl',
'refcoco+_testA': 'data/refcoco/refcoco+_testA.jsonl',
'refcoco+_testB': 'data/refcoco/refcoco+_testB.jsonl',
'refcocog_val': 'data/refcoco/refcocog_val.jsonl',
'refcocog_test': 'data/refcoco/refcocog_test.jsonl',
}
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
area2 = box_area(boxes2)
lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
wh = (rb - lt).clamp(min=0) # [N,M,2]
inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
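# Illustrative usage of box_iou (not part of the evaluation flow): pairwise IoU
# between one predicted box and one ground-truth box, both in absolute xyxy pixels.
#   pred = torch.tensor([[10., 10., 110., 110.]])
#   gt = torch.tensor([[20., 20., 120., 120.]])
#   iou, union = box_iou(pred, gt)  # iou.shape == (1, 1), iou.item() ~ 0.68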
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
texts = [_['text'] for _ in batches]
bboxes = [_['bbox'] for _ in batches]
hws = [_['hw'] for _ in batches]
return pixel_values, texts, bboxes, hws
class RefCOCODataset(torch.utils.data.Dataset):
def __init__(self, test, prompt, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
self.datas = open(test).readlines()
self.prompt = prompt
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.datas)
def __getitem__(self, idx):
data = json.loads(self.datas[idx].strip())
image = data['image']
text = data['sent']
bbox = data['bbox']
w, h = data['width'], data['height']
image = Image.open(image).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return {
'text': self.prompt.format(text),
'pixel_values': pixel_values,
'bbox': bbox,
'hw': (h, w),
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
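# Sharding example (illustrative): with total_size=10 and world_size=4, the shard sizes
# are [3, 3, 2, 2], so rank 0 gets indices 0-2, rank 1 gets 3-5, rank 2 gets 6-7, and
# rank 3 gets 8-9; every sample is evaluated exactly once across ranks.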
def evaluate_chat_model():
print('prompt:', prompt)
random.seed(args.seed)
summaries = []
for ds_name in args.datasets:
dataset = RefCOCODataset(
test=ds_collections[ds_name],
prompt=prompt,
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for _, (pixel_values, questions, bboxes, hws) in enumerate(tqdm(dataloader)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=100,
min_new_tokens=1,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config,
verbose=True
)
answers = [pred]
for bbox, hw, answer in zip(bboxes, hws, answers):
outputs.append({
'answer': answer,
'gt_bbox': bbox,
'hw': hw,
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, outputs)
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.json'
results_file = os.path.join(args.out_dir, results_file)
json.dump(merged_outputs, open(results_file, 'w'))
correct = total_cnt = 0
for i, output in enumerate(merged_outputs):
predict_bbox = re.findall(PATTERN, output['answer'])
try:
predict_bbox = (float(predict_bbox[0][0]), float(predict_bbox[0][1]), float(predict_bbox[0][2]),
float(predict_bbox[0][3]))
                except Exception:
predict_bbox = (0., 0., 0., 0.)
target_bbox = torch.tensor(output['gt_bbox'],
dtype=torch.float32).view(-1, 4)
predict_bbox = torch.tensor(predict_bbox,
dtype=torch.float32).view(-1, 4)
if predict_bbox.sum() >= 4:
predict_bbox = predict_bbox / 1000
predict_bbox[:, 0::2] *= output['hw'][1]
predict_bbox[:, 1::2] *= output['hw'][0]
iou, _ = box_iou(predict_bbox, target_bbox)
iou = iou.item()
total_cnt += 1
if iou >= 0.5:
correct += 1
print(f'Evaluating {ds_name} ...')
print(f'Precision @ 1: {correct / total_cnt} \n')
summaries.append([args.checkpoint, ds_name, f'Precision @ 1: {correct / total_cnt} \n'])
torch.distributed.barrier()
out_path = '_'.join(args.checkpoint.split('/')[-2:])
writer = open(os.path.join(args.out_dir, f'{out_path}.txt'), 'a')
print(f"write results to file {os.path.join(args.out_dir, f'{out_path}.txt')}")
for summary in summaries:
print(summary)
writer.write(f'{summary}\n')
writer.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='refcoco_val,refcoco_testA,refcoco_testB,'
'refcoco+_val,refcoco+_testA,refcoco+_testB,'
'refcocog_val,refcocog_test')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--out-dir', type=str, default='results')
    parser.add_argument('--sample', action='store_true')
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
PATTERN = re.compile(r'\[*\[(.*?),(.*?),(.*?),(.*?)\]\]*')
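    # The model is expected to answer with a bracketed box such as '[[120, 340, 560, 780]]';
    # PATTERN captures the four numbers, which the evaluation loop treats as 0-1000-normalized
    # coordinates and rescales to pixel units using the stored image height/width.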
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
prompt = 'Please provide the bounding box coordinate of the region this sentence describes: <ref>{}</ref>'
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
print(f'[test] max_num: {args.max_num}')
evaluate_chat_model()
# README for Evaluation
## 🌟 Overview
This script provides an evaluation pipeline for `ScienceQA`.
## 🗂️ Data Preparation
Before starting to download the data, please create the `InternVL/internvl_chat/data` folder.
### ScienceQA
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/scienceqa/images && cd data/scienceqa/images
# Step 2: Download images
wget https://scienceqa.s3.us-west-1.amazonaws.com/images/test.zip && unzip test.zip
cd ..
# Step 3: Download original questions
wget https://raw.githubusercontent.com/lupantech/ScienceQA/main/data/scienceqa/problems.json
# Step 4: Download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/scienceqa/scienceqa_test_img.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/scienceqa
├── images
├── problems.json
└── scienceqa_test_img.jsonl
```
## 🏃 Evaluation Execution
> ⚠️ Note: For testing InternVL (1.5, 2.0, 2.5, and later versions), always enable `--dynamic` to perform dynamic resolution testing.
To run the evaluation, execute the following command on an 8-GPU setup:
```shell
torchrun --nproc_per_node=8 eval/scienceqa/evaluate_scienceqa.py --checkpoint ${CHECKPOINT} --dynamic
```
Alternatively, you can run the following simplified command:
```shell
GPUS=8 sh evaluate.sh ${CHECKPOINT} scienceqa --dynamic
```
### Arguments
The following arguments can be configured for the evaluation script:
| Argument | Type | Default | Description |
| ---------------- | ------ | ------------ | ----------------------------------------------------------------------------------------------------------------- |
| `--checkpoint` | `str` | `''` | Path to the model checkpoint. |
| `--datasets` | `str` | `'sqa_test'` | Comma-separated list of datasets to evaluate. |
| `--dynamic` | `flag` | `False` | Enables dynamic high resolution preprocessing. |
| `--max-num` | `int` | `6` | Maximum tile number for dynamic high resolution. |
| `--load-in-8bit` | `flag` | `False` | Loads the model weights in 8-bit precision. |
| `--auto` | `flag` | `False` | Automatically splits a large model across 8 GPUs when needed, useful for models too large to fit on a single GPU. |
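In addition to the arguments listed above, the evaluation script defines a `--cot` flag that switches to chain-of-thought prompting and extracts the letter following `Final answer:` from the response. An illustrative invocation (using the same checkpoint variable as above):
```shell
# Illustrative: ScienceQA evaluation with chain-of-thought prompting
torchrun --nproc_per_node=8 eval/scienceqa/evaluate_scienceqa.py --checkpoint ${CHECKPOINT} --dynamic --cot
```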
import argparse
import itertools
import json
import os
import random
import re
import time
from functools import partial
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from tqdm import tqdm
ds_collections = {
'sqa_test': {
'root': 'data/scienceqa/scienceqa_test_img.jsonl',
'max_new_tokens': 100,
'min_new_tokens': 1,
},
'm3cot_test': {
'root': 'data/M3CoT/test.jsonl',
'max_new_tokens': 100,
'min_new_tokens': 1,
},
}
COT_INSTRUCTION = (
'Your task is to answer the question below. '
"Give step by step reasoning before you answer, and when you're ready to answer, "
"please use the format \"Final answer: ..\""
'\n\n'
'Question:'
'\n\n'
'{question}'
)
def extract_answer(text):
match = re.search(r'(Final answer:|Answer:)\s*(.*)', text, re.IGNORECASE)
if match:
return match.group(2).strip()
return text
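# Illustrative: extract_answer('The area is largest in panel 2.\nFinal answer: B') returns 'B';
# when no 'Final answer:'/'Answer:' marker is present, the full text is returned unchanged.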
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
questions = [_['question'] for _ in batches]
answers = [_['answer'] for _ in batches]
image_paths = [_['image_path'] for _ in batches]
options = [_['option'] for _ in batches]
return pixel_values, questions, answers, image_paths, options
class ScienceQADataset(torch.utils.data.Dataset):
def __init__(self, root, prompt, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
f = open(root, 'r', encoding='utf-8')
self.data = [json.loads(line) for line in f.readlines()]
self.prompt = prompt
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
data = self.data[idx]
image_path = data['image']
hint = data['hint'] if data['hint'] else None
question = data['question']
choices = data['choices']
answer = data['answer']
choice_list = []
options = {}
multiple_choices = ['A', 'B', 'C', 'D', 'E']
for i, c in enumerate(choices):
choice_list.append('{}. {}'.format(multiple_choices[i], c))
options[multiple_choices[i]] = c
choice_txt = '\n'.join(choice_list)
image = Image.open(image_path).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
if hint is not None:
question = hint + '\n' + question
question += '\n' + choice_txt
question += '\n' + self.prompt
return {
'question': question.strip(),
'pixel_values': pixel_values,
'answer': multiple_choices[answer],
'image_path': image_path,
'option': options
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
def post_process(pred, option):
pred = pred.strip()
option_candidate = list(option.keys())
if len(pred) == 1:
return pred
elif len(pred) > 1 and pred[0] in option_candidate:
return pred[0]
elif len(pred) > 1 and pred[0] not in option_candidate:
for k, v in option.items():
if v in pred:
return k
if len(pred) > 1 and pred[1] == '.':
pred = pred[0]
if len(pred) > 1 and pred[0] == '(' and pred[2] == ')':
pred = pred[1]
return pred
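# Illustrative: with option = {'A': 'magnet', 'B': 'copper'}, post_process('B. copper', option)
# returns 'B' (leading option letter), and post_process('B', option) returns 'B' directly.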
def evaluate_chat_model():
prompt = '' if args.cot else "Answer with the option's letter from the given choices directly."
random.seed(args.seed)
for ds_name in args.datasets:
dataset = ScienceQADataset(
root=ds_collections[ds_name]['root'],
prompt=prompt,
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
        for _, (pixel_values, questions, answers, image_paths, options) in enumerate(tqdm(dataloader)):
if args.cot:
questions = [COT_INSTRUCTION.format(question=q) for q in questions]
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'] if not args.cot else 4096,
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config,
verbose=True
)
pred_orig = pred
if args.cot:
pred = extract_answer(pred).strip()
preds = [post_process(pred, options[0])]
for question, pred, answer, image_path in zip(questions, preds, answers, image_paths):
outputs.append({
'question': question,
'answer': pred,
'answer_orig': pred_orig,
'gt_answers': answer,
'image_path': image_path
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.jsonl'
output_path = os.path.join(args.out_dir, results_file)
with open(output_path, 'w') as f:
for output in merged_outputs:
f.write(json.dumps(output) + '\n')
print('Results saved to {}'.format(output_path))
cnt = 0
for item in merged_outputs:
if item['answer'] == item['gt_answers']:
cnt += 1
print(f'Acc@1: {cnt / len(merged_outputs)}')
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='sqa_test')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
parser.add_argument('--cot', action='store_true')
args = parser.parse_args()
model_name = '_'.join(args.checkpoint.split('/')[-2:])
model_name = f'{model_name}_cot' if args.cot else model_name
args.out_dir = os.path.join(args.out_dir, model_name)
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
evaluate_chat_model()
# README for Evaluation
## 🌟 Overview
This script provides an evaluation pipeline for `SEED-Image`.
## 🗂️ Data Preparation
Before starting to download the data, please create the `InternVL/internvl_chat/data` folder.
### SEED-Image
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/SEED && cd data/SEED
# Step 2: Download the dataset
wget https://huggingface.co/OpenGVLab/InternVL/resolve/main/SEED-Bench-image.zip
unzip SEED-Bench-image.zip
wget https://huggingface.co/OpenGVLab/InternVL/resolve/main/seed.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/SEED
├── SEED-Bench-image
└── seed.jsonl
```
## 🏃 Evaluation Execution
> ⚠️ Note: For testing InternVL (1.5, 2.0, 2.5, and later versions), always enable `--dynamic` to perform dynamic resolution testing.
To run the evaluation, execute the following command on an 8-GPU setup:
```shell
torchrun --nproc_per_node=8 eval/seed/evaluate_seed.py --checkpoint ${CHECKPOINT} --dynamic
```
Alternatively, you can run the following simplified command:
```shell
GPUS=8 sh evaluate.sh ${CHECKPOINT} seed --dynamic
```
### Arguments
The following arguments can be configured for the evaluation script:
| Argument | Type | Default | Description |
| ---------------- | ------ | ---------- | ----------------------------------------------------------------------------------------------------------------- |
| `--checkpoint` | `str` | `''` | Path to the model checkpoint. |
| `--datasets` | `str` | `'SEEDv1'` | Comma-separated list of datasets to evaluate. |
| `--dynamic` | `flag` | `False` | Enables dynamic high resolution preprocessing. |
| `--max-num` | `int` | `6` | Maximum tile number for dynamic high resolution. |
| `--load-in-8bit` | `flag` | `False` | Loads the model weights in 8-bit precision. |
| `--auto` | `flag` | `False` | Automatically splits a large model across 8 GPUs when needed, useful for models too large to fit on a single GPU. |
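The memory-related flags can be combined with the standard command above; for example, an illustrative run that loads the weights in 8-bit precision:
```shell
# Illustrative: SEED-Image evaluation with 8-bit weights to reduce GPU memory usage
torchrun --nproc_per_node=8 eval/seed/evaluate_seed.py --checkpoint ${CHECKPOINT} --dynamic --load-in-8bit
```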
import argparse
import json
import os
parser = argparse.ArgumentParser()
parser.add_argument('--image_result_file', type=str, default='')
parser.add_argument('--anno_path', type=str, default='data/SEED/SEED-Bench.json')
args = parser.parse_args()
image_result_file = args.image_result_file
anno_path = args.anno_path
assert image_result_file.endswith('.jsonl')
def is_integer_string(s):
try:
int(s)
return True
except ValueError:
return False
def filter_questions(data, task='all'):
if task == 'image':
return [q for q in data if 1 <= q['question_type_id'] <= 9]
elif task == 'video':
return [q for q in data if 10 <= q['question_type_id'] <= 12]
elif task == 'all':
return data
elif is_integer_string(task):
return [q for q in data if q['question_type_id'] == int(task)]
else:
raise ValueError(f'Invalid task: {task}')
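# Illustrative: filter_questions(data, 'image') keeps entries with question_type_id 1-9,
# filter_questions(data, 'video') keeps 10-12, and filter_questions(data, '3') keeps only type 3.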
if __name__ == '__main__':
qa_anno = json.load(open(anno_path, 'rb'))
if 'questions' in qa_anno.keys():
question_type = qa_anno['question_type']
question_id_type = {v: k for k, v in question_type.items()}
qa_anno = qa_anno['questions']
qa_anno = filter_questions(qa_anno, 'all')
print(f'length: {len(qa_anno)}')
with open(image_result_file, 'r') as f:
image_result = [json.loads(line) for line in f.readlines()]
results = []
results.extend(image_result)
qa_id_anno = {}
for item in qa_anno:
question_id = str(item['question_id'])
qa_id_anno[question_id] = item
type_counts = {k: [] for k, v in question_id_type.items()}
for item in results:
pred, gt, question_id = item['prediction'], item['answer'], item['question_id']
question_id = str(question_id)
question_type = qa_id_anno[question_id]['question_type_id']
data_type = qa_id_anno[question_id]['data_type']
gt = qa_id_anno[question_id]['answer']
if len(pred) != 1:
pred = pred[0]
if pred == gt:
type_counts[question_type].append(1)
else:
type_counts[question_type].append(0)
print('Accuracy for each data type:')
total_count, image_count, video_count = 0, 0, 0
total_correct, image_correct, video_correct = 0, 0, 0
for data_type_id, result in type_counts.items():
accuracy = sum(result) / len(result) * 100
data_type = question_id_type[data_type_id]
print(f'Data type {data_type}: {accuracy:.2f}%')
total_count += len(result)
total_correct += sum(result)
if data_type_id >= 1 and data_type_id <= 9:
image_count += len(result)
image_correct += sum(result)
else:
video_count += len(result)
video_correct += sum(result)
total_accuracy = total_correct / total_count * 100
image_accuracy = image_correct / image_count * 100
video_accuracy = video_correct / video_count * 100
print(f'Total accuracy: {total_accuracy:.2f}%')
print(f'Image accuracy: {image_accuracy:.2f}%')
print(f'Video accuracy: {video_accuracy:.2f}%')
import argparse
import itertools
import json
import os
import random
import time
from functools import partial
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from tqdm import tqdm
ds_collections = {
'SEEDv1': {
'root': 'data/SEED/',
'annotation': 'data/SEED/seed.jsonl',
'max_new_tokens': 100,
'min_new_tokens': 1,
},
}
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
questions = [_['question'] for _ in batches]
answers = [_['answer'] for _ in batches]
indexes = [_['index'] for _ in batches]
return pixel_values, questions, answers, indexes
class MultipleChoiceDataset(torch.utils.data.Dataset):
def __init__(self, root, annotation, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
f = open(annotation, 'r', encoding='utf-8')
self.data = [json.loads(line) for line in f.readlines()]
self.root = root
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
data = self.data[idx]
question = data['text']
image_path = os.path.join(self.root, data['image'])
image = Image.open(image_path).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
answer = data['answer'] if 'answer' in data else None
return {
'question': question,
'pixel_values': pixel_values,
'answer': answer,
'index': data['question_id'],
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
def post_process(pred, option):
pred = pred.strip()
option_candidate = list(option.keys())
if len(pred) == 1:
return pred
elif len(pred) != 1 and pred[0] in option_candidate:
return pred[0]
elif len(pred) != 1 and pred[0] not in option_candidate:
for k, v in option.items():
if v in pred:
return k
return pred
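# Illustrative: with option = {'A': 'a cat', 'B': 'a dog'}, post_process('a dog', option)
# returns 'B' by matching the option text, since 'a' is not one of the option letters.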
def evaluate_chat_model():
random.seed(args.seed)
for ds_name in args.datasets:
dataset = MultipleChoiceDataset(
root=ds_collections[ds_name]['root'],
annotation=ds_collections[ds_name]['annotation'],
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for _, (pixel_values, questions, answers, indexes) in enumerate(tqdm(dataloader)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config,
verbose=True
)
preds = [pred]
for question, pred, answer, index in zip(questions, preds, answers, indexes):
outputs.append({
'question_id': index,
'question': question,
'prediction': pred,
'answer': answer,
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.jsonl'
output_path = os.path.join(args.out_dir, results_file)
writer = open(output_path, 'w')
results = []
for item in merged_outputs:
writer.write(json.dumps(item) + '\n')
answer = item['answer']
prediction = item['prediction']
if prediction == answer:
results.append(1)
else:
results.append(0)
writer.close()
print('Results saved to {}'.format(output_path))
print(f'Acc@1: {sum(results) / len(results)}')
cmd = f'python eval/seed/calculation.py --image_result_file {output_path}'
os.system(cmd)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='SEEDv1')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
evaluate_chat_model()
# README for Evaluation
## 🌟 Overview
This script provides an evaluation pipeline for `Tiny-LVLM-eHub`.
## 🗂️ Data Preparation
Before starting to download the data, please create the `InternVL/internvl_chat/data` folder.
### Tiny-LVLM-eHub
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/tiny_lvlm && cd data/tiny_lvlm
# Step 2: Download the dataset
wget https://huggingface.co/OpenGVLab/InternVL/resolve/main/updated_datasets.zip
unzip updated_datasets.zip
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/tiny_lvlm
└── updated_datasets
```
## 🏃 Evaluation Execution
> ⚠️ Note: For testing InternVL (1.5, 2.0, 2.5, and later versions), always enable `--dynamic` to perform dynamic resolution testing.
To run the evaluation, execute the following command on an 8-GPU setup:
```shell
torchrun --nproc_per_node=8 eval/tiny_lvlm/evaluate_lvlm.py --checkpoint ${CHECKPOINT} --dynamic
```
Alternatively, you can run the following simplified command:
```shell
GPUS=8 sh evaluate.sh ${CHECKPOINT} tiny_lvlm --dynamic
```
### Arguments
The following arguments can be configured for the evaluation script:
| Argument | Type | Default | Description |
| ---------------- | ------ | -------------------- | ----------------------------------------------------------------------------------------------------------------- |
| `--checkpoint` | `str` | `''` | Path to the model checkpoint. |
| `--datasets` | `str` | `'updated_datasets'` | Comma-separated list of datasets to evaluate. |
| `--dynamic` | `flag` | `False` | Enables dynamic high resolution preprocessing. |
| `--max-num` | `int` | `6` | Maximum tile number for dynamic high resolution. |
| `--load-in-8bit` | `flag` | `False` | Loads the model weights in 8-bit precision. |
| `--auto` | `flag` | `False` | Automatically splits a large model across 8 GPUs when needed, useful for models too large to fit on a single GPU. |
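The dynamic-resolution tile budget can be raised beyond the default of 6 when needed; an illustrative invocation:
```shell
# Illustrative: raise the dynamic high-resolution tile limit from 6 to 12
torchrun --nproc_per_node=8 eval/tiny_lvlm/evaluate_lvlm.py --checkpoint ${CHECKPOINT} --dynamic --max-num 12
```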
import argparse
import json
try:
from .tools import VQAEval
except ImportError:
from tools import VQAEval
def parse_args():
parser = argparse.ArgumentParser(description='Demo')
parser.add_argument('--file-path', type=str, default='results/updated_datasets_231221114523.json')
args = parser.parse_args()
return args
def main(args):
data = json.loads(open(args.file_path).read())
overall_score = 0
results = {}
dataset_names = ['Visual_Reasoning', 'Visual_Perception', 'Visual_Knowledge_Acquisition',
'Visual_Commonsense', 'Object_Hallucination']
for item in data:
task_type = item['image_path'].split('/')[-2]
assert task_type in dataset_names
if task_type in results:
results[task_type].append(item)
else:
results[task_type] = [item]
for k, v in results.items():
eval = VQAEval()
correct = 0
num = 0
for i in range(len(v)):
gt_answers = v[i]['gt_answers']
answer = v[i]['answer']
if eval.evaluate(answer, gt_answers) == 1:
correct += 1
num += 1
overall_score += float(correct) / num
print(f'{k}: {float(correct) / num}')
print(f'Overall: {overall_score}')
if __name__ == '__main__':
args = parse_args()
main(args)
import argparse
import itertools
import json
import os
import random
import time
from functools import partial
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from tqdm import tqdm
ds_collections = {
'updated_datasets': {
'root': 'data/tiny_lvlm/updated_datasets/',
'max_new_tokens': 30,
'min_new_tokens': 1,
}
}
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
questions = [_['question'] for _ in batches]
annotations = [_['annotation'] for _ in batches]
image_paths = [_['image_path'] for _ in batches]
return pixel_values, questions, annotations, image_paths
class VQADataset(torch.utils.data.Dataset):
def __init__(self, root, prompt, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
dirnames = [os.path.join(root, item) for item in os.listdir(root)]
dirnames = [item for item in dirnames if os.path.exists(os.path.join(item, 'dataset.json'))]
        dirnames = sorted(dirnames)
self.roots = []
self.items = []
for item in dirnames:
data_path = os.path.join(item, 'dataset.json')
data = json.loads(open(data_path).read())
for data_line in data:
self.roots.append(item)
self.items.append(data_line)
self.prompt = prompt
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.items)
def __getitem__(self, idx):
root = self.roots[idx]
item = self.items[idx]
image_path, question, annotation = item['image_path'], item['question'], item['gt_answers']
image_path = os.path.join(root, image_path)
image = Image.open(image_path).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
question = question + ' ' + self.prompt
return {
'question': question,
'pixel_values': pixel_values,
'annotation': annotation,
'image_path': image_path,
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
def evaluate_chat_model():
prompt = 'Answer the question using a single word or phrase.'
random.seed(args.seed)
for ds_name in args.datasets:
dataset = VQADataset(
root=ds_collections[ds_name]['root'],
prompt=prompt,
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
        for _, (pixel_values, questions, annotations, image_paths) in enumerate(tqdm(dataloader)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config,
verbose=True
)
answers = [pred]
for question, answer, annotation, image_path in zip(questions, answers, annotations, image_paths):
task_type = image_path.split('/')[-2]
outputs.append({
'question': question,
'answer': answer,
'gt_answers': annotation,
'image_path': image_path,
'task_type': task_type
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.json'
results_file = os.path.join(args.out_dir, results_file)
json.dump(merged_outputs, open(results_file, 'w'))
print('Results saved to {}'.format(results_file))
cmd = 'python eval/tiny_lvlm/calculate_score.py ' \
'--file-path ' + results_file
print(cmd)
os.system(cmd)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='updated_datasets')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
evaluate_chat_model()
# https://github.com/OpenGVLab/Multi-Modality-Arena/blob/main/tiny_lvlm_evaluation/utils/tools.py
import re
def remove_special_chars(s):
pattern = r'[^a-zA-Z0-9\s]'
s = re.sub(pattern, '', s)
return s
def has_word(sentence, word):
pattern = r'\b' + re.escape(word) + r'\b'
match = re.search(pattern, sentence)
if match:
return True
else:
return False
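# Illustrative: has_word('a red car', 'red') is True, while has_word('a redder car', 'red')
# is False because the word-boundary pattern requires 'red' to appear as a whole word.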
class VQAEval:
def __init__(self):
self.contractions = {
'aint': "ain't",
'arent': "aren't",
'cant': "can't",
'couldve': "could've",
'couldnt': "couldn't",
"couldn'tve": "couldn't've",
"couldnt've": "couldn't've",
'didnt': "didn't",
'doesnt': "doesn't",
'dont': "don't",
'hadnt': "hadn't",
"hadnt've": "hadn't've",
"hadn'tve": "hadn't've",
'hasnt': "hasn't",
'havent': "haven't",
'hed': "he'd",
"hed've": "he'd've",
"he'dve": "he'd've",
'hes': "he's",
'howd': "how'd",
'howll': "how'll",
'hows': "how's",
"Id've": "I'd've",
"I'dve": "I'd've",
'Im': "I'm",
'Ive': "I've",
'isnt': "isn't",
'itd': "it'd",
"itd've": "it'd've",
"it'dve": "it'd've",
'itll': "it'll",
"let's": "let's",
'maam': "ma'am",
'mightnt': "mightn't",
"mightnt've": "mightn't've",
"mightn'tve": "mightn't've",
'mightve': "might've",
'mustnt': "mustn't",
'mustve': "must've",
'neednt': "needn't",
'notve': "not've",
'oclock': "o'clock",
'oughtnt': "oughtn't",
"ow's'at": "'ow's'at",
"'ows'at": "'ow's'at",
"'ow'sat": "'ow's'at",
'shant': "shan't",
"shed've": "she'd've",
"she'dve": "she'd've",
"she's": "she's",
'shouldve': "should've",
'shouldnt': "shouldn't",
"shouldnt've": "shouldn't've",
"shouldn'tve": "shouldn't've",
"somebody'd": 'somebodyd',
"somebodyd've": "somebody'd've",
"somebody'dve": "somebody'd've",
'somebodyll': "somebody'll",
'somebodys': "somebody's",
'someoned': "someone'd",
"someoned've": "someone'd've",
"someone'dve": "someone'd've",
'someonell': "someone'll",
'someones': "someone's",
'somethingd': "something'd",
"somethingd've": "something'd've",
"something'dve": "something'd've",
'somethingll': "something'll",
'thats': "that's",
'thered': "there'd",
"thered've": "there'd've",
"there'dve": "there'd've",
'therere': "there're",
'theres': "there's",
'theyd': "they'd",
"theyd've": "they'd've",
"they'dve": "they'd've",
'theyll': "they'll",
'theyre': "they're",
'theyve': "they've",
'twas': "'twas",
'wasnt': "wasn't",
"wed've": "we'd've",
"we'dve": "we'd've",
'weve': "we've",
'werent': "weren't",
'whatll': "what'll",
'whatre': "what're",
'whats': "what's",
'whatve': "what've",
'whens': "when's",
'whered': "where'd",
'wheres': "where's",
'whereve': "where've",
'whod': "who'd",
"whod've": "who'd've",
"who'dve": "who'd've",
'wholl': "who'll",
'whos': "who's",
'whove': "who've",
'whyll': "why'll",
'whyre': "why're",
'whys': "why's",
'wont': "won't",
'wouldve': "would've",
'wouldnt': "wouldn't",
"wouldnt've": "wouldn't've",
"wouldn'tve": "wouldn't've",
'yall': "y'all",
"yall'll": "y'all'll",
"y'allll": "y'all'll",
"yall'd've": "y'all'd've",
"y'alld've": "y'all'd've",
"y'all'dve": "y'all'd've",
'youd': "you'd",
"youd've": "you'd've",
"you'dve": "you'd've",
'youll': "you'll",
'youre': "you're",
'youve': "you've",
}
self.manualMap = {
'none': '0',
'zero': '0',
'one': '1',
'two': '2',
'three': '3',
'four': '4',
'five': '5',
'six': '6',
'seven': '7',
'eight': '8',
'nine': '9',
'ten': '10',
}
self.articles = ['a', 'an', 'the']
self.periodStrip = re.compile('(?!<=\d)(\.)(?!\d)')
self.commaStrip = re.compile('(\d)(\,)(\d)')
self.punct = [
';',
r'/',
'[',
']',
'"',
'{',
'}',
'(',
')',
'=',
'+',
'\\',
'_',
'-',
'>',
'<',
'@',
'`',
',',
'?',
'!',
]
def evaluate(self, answer, gt_answers):
answer = answer.replace('\n', ' ')
answer = answer.replace('\t', ' ')
answer = answer.strip()
answer = self.processPunctuation(answer)
answer = self.processDigitArticle(answer)
if type(gt_answers) == list:
for i in range(len(gt_answers)):
gt_answers[i] = gt_answers[i].replace('\n', ' ')
gt_answers[i] = gt_answers[i].replace('\t', ' ')
gt_answers[i] = gt_answers[i].strip()
gt_answers[i] = self.processPunctuation(gt_answers[i])
gt_answers[i] = self.processDigitArticle(gt_answers[i])
if has_word(answer, gt_answers[i]):
return 1
return 0
else:
gt_answers = gt_answers.replace('\n', ' ')
gt_answers = gt_answers.replace('\t', ' ')
gt_answers = gt_answers.strip()
gt_answers = self.processPunctuation(gt_answers)
gt_answers = self.processDigitArticle(gt_answers)
if has_word(answer, gt_answers):
return 1
else:
return 0
def evaluate_MRR(self, answer, gt_answers):
answer = answer.replace('\n', ' ')
answer = answer.replace('\t', ' ')
answer = answer.strip()
answer = self.processPunctuation(answer)
answer = self.processDigitArticle(answer)
if type(gt_answers) is str:
gt_answers = [gt_answers]
for i in range(len(gt_answers)):
gt_answers[i] = gt_answers[i].replace('\n', ' ')
gt_answers[i] = gt_answers[i].replace('\t', ' ')
gt_answers[i] = gt_answers[i].strip()
gt_answers[i] = self.processPunctuation(gt_answers[i])
gt_answers[i] = self.processDigitArticle(gt_answers[i])
if has_word(answer, gt_answers[i]):
return 1 / (i + 1)
return 0.0
def processPunctuation(self, inText):
outText = inText
for p in self.punct:
if (p + ' ' in inText or ' ' + p in inText) or (
re.search(self.commaStrip, inText) is not None
):
outText = outText.replace(p, '')
else:
outText = outText.replace(p, ' ')
outText = self.periodStrip.sub('', outText, re.UNICODE)
return outText
def processDigitArticle(self, inText):
outText = []
tempText = inText.lower().split()
for word in tempText:
word = self.manualMap.setdefault(word, word)
if word not in self.articles:
outText.append(word)
else:
pass
for wordId, word in enumerate(outText):
if word in self.contractions:
outText[wordId] = self.contractions[word]
outText = ' '.join(outText)
return outText
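# Illustrative: VQAEval().evaluate('Two dogs.', 'two') returns 1, because both strings are
# normalized ('Two dogs.' -> '2 dogs', 'two' -> '2') and '2' then matches as a whole word.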
# README for Evaluation
## 🌟 Overview
This script provides an evaluation pipeline for visual question answering across 9 datasets: `VQAv2`, `OKVQA`, `TextVQA`, `Vizwiz`, `DocVQA`, `ChartQA`, `AI2D`, `InfoVQA`, and `GQA`.
## 🗂️ Data Preparation
Before starting to download the data, please create the `InternVL/internvl_chat/data` folder.
### VQAv2
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/vqav2 && cd data/vqav2
# Step 2: Make sure you have downloaded COCO images
ln -s ../coco/train2014 ./
ln -s ../coco/val2014 ./
ln -s ../coco/test2015 ./
# Step 3: Download questions and annotations
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip && unzip v2_Annotations_Train_mscoco.zip
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip && unzip v2_Questions_Train_mscoco.zip
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip && unzip v2_Annotations_Val_mscoco.zip
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip && unzip v2_Questions_Val_mscoco.zip
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Test_mscoco.zip && unzip v2_Questions_Test_mscoco.zip
# Step 4: Download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vqav2/vqav2_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vqav2/vqav2_val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vqav2/vqav2_testdev.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/vqav2
├── train2014 -> ../coco/train2014
├── val2014 -> ../coco/val2014
├── test2015 -> ../coco/test2015
├── v2_mscoco_train2014_annotations.json
├── v2_mscoco_train2014_complementary_pairs.json
├── v2_mscoco_val2014_annotations.json
├── v2_OpenEnded_mscoco_test2015_questions.json
├── v2_OpenEnded_mscoco_test-dev2015_questions.json
├── v2_OpenEnded_mscoco_train2014_questions.json
├── v2_OpenEnded_mscoco_val2014_questions.json
├── vqav2_testdev.jsonl
├── vqav2_train.jsonl
└── vqav2_val.jsonl
```
### OKVQA
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/okvqa && cd data/okvqa
# Step 2: Make sure you have downloaded COCO images
ln -s ../coco/train2014 ./
ln -s ../coco/val2014 ./
# Step 3: Download annotations and questions
wget https://okvqa.allenai.org/static/data/mscoco_train2014_annotations.json.zip && unzip mscoco_train2014_annotations.json.zip
wget https://okvqa.allenai.org/static/data/OpenEnded_mscoco_train2014_questions.json.zip && unzip OpenEnded_mscoco_train2014_questions.json.zip
wget https://okvqa.allenai.org/static/data/mscoco_val2014_annotations.json.zip && unzip mscoco_val2014_annotations.json.zip
wget https://okvqa.allenai.org/static/data/OpenEnded_mscoco_val2014_questions.json.zip && unzip OpenEnded_mscoco_val2014_questions.json.zip
# Step 4: Download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/okvqa/okvqa_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/okvqa/okvqa_val.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/okvqa
├── mscoco_train2014_annotations.json
├── mscoco_val2014_annotations.json
├── okvqa_train.jsonl
├── okvqa_val.jsonl
├── OpenEnded_mscoco_train2014_questions.json
├── OpenEnded_mscoco_val2014_questions.json
├── train2014 -> ../coco/train2014
└── val2014 -> ../coco/val2014
```
### TextVQA
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/textvqa && cd data/textvqa
# Step 2: Download images
wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip && unzip train_val_images.zip
# Step 3: Download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train_questions.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val_questions.json
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/textvqa_val.jsonl
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/textvqa_val_llava.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/textvqa
├── TextVQA_Rosetta_OCR_v0.2_test.json
├── TextVQA_Rosetta_OCR_v0.2_train.json
├── TextVQA_Rosetta_OCR_v0.2_val.json
├── textvqa_train_annotations.json
├── textvqa_train.jsonl
├── textvqa_train_questions.json
├── textvqa_val_annotations.json
├── textvqa_val.jsonl
├── textvqa_val_llava.jsonl
├── textvqa_val_questions.json
└── train_images
```
### VizWiz
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/vizwiz && cd data/vizwiz
# Step 2: Download images
wget https://vizwiz.cs.colorado.edu/VizWiz_final/images/train.zip && unzip train.zip
wget https://vizwiz.cs.colorado.edu/VizWiz_final/images/val.zip && unzip val.zip
wget https://vizwiz.cs.colorado.edu/VizWiz_final/images/test.zip && unzip test.zip
# Step 3: Download annotations
wget https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip && unzip Annotations.zip
# Step 4: Download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_train_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_train_questions.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_val_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_val_questions.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_test.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/vizwiz
├── annotations
├── test
├── train
├── val
├── vizwiz_test.jsonl
├── vizwiz_train_annotations.json
├── vizwiz_train.jsonl
├── vizwiz_train_questions.json
├── vizwiz_val_annotations.json
├── vizwiz_val.jsonl
└── vizwiz_val_questions.json
```
### DocVQA
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/docvqa && cd data/docvqa
# Step 2: Download images and annotations
wget https://datasets.cvc.uab.es/rrc/DocVQA/train.tar.gz --no-check-certificate # (optional)
wget https://datasets.cvc.uab.es/rrc/DocVQA/val.tar.gz --no-check-certificate
wget https://datasets.cvc.uab.es/rrc/DocVQA/test.tar.gz --no-check-certificate
# Step 3: Unzip files
tar -zxvf train.tar.gz
tar -zxvf val.tar.gz
tar -zxvf test.tar.gz
# Step 4: Download converted jsonl files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/test.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/docvqa
├── test
├── test.jsonl
├── train
├── train.jsonl
├── val
└── val.jsonl
```
### AI2D
Follow the instructions below to prepare the data:
```bash
# Step 1: Create the data directory
mkdir -p data/ai2diagram && cd data/ai2diagram
# Step 2: Download converted files
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/ai2d_test_vlmevalkit.jsonl -O test_vlmevalkit.jsonl
wget https://huggingface.co/OpenGVLab/InternVL/resolve/main/AI2D_TEST.zip && unzip AI2D_TEST.zip
# Step 3: Download images from Google Drive (optional, provided by InternLM-XComposer)
# https://drive.google.com/file/d/1dqqa3MnrxMXaU_K9JA6C83je32ibwdOY/view?usp=sharing
# images should be placed in `data/ai2diagram/ai2d/abc_images` and `data/ai2diagram/ai2d/images`
cd ../..
```
After preparation is complete, the directory structure is:
```
data/ai2diagram
├── test_vlmevalkit.jsonl
├── ai2d # (optional)
│ ├── abc_images
│ └── images
└── AI2D_TEST
```
### InfoVQA
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/infographicsvqa && cd data/infographicsvqa
# Step 2: Download images and annotations from https://rrc.cvc.uab.es/?ch=17&com=downloads
# infographicsVQA_test_v1.0.json, infographicsVQA_val_v1.0_withQT.json, infographicVQA_train_v1.0.json
# Step 3: Download converted files
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/infographicsvqa_val.jsonl -O val.jsonl
wget https://huggingface.co/OpenGVLab/InternVL/raw/main/infographicsvqa_test.jsonl -O test.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/infographicsvqa
├── infographicsvqa_images
├── infographicsVQA_test_v1.0.json
├── infographicsVQA_val_v1.0_withQT.json
├── infographicVQA_train_v1.0.json
├── test.jsonl
└── val.jsonl
```
### ChartQA
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/chartqa && cd data/chartqa
# Step 2: download images from
# https://drive.google.com/file/d/1Lm_w6zeET1Hyl_9ks6w5nEsgpoyPHalV/view
# Step 3: Download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/train_human.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/train_augmented.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/test_human.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/test_augmented.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/chartqa
├── ChartQA Dataset
│ ├── test
│ ├── train
│ └── val
├── test_augmented.jsonl
├── test_human.jsonl
├── train_augmented.jsonl
└── train_human.jsonl
```
### GQA
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/gqa && cd data/gqa
# Step 2: Download the official evaluation script
wget https://nlp.stanford.edu/data/gqa/eval.zip
unzip eval.zip
# Step 3: Download images
wget https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip
unzip images.zip
# Step 4: Download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/gqa/testdev_balanced.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/gqa/train_balanced.jsonl
wget https://github.com/OpenGVLab/InternVL/releases/download/data/llava_gqa_testdev_balanced_qwen_format.jsonl
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/gqa
├── challenge_all_questions.json
├── challenge_balanced_questions.json
├── eval.py
├── images
├── llava_gqa_testdev_balanced_qwen_format.jsonl
├── readme.txt
├── submission_all_questions.json
├── test_all_questions.json
├── test_balanced.jsonl
├── test_balanced_questions.json
├── testdev_all_questions.json
├── testdev_balanced_all_questions.json
├── testdev_balanced_predictions.json
├── testdev_balanced_questions.json
├── train_all_questions
├── train_balanced.jsonl
├── train_balanced_questions.json
├── val_all_questions.json
└── val_balanced_questions.json
```
## 🏃 Evaluation Execution
> ⚠️ Note: For testing InternVL (1.5, 2.0, 2.5, and later versions), always enable `--dynamic` to perform dynamic resolution testing.
To run the evaluation, execute the following command on an 8-GPU setup:
```shell
torchrun --nproc_per_node=8 eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets ${DATASETS} --dynamic
```
Alternatively, you can run the following simplified commands:
```shell
# Test VQAv2 val
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-vqav2-val --dynamic
# Test VQAv2 testdev
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-vqav2-testdev --dynamic
# Test OKVQA val
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-okvqa-val --dynamic
# Test Vizwiz val
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-vizwiz-val --dynamic
# Test Vizwiz test
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-vizwiz-test --dynamic
# Test GQA testdev
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-gqa-testdev --dynamic
# Test AI2D test
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-ai2d-test --dynamic
# Test TextVQA val
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-textvqa-val --dynamic
# Test ChartQA test-human & test-augmented
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-chartqa-test --dynamic --max-num 12
# Test DocVQA val
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-docvqa-val --dynamic --max-num 18
# Test DocVQA test
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-docvqa-test --dynamic --max-num 18
# Test InfoVQA val
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-infovqa-val --dynamic --max-num 24
# Test InfoVQA test
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-infovqa-test --dynamic --max-num 24
```
### Arguments
The following arguments can be configured for the evaluation script:
| Argument | Type | Default | Description |
| ---------------- | ------ | ----------- | ----------------------------------------------------------------------------------------------------------------- |
| `--checkpoint` | `str` | `''` | Path to the model checkpoint. |
| `--datasets` | `str` | `okvqa_val` | Comma-separated list of datasets to evaluate. |
| `--dynamic` | `flag` | `False` | Enables dynamic high resolution preprocessing. |
| `--max-num` | `int` | `6` | Maximum tile number for dynamic high resolution. |
| `--load-in-8bit` | `flag` | `False` | Loads the model weights in 8-bit precision. |
| `--auto` | `flag` | `False` | Automatically splits a large model across 8 GPUs when needed, useful for models too large to fit on a single GPU. |
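These flags can be combined. For example, the following illustrative command evaluates the DocVQA val split with dynamic tiling, at most 18 tiles, and 8-bit weights:
```shell
GPUS=8 sh evaluate.sh ${CHECKPOINT} vqa-docvqa-val --dynamic --max-num 18 --load-in-8bit
```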
import argparse
import json
parser = argparse.ArgumentParser()
parser.add_argument('--src', type=str)
parser.add_argument('--dst', type=str)
args = parser.parse_args()
all_answers = []
data = json.load(open(args.src))
for res in data:
question_id = res['questionId']
answer = res['answer'].rstrip('.').lower()
all_answers.append({'questionId': question_id, 'prediction': answer})
with open(args.dst, 'w') as f:
json.dump(all_answers, f)
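# Note: this conversion script appears to match the `convert_gqa_for_eval.py` usage in the
# evaluation code below: each record in the source JSON is expected to carry 'questionId'
# and 'answer', and the output is a list of {'questionId': ..., 'prediction': ...} entries
# consumable by GQA's official eval.py.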
import argparse
import itertools
import json
import os
import random
import subprocess
import time
from functools import partial
from typing import Optional
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from textvqa_eval import TextVQAAccuracyEvaluator
from tqdm import tqdm
ds_collections = {
'vqav2_val': {
'train': 'data/vqav2/vqav2_train.jsonl',
'test': 'data/vqav2/vqav2_val.jsonl',
'question': 'data/vqav2/v2_OpenEnded_mscoco_val2014_questions.json',
'annotation': 'data/vqav2/v2_mscoco_val2014_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'vqav2_testdev': {
'train': 'data/vqav2/vqav2_train.jsonl',
'test': 'data/vqav2/vqav2_testdev.jsonl',
'metric': None,
'max_new_tokens': 10,
},
'okvqa_val': {
'train': 'data/okvqa/okvqa_train.jsonl',
'test': 'data/okvqa/okvqa_val.jsonl',
'question': 'data/okvqa/OpenEnded_mscoco_val2014_questions.json',
'annotation': 'data/okvqa/mscoco_val2014_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'textvqa_val': {
'train': 'data/textvqa/textvqa_train.jsonl',
'test': 'data/textvqa/textvqa_val.jsonl',
'question': 'data/textvqa/textvqa_val_questions.json',
'annotation': 'data/textvqa/textvqa_val_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'textvqa_val_ocr': {
'train': 'data/textvqa/textvqa_train.jsonl',
'test': 'data/textvqa/textvqa_val_llava.jsonl',
'question': 'data/textvqa/textvqa_val_questions.json',
'annotation': 'data/textvqa/textvqa_val_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'vizwiz_val': {
'train': 'data/vizwiz/vizwiz_train.jsonl',
'test': 'data/vizwiz/vizwiz_val.jsonl',
'question': 'data/vizwiz/vizwiz_val_questions.json',
'annotation': 'data/vizwiz/vizwiz_val_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'vizwiz_test': {
'train': 'data/vizwiz/vizwiz_train.jsonl',
'test': 'data/vizwiz/vizwiz_test.jsonl',
'metric': None,
'max_new_tokens': 10,
},
'docvqa_val': {
'train': 'data/docvqa/train.jsonl',
'test': 'data/docvqa/val.jsonl',
'annotation': 'data/docvqa/val/val_v1.0.json',
'metric': 'anls',
'max_new_tokens': 100,
},
'docvqa_test': {
'train': 'data/docvqa/train.jsonl',
'test': 'data/docvqa/test.jsonl',
'metric': None,
'max_new_tokens': 100,
},
'chartqa_test_human': {
'train': 'data/chartqa/train_human.jsonl',
'test': 'data/chartqa/test_human.jsonl',
'metric': 'relaxed_accuracy',
'max_new_tokens': 100,
},
'chartqa_test_augmented': {
'train': 'data/chartqa/train_augmented.jsonl',
'test': 'data/chartqa/test_augmented.jsonl',
'metric': 'relaxed_accuracy',
'max_new_tokens': 100,
},
'gqa_testdev': {
'train': 'data/gqa/train.jsonl',
'test': 'data/gqa/test_balanced.jsonl',
'metric': 'accuracy',
'max_new_tokens': 10,
},
'gqa_testdev_llava': {
'train': 'data/gqa/train.jsonl',
'test': 'data/gqa/llava_gqa_testdev_balanced_qwen_format.jsonl',
'metric': 'accuracy',
'max_new_tokens': 10,
},
'ocrvqa_val': {
'train': 'data/ocrvqa/ocrvqa_train.jsonl',
'test': 'data/ocrvqa/ocrvqa_val.jsonl',
'metric': 'accuracy',
'max_new_tokens': 100,
},
'ocrvqa_test': {
'train': 'data/ocrvqa/ocrvqa_train.jsonl',
'test': 'data/ocrvqa/ocrvqa_test.jsonl',
'metric': 'accuracy',
'max_new_tokens': 100,
},
'ai2diagram_test': {
'train': 'data/ai2diagram/train.jsonl',
'test': 'data/ai2diagram/test_vlmevalkit.jsonl',
'metric': 'accuracy',
'max_new_tokens': 10,
},
'infographicsvqa_val': {
'train': 'data/infographicsvqa/train.jsonl',
'test': 'data/infographicsvqa/val.jsonl',
'annotation': 'data/infographicsvqa/infographicsVQA_val_v1.0_withQT.json',
'metric': 'anls',
'max_new_tokens': 100,
},
'infographicsvqa_test': {
'train': 'data/infographicsvqa/train.jsonl',
'test': 'data/infographicsvqa/test.jsonl',
'annotation': 'data/infographicsvqa/infographicsVQA_test_v1.0.json',
'metric': None,
'max_new_tokens': 100,
}
}
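# Each entry above lists the train/test jsonl paths (the train split is only used as a
# few-shot pool), optional question/annotation files for scoring, the metric applied
# locally ('vqa_score', 'anls', 'relaxed_accuracy', 'accuracy', or None when predictions
# are only saved to disk, e.g. for server submission), and the max_new_tokens budget.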
# https://github.com/google-research/pix2struct/blob/main/pix2struct/metrics.py#L81
def relaxed_correctness(target: str,
prediction: str,
max_relative_change: float = 0.05) -> bool:
"""Calculates relaxed correctness.
The correctness tolerates certain error ratio defined by max_relative_change.
See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
“Following Methani et al. (2020), we use a relaxed accuracy measure for the
numeric answers to allow a minor inaccuracy that may result from the automatic
data extraction process. We consider an answer to be correct if it is within
5% of the gold answer. For non-numeric answers, we still need an exact match
to consider an answer to be correct.”
Args:
target: Target string.
prediction: Predicted string.
max_relative_change: Maximum relative change.
Returns:
Whether the prediction was correct given the specified tolerance.
"""
def _to_float(text: str) -> Optional[float]:
try:
if text.endswith('%'):
# Convert percentages to floats.
return float(text.rstrip('%')) / 100.0
else:
return float(text)
except ValueError:
return None
prediction_float = _to_float(prediction)
target_float = _to_float(target)
if prediction_float is not None and target_float:
relative_change = abs(prediction_float -
target_float) / abs(target_float)
return relative_change <= max_relative_change
else:
return prediction.lower() == target.lower()
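# Illustrative behaviour of relaxed_correctness (comments added for clarity):
#   relaxed_correctness('100', '104') -> True   (|104 - 100| / 100 = 0.04 <= 0.05)
#   relaxed_correctness('100', '106') -> False  (relative change 0.06 > 0.05)
#   relaxed_correctness('cat', 'Cat') -> True   (non-numeric answers fall back to a case-insensitive exact match)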
def evaluate_relaxed_accuracy(entries):
scores = []
for elem in entries:
if isinstance(elem['annotation'], str):
elem['annotation'] = [elem['annotation']]
score = max([
relaxed_correctness(elem['answer'].strip(), ann)
for ann in elem['annotation']
])
scores.append(score)
return sum(scores) / len(scores)
def evaluate_exact_match_accuracy(entries):
scores = []
for elem in entries:
if isinstance(elem['annotation'], str):
elem['annotation'] = [elem['annotation']]
score = max([
(1.0 if
(elem['answer'].strip().lower() == ann.strip().lower()) else 0.0)
for ann in elem['annotation']
])
scores.append(score)
return sum(scores) / len(scores)
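# Both helpers above expect entries shaped like {'answer': ..., 'annotation': [...]};
# e.g. {'answer': '4', 'annotation': ['4', 'four']} scores 1.0 under exact match, while
# relaxed accuracy additionally tolerates numeric answers within 5% of an annotation.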
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
questions = [_['question'] for _ in batches]
question_ids = [_['question_id'] for _ in batches]
annotations = [_['annotation'] for _ in batches]
return pixel_values, questions, question_ids, annotations
class VQADataset(torch.utils.data.Dataset):
def __init__(self, train, test, prompt, few_shot, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
self.test = open(test).readlines()
self.prompt = prompt
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.few_shot = few_shot
self.max_num = max_num
if few_shot > 0:
self.train = open(train).readlines()
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.test)
def __getitem__(self, idx):
data = json.loads(self.test[idx].strip())
image, question, question_id, annotation = data['image'], data[
'question'], data['question_id'], data.get('answer', None)
few_shot_prompt = ''
if self.few_shot > 0:
few_shot_samples = random.sample(self.train, self.few_shot)
for sample in few_shot_samples:
sample = json.loads(sample.strip())
few_shot_prompt += self.prompt.format(
sample['image'],
sample['question']) + f" {sample['answer']}"
image = Image.open(image).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
if len(self.prompt) != 0:
question = question + ' ' + self.prompt
return {
'question_id': question_id,
'question': question,
'pixel_values': pixel_values,
'annotation': annotation
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
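# Sharding example (illustrative): with total_size=10 and world_size=4 the shard sizes are
# [3, 3, 2, 2], so rank 0 evaluates indices 0-2, rank 1 indices 3-5, rank 2 indices 6-7,
# and rank 3 indices 8-9; the per-rank outputs are later merged with all_gather_object.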
def post_process(response):
response = response.strip().split('.')[0].split(
',')[0].split('!')[0].lower()
if 'is ' in response:
response = response.split('is ')[1]
if 'are ' in response:
response = response.split('are ')[1]
if 'a ' in response:
response = response.split('a ')[1]
if 'an ' in response:
response = response.split('an ')[1]
if 'the ' in response:
response = response.split('the ')[1]
if ' of' in response:
response = response.split(' of')[0]
response = response.strip()
return response
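# Illustrative examples of the normalization above:
#   post_process('It is a red car.')  -> 'red car'
#   post_process('The answer is 2.')  -> '2'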
def evaluate_chat_model():
base_prompt = 'Answer the question using a single word or phrase.'
vizwiz_prompt = "When the provided information is insufficient, respond with 'Unanswerable'. "
infovqa_prompt = 'Answer the question using a single word or phrase.'
ai2d_prompt = ''
random.seed(args.seed)
summaries = []
for ds_name in args.datasets:
if 'vizwiz' in ds_name:
input_prompt = vizwiz_prompt + base_prompt
elif 'ai2d' in ds_name:
input_prompt = ai2d_prompt
elif 'infographicsvqa' in ds_name:
input_prompt = infovqa_prompt
else:
input_prompt = base_prompt
dataset = VQADataset(
train=ds_collections[ds_name]['train'],
test=ds_collections[ds_name]['test'],
prompt=input_prompt,
few_shot=args.few_shot,
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for _, (pixel_values, questions, question_ids, annotations) in tqdm(enumerate(dataloader)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=1,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config,
verbose=True
)
answers = [pred]
for question, question_id, answer, annotation in zip(questions, question_ids, answers, annotations):
if ds_name in ['vqav2_val', 'vqav2_testdev', 'okvqa_val', 'textvqa_val',
'vizwiz_val', 'textvqa_val_ocr']:
outputs.append({
'question': question,
'question_id': question_id,
'answer': answer,
})
elif ds_name in ['docvqa_val', 'infographicsvqa_val', 'gqa_testdev', 'ocrvqa_val',
'ocrvqa_test', 'gqa_testdev_llava', 'infographicsvqa_test',]:
outputs.append({
'question': question,
'questionId': question_id,
'answer': answer,
'annotation': annotation,
})
elif ds_name in ['ai2diagram_test']:
outputs.append({
'question': question,
'image': question_id,
'answer': answer,
'annotation': annotation,
})
elif ds_name in ['chartqa_test_human', 'chartqa_test_augmented']:
outputs.append({
'question': question,
'answer': answer,
'annotation': annotation,
})
elif ds_name in ['docvqa_test']:
outputs.append({
'questionId': question_id,
'answer': answer,
})
elif ds_name in ['vizwiz_test']:
outputs.append({
'image': question_id.replace('data/vizwiz/test/', ''),
'answer': answer,
})
else:
raise NotImplementedError
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.json'
results_file = os.path.join(args.out_dir, results_file)
json.dump(merged_outputs, open(results_file, 'w'))
print('Results saved to {}'.format(results_file))
if ds_collections[ds_name]['metric'] == 'vqa_score':
evaluator = TextVQAAccuracyEvaluator()
annotation = json.load(open(ds_collections[ds_name]['annotation'], 'r'))['annotations']
question_id2answers = {}
for item in annotation:
question_id = item['question_id']
answers = [answer['answer'] for answer in item['answers']]
question_id2answers[question_id] = answers
for item in merged_outputs:
item['pred_answer'] = item['answer']
item['gt_answers'] = question_id2answers[item['question_id']]
accuracy = evaluator.eval_pred_list(merged_outputs)
print(ds_name, accuracy)
summaries.append([args.checkpoint, ds_name, accuracy])
elif ds_collections[ds_name]['metric'] == 'anls':
json.dump(merged_outputs,
open(results_file, 'w'),
ensure_ascii=False)
print('python eval/vqa/infographicsvqa_eval.py -g ' +
ds_collections[ds_name]['annotation'] + ' -s ' +
results_file)
os.system('python eval/vqa/infographicsvqa_eval.py -g ' +
ds_collections[ds_name]['annotation'] + ' -s ' +
results_file)
elif ds_collections[ds_name]['metric'] == 'relaxed_accuracy':
relaxed_accuracy = evaluate_relaxed_accuracy(merged_outputs)
print(ds_name, {'relaxed_accuracy': relaxed_accuracy})
summaries.append([ds_name, {'relaxed_accuracy': relaxed_accuracy}])
elif ds_collections[ds_name]['metric'] == 'accuracy':
if 'gqa' in ds_name:
dst_file = './data/gqa/testdev_balanced_predictions.json'
print('python eval/vqa/convert_gqa_for_eval.py --src ' +
results_file + ' --dst ' + dst_file)
python_path = 'python'
os.system(python_path + ' eval/vqa/convert_gqa_for_eval.py --src ' +
results_file + ' --dst ' + dst_file)
command = f'cd ./data/gqa/ && {python_path} eval.py --tier testdev_balanced && cd ../../'
print(command)
accuracy = subprocess.check_output(command, shell=True, universal_newlines=True)
else:
accuracy = {'accuracy': evaluate_exact_match_accuracy(merged_outputs)}
print(ds_name, accuracy)
summaries.append([args.checkpoint, ds_name, accuracy])
torch.distributed.barrier()
out_path = '_'.join(args.checkpoint.split('/')[-2:])
writer = open(os.path.join(args.out_dir, f'{out_path}.txt'), 'a')
print(f"write results to file {os.path.join(args.out_dir, f'{out_path}.txt')}")
for summary in summaries:
print(summary)
writer.write(f'{summary}\n')
writer.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str,
default='okvqa_val,textvqa_val,vizwiz_val,ai2diagram_test,gqa_testdev_llava')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--few-shot', type=int, default=0)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
evaluate_chat_model()
# This file can be downloaded from: https://www.docvqa.org/datasets/infographicvqa and https://rrc.cvc.uab.es/?ch=17&com=introduction
import argparse
import json
import os
question_ids_to_exclude = []
# answer_types = {'image span': 'Image-Span', 'question span': 'Question-Span', 'multiple spans': 'Multi-Span', 'non span': 'None span', 'list': 'List'}
answer_types = {'image span': 'Image-Span', 'question span': 'Question-Span', 'multiple spans': 'Multi-Span',
'non span': 'None span'}
evidence_types = {'table/list': 'Table/list', 'textual': 'Text', 'photo/pciture/visual_objects': 'Visual/Layout',
'figure': 'Figure', 'map': 'Map'}
reasoning_requirements = {'comparison': 'Sorting', 'arithmetic': 'Arithmetic', 'counting': 'Counting'}
def save_json(file_path, data):
with open(file_path, 'w+') as json_file:
json.dump(data, json_file)
def levenshtein_distance(s1, s2):
if len(s1) > len(s2):
s1, s2 = s2, s1
distances = range(len(s1) + 1)
for i2, c2 in enumerate(s2):
distances_ = [i2 + 1]
for i1, c1 in enumerate(s1):
if c1 == c2:
distances_.append(distances[i1])
else:
distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
distances = distances_
return distances[-1]
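# Illustrative: levenshtein_distance('kitten', 'sitting') == 3 (two substitutions and one insertion).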
def validate_data(gtFilePath, submFilePath):
"""
    Method validate_data: validates that all files in the results folder are correct (have the correct name and contents).
    It also validates that there are no missing files in the folder.
    If an error is detected, the method raises it.
"""
gtJson = json.load(open(gtFilePath, 'rb'))
submJson = json.load(open(submFilePath, 'rb'))
if 'data' not in gtJson:
raise Exception('The GT file is not valid (no data key)')
if 'dataset_name' not in gtJson:
raise Exception('The GT file is not valid (no dataset_name key)')
if isinstance(submJson, list) is False:
raise Exception('The Det file is not valid (root item must be an array)')
if len(submJson) != len(gtJson['data']):
raise Exception('The Det file is not valid (invalid number of answers. Expected:' + str(
len(gtJson['data'])) + ' Found:' + str(len(submJson)) + ')')
gtQuestions = sorted([r['questionId'] for r in gtJson['data']])
res_id_to_index = {int(r['questionId']): ix for ix, r in enumerate(submJson)}
detQuestions = sorted([r['questionId'] for r in submJson])
if ((gtQuestions == detQuestions) is False):
        raise Exception('The Det file is not valid. Question IDs must match the GT')
for gtObject in gtJson['data']:
try:
q_id = int(gtObject['questionId'])
res_ix = res_id_to_index[q_id]
except:
raise Exception('The Det file is not valid. Question ' + str(gtObject['questionId']) + ' not present')
else:
detObject = submJson[res_ix]
# if detObject['questionId'] != gtObject['questionId'] :
# raise Exception("Answer #" + str(i) + " not valid (invalid question ID. Expected:" + str(gtObject['questionId']) + "Found:" + detObject['questionId'] + ")")
if 'answer' not in detObject:
raise Exception('Question ' + str(gtObject['questionId']) + ' not valid (no answer key)')
if isinstance(detObject['answer'], list) is True:
raise Exception(
'Question ' + str(gtObject['questionId']) + ' not valid (answer key has to be a single string)')
def evaluate_method(gtFilePath, submFilePath, evaluationParams):
"""
Method evaluate_method: evaluate method and returns the results
Results. Dictionary with the following values:
- method (required) Global method metrics. Ex: { 'Precision':0.8,'Recall':0.9 }
    - samples (optional) Per sample metrics. Ex: {'sample1' : { 'Precision':0.8,'Recall':0.9 } , 'sample2' : { 'Precision':0.8,'Recall':0.9 } }
"""
show_scores_per_answer_type = evaluationParams.answer_types
gtJson = json.load(open(gtFilePath, 'rb'))
submJson = json.load(open(submFilePath, 'rb'))
res_id_to_index = {int(r['questionId']): ix for ix, r in enumerate(submJson)}
perSampleMetrics = {}
totalScore = 0
row = 0
if show_scores_per_answer_type:
answerTypeTotalScore = {x: 0 for x in answer_types.keys()}
answerTypeNumQuestions = {x: 0 for x in answer_types.keys()}
evidenceTypeTotalScore = {x: 0 for x in evidence_types.keys()}
evidenceTypeNumQuestions = {x: 0 for x in evidence_types.keys()}
reasoningTypeTotalScore = {x: 0 for x in reasoning_requirements.keys()}
reasoningTypeNumQuestions = {x: 0 for x in reasoning_requirements.keys()}
for gtObject in gtJson['data']:
q_id = int(gtObject['questionId'])
res_ix = res_id_to_index[q_id]
detObject = submJson[res_ix]
if q_id in question_ids_to_exclude:
question_result = 0
info = 'Question EXCLUDED from the result'
else:
info = ''
values = []
for answer in gtObject['answers']:
# preprocess both the answers - gt and prediction
gt_answer = ' '.join(answer.strip().lower().split())
det_answer = ' '.join(detObject['answer'].strip().lower().split())
# dist = levenshtein_distance(answer.lower(), detObject['answer'].lower())
dist = levenshtein_distance(gt_answer, det_answer)
length = max(len(answer.upper()), len(detObject['answer'].upper()))
values.append(0.0 if length == 0 else float(dist) / float(length))
question_result = 1 - min(values)
if (question_result < evaluationParams.anls_threshold):
question_result = 0
totalScore += question_result
if show_scores_per_answer_type:
for q_type in gtObject['answer_type']:
answerTypeTotalScore[q_type] += question_result
answerTypeNumQuestions[q_type] += 1
for q_type in gtObject['evidence']:
evidenceTypeTotalScore[q_type] += question_result
evidenceTypeNumQuestions[q_type] += 1
for q_type in gtObject['operation/reasoning']:
reasoningTypeTotalScore[q_type] += question_result
reasoningTypeNumQuestions[q_type] += 1
perSampleMetrics[str(gtObject['questionId'])] = {
'score': question_result,
'question': gtObject['question'],
'gt': gtObject['answers'],
'det': detObject['answer'],
'info': info
}
row = row + 1
methodMetrics = {
'score': 0 if len(gtJson['data']) == 0 else totalScore / (len(gtJson['data']) - len(question_ids_to_exclude))
}
answer_types_scores = {}
evidence_types_scores = {}
operation_types_scores = {}
if show_scores_per_answer_type:
for a_type, ref in answer_types.items():
answer_types_scores[ref] = 0 if len(gtJson['data']) == 0 else answerTypeTotalScore[a_type] / (
answerTypeNumQuestions[a_type])
for e_type, ref in evidence_types.items():
evidence_types_scores[ref] = 0 if len(gtJson['data']) == 0 else evidenceTypeTotalScore[e_type] / (
evidenceTypeNumQuestions[e_type])
for r_type, ref in reasoning_requirements.items():
operation_types_scores[ref] = 0 if len(gtJson['data']) == 0 else reasoningTypeTotalScore[r_type] / (
reasoningTypeNumQuestions[r_type])
resDict = {
'result': methodMetrics,
'scores_by_types': {'answer_types': answer_types_scores, 'evidence_types': evidence_types_scores,
'operation_types': operation_types_scores},
'per_sample_result': perSampleMetrics
}
return resDict
def display_results(results, show_answer_types):
print('\nOverall ANLS: {:2.4f}'.format(results['result']['score']))
if show_answer_types:
print('\nAnswer types:')
for a_type in answer_types.values():
print('\t{:12s} {:2.4f}'.format(a_type, results['scores_by_types']['answer_types'][a_type]))
print('\nEvidence types:')
for e_type in evidence_types.values():
print('\t{:12s} {:2.4f}'.format(e_type, results['scores_by_types']['evidence_types'][e_type]))
print('\nOperation required:')
for r_type in reasoning_requirements.values():
print('\t{:12s} {:2.4f}'.format(r_type, results['scores_by_types']['operation_types'][r_type]))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='InfographicVQA evaluation script.')
parser.add_argument('-g', '--ground_truth', type=str, help='Path of the Ground Truth file.', required=True)
parser.add_argument('-s', '--submission_file', type=str, help="Path of your method's results file.", required=True)
parser.add_argument('-t', '--anls_threshold', type=float, default=0.5,
help='ANLS threshold to use (See Scene-Text VQA paper for more info.).', required=False)
parser.add_argument('-a', '--answer_types', type=bool, default=False,
help='Score break down by answer types (special gt file required).', required=False)
parser.add_argument('-o', '--output', type=str,
help="Path to a directory where to copy the file 'results.json' that contains per-sample results.",
required=False)
args = parser.parse_args()
# Validate the format of ground truth and submission files.
validate_data(args.ground_truth, args.submission_file)
# Evaluate method
results = evaluate_method(args.ground_truth, args.submission_file, args)
display_results(results, args.answer_types)
if args.output:
output_dir = args.output
if not os.path.exists(output_dir):
os.makedirs(output_dir)
resultsOutputname = os.path.join(output_dir, 'results.json')
save_json(resultsOutputname, results)
        print('All results, including per-sample results, have been correctly saved!')
# Copyright (c) Facebook, Inc. and its affiliates.
# copied from https://github.com/haotian-liu/LLaVA/blob/main/llava/eval/m4c_evaluator.py
import re
from tqdm import tqdm
class EvalAIAnswerProcessor:
"""
Processes an answer similar to Eval AI
copied from
https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897
"""
CONTRACTIONS = {
'aint': "ain't",
'arent': "aren't",
'cant': "can't",
'couldve': "could've",
'couldnt': "couldn't",
"couldn'tve": "couldn't've",
"couldnt've": "couldn't've",
'didnt': "didn't",
'doesnt': "doesn't",
'dont': "don't",
'hadnt': "hadn't",
"hadnt've": "hadn't've",
"hadn'tve": "hadn't've",
'hasnt': "hasn't",
'havent': "haven't",
'hed': "he'd",
"hed've": "he'd've",
"he'dve": "he'd've",
'hes': "he's",
'howd': "how'd",
'howll': "how'll",
'hows': "how's",
"Id've": "I'd've",
"I'dve": "I'd've",
'Im': "I'm",
'Ive': "I've",
'isnt': "isn't",
'itd': "it'd",
"itd've": "it'd've",
"it'dve": "it'd've",
'itll': "it'll",
"let's": "let's",
'maam': "ma'am",
'mightnt': "mightn't",
"mightnt've": "mightn't've",
"mightn'tve": "mightn't've",
'mightve': "might've",
'mustnt': "mustn't",
'mustve': "must've",
'neednt': "needn't",
'notve': "not've",
'oclock': "o'clock",
'oughtnt': "oughtn't",
"ow's'at": "'ow's'at",
"'ows'at": "'ow's'at",
"'ow'sat": "'ow's'at",
'shant': "shan't",
"shed've": "she'd've",
"she'dve": "she'd've",
"she's": "she's",
'shouldve': "should've",
'shouldnt': "shouldn't",
"shouldnt've": "shouldn't've",
"shouldn'tve": "shouldn't've",
"somebody'd": 'somebodyd',
"somebodyd've": "somebody'd've",
"somebody'dve": "somebody'd've",
'somebodyll': "somebody'll",
'somebodys': "somebody's",
'someoned': "someone'd",
"someoned've": "someone'd've",
"someone'dve": "someone'd've",
'someonell': "someone'll",
'someones': "someone's",
'somethingd': "something'd",
"somethingd've": "something'd've",
"something'dve": "something'd've",
'somethingll': "something'll",
'thats': "that's",
'thered': "there'd",
"thered've": "there'd've",
"there'dve": "there'd've",
'therere': "there're",
'theres': "there's",
'theyd': "they'd",
"theyd've": "they'd've",
"they'dve": "they'd've",
'theyll': "they'll",
'theyre': "they're",
'theyve': "they've",
'twas': "'twas",
'wasnt': "wasn't",
"wed've": "we'd've",
"we'dve": "we'd've",
'weve': "we've",
'werent': "weren't",
'whatll': "what'll",
'whatre': "what're",
'whats': "what's",
'whatve': "what've",
'whens': "when's",
'whered': "where'd",
'wheres': "where's",
'whereve': "where've",
'whod': "who'd",
"whod've": "who'd've",
"who'dve": "who'd've",
'wholl': "who'll",
'whos': "who's",
'whove': "who've",
'whyll': "why'll",
'whyre': "why're",
'whys': "why's",
'wont': "won't",
'wouldve': "would've",
'wouldnt': "wouldn't",
"wouldnt've": "wouldn't've",
"wouldn'tve": "wouldn't've",
'yall': "y'all",
"yall'll": "y'all'll",
"y'allll": "y'all'll",
"yall'd've": "y'all'd've",
"y'alld've": "y'all'd've",
"y'all'dve": "y'all'd've",
'youd': "you'd",
"youd've": "you'd've",
"you'dve": "you'd've",
'youll': "you'll",
'youre': "you're",
'youve': "you've",
}
NUMBER_MAP = {
'none': '0',
'zero': '0',
'one': '1',
'two': '2',
'three': '3',
'four': '4',
'five': '5',
'six': '6',
'seven': '7',
'eight': '8',
'nine': '9',
'ten': '10',
}
ARTICLES = ['a', 'an', 'the']
PERIOD_STRIP = re.compile(r'(?!<=\d)(\.)(?!\d)')
COMMA_STRIP = re.compile(r'(?<=\d)(\,)+(?=\d)')
PUNCTUATIONS = [
';',
r'/',
'[',
']',
'"',
'{',
'}',
'(',
')',
'=',
'+',
'\\',
'_',
'-',
'>',
'<',
'@',
'`',
',',
'?',
'!',
]
def __init__(self, *args, **kwargs):
pass
def word_tokenize(self, word):
word = word.lower()
word = word.replace(',', '').replace('?', '').replace("'s", " 's")
return word.strip()
def process_punctuation(self, in_text):
out_text = in_text
for p in self.PUNCTUATIONS:
if (p + ' ' in in_text or ' ' + p in in_text) or (
re.search(self.COMMA_STRIP, in_text) is not None
):
out_text = out_text.replace(p, '')
else:
out_text = out_text.replace(p, ' ')
out_text = self.PERIOD_STRIP.sub('', out_text, re.UNICODE)
return out_text
def process_digit_article(self, in_text):
out_text = []
temp_text = in_text.lower().split()
for word in temp_text:
word = self.NUMBER_MAP.setdefault(word, word)
if word not in self.ARTICLES:
out_text.append(word)
else:
pass
for word_id, word in enumerate(out_text):
if word in self.CONTRACTIONS:
out_text[word_id] = self.CONTRACTIONS[word]
out_text = ' '.join(out_text)
return out_text
def __call__(self, item):
item = self.word_tokenize(item)
item = item.replace('\n', ' ').replace('\t', ' ').strip()
item = self.process_punctuation(item)
item = self.process_digit_article(item)
return item
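    # Illustrative: EvalAIAnswerProcessor()('Two Dogs!') -> '2 dogs'
    # (punctuation is stripped, number words are mapped to digits, and articles are dropped).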
class TextVQAAccuracyEvaluator:
def __init__(self):
self.answer_processor = EvalAIAnswerProcessor()
def _compute_answer_scores(self, raw_answers):
"""
compute the accuracy (soft score) of human answers
"""
answers = [self.answer_processor(a) for a in raw_answers]
assert len(answers) == 10
gt_answers = list(enumerate(answers))
unique_answers = set(answers)
unique_answer_scores = {}
for unique_answer in unique_answers:
accs = []
for gt_answer in gt_answers:
other_answers = [item for item in gt_answers if item != gt_answer]
matching_answers = [
item for item in other_answers if item[1] == unique_answer
]
acc = min(1, float(len(matching_answers)) / 3)
accs.append(acc)
unique_answer_scores[unique_answer] = sum(accs) / len(accs)
return unique_answer_scores
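    # Soft-score example (illustrative): with 10 human answers, an answer given by exactly 3
    # annotators averages min(1, matches/3) over the ten leave-one-out subsets to 0.9, while
    # an answer given by a single annotator scores 0.3.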
def eval_pred_list(self, pred_list, disable_tqdm=False):
pred_scores = []
for entry in tqdm(pred_list, disable=disable_tqdm):
pred_answer = self.answer_processor(entry['pred_answer'])
unique_answer_scores = self._compute_answer_scores(entry['gt_answers'])
score = unique_answer_scores.get(pred_answer, 0.0)
pred_scores.append(score)
accuracy = sum(pred_scores) / len(pred_scores)
return accuracy
class STVQAAccuracyEvaluator:
def __init__(self):
self.answer_processor = EvalAIAnswerProcessor()
def eval_pred_list(self, pred_list):
pred_scores = []
for entry in pred_list:
pred_answer = self.answer_processor(entry['pred_answer'])
gts = [self.answer_processor(a) for a in entry['gt_answers']]
score = 1.0 if pred_answer in gts else 0.0
pred_scores.append(score)
accuracy = sum(pred_scores) / len(pred_scores)
return accuracy
class STVQAANLSEvaluator:
def __init__(self):
import editdistance # install with `pip install editdistance`
self.get_edit_distance = editdistance.eval
def get_anls(self, s1, s2):
s1 = s1.lower().strip()
s2 = s2.lower().strip()
iou = 1 - self.get_edit_distance(s1, s2) / max(len(s1), len(s2))
anls = iou if iou >= 0.5 else 0.0
return anls
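    # Illustrative: get_anls('hello', 'hallo') = 1 - 1/5 = 0.8 (>= 0.5, so it is kept),
    # while get_anls('12', '21') = 1 - 2/2 = 0.0 and is zeroed out by the threshold.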
def eval_pred_list(self, pred_list):
pred_scores = []
for entry in pred_list:
anls = max(
self.get_anls(entry['pred_answer'], gt) for gt in entry['gt_answers']
)
pred_scores.append(anls)
accuracy = sum(pred_scores) / len(pred_scores)
return accuracy
class TextCapsBleu4Evaluator:
def __init__(self):
# The following script requires Java 1.8.0 and pycocotools installed.
# The pycocoevalcap can be installed with pip as
# pip install git+https://github.com/ronghanghu/coco-caption.git@python23
# Original pycocoevalcap code is at https://github.com/tylin/coco-caption
# but has no python3 support yet.
try:
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
except ModuleNotFoundError:
print(
'Please install pycocoevalcap module using '
'pip install git+https://github.com/ronghanghu/coco-caption.git@python23' # noqa
)
raise
self.tokenizer = PTBTokenizer()
self.scorer = Bleu(4)
def eval_pred_list(self, pred_list):
# Create reference and hypotheses captions.
gts = {}
res = {}
for idx, entry in enumerate(pred_list):
gts[idx] = [{'caption': a} for a in entry['gt_answers']]
res[idx] = [{'caption': entry['pred_answer']}]
gts = self.tokenizer.tokenize(gts)
res = self.tokenizer.tokenize(res)
score, _ = self.scorer.compute_score(gts, res)
bleu4 = score[3] # score is (Bleu-1, Bleu-2, Bleu-3, Bleu-4)
return bleu4
set -x
CHECKPOINT=${1}
DATASET=${2}
CHECKPOINT="$(pwd)/${CHECKPOINT}"
export PYTHONPATH="$(pwd):${PYTHONPATH}"
echo "CHECKPOINT: ${CHECKPOINT}"
MASTER_PORT=${MASTER_PORT:-63669}
PORT=${PORT:-63665}
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
NODES=$((GPUS / GPUS_PER_NODE))
export MASTER_PORT=${MASTER_PORT}
export PORT=${PORT}
# Save original arguments
ARGS=("$@")
# Parse options
while [[ $# -gt 0 ]]; do
case "$1" in
--auto)
GPUS=1
shift
;;
*)
shift
;;
esac
done
echo "GPUS: ${GPUS}"
if [ ${DATASET} == "mme" ]; then
cd eval/mme/
DIRNAME=`basename ${CHECKPOINT}`
python eval.py --checkpoint ${CHECKPOINT} "${ARGS[@]:2}"
python calculation.py --results_dir ${DIRNAME}
cd ../../
fi
if [ ${DATASET} == "caption" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/caption/evaluate_caption.py --checkpoint ${CHECKPOINT} "${ARGS[@]:2}"
fi
if [ ${DATASET} == "caption-coco" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/caption/evaluate_caption.py --checkpoint ${CHECKPOINT} --datasets coco "${ARGS[@]:2}"
fi
if [ ${DATASET} == "caption-flickr30k" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/caption/evaluate_caption.py --checkpoint ${CHECKPOINT} --datasets flickr30k "${ARGS[@]:2}"
fi
if [ ${DATASET} == "caption-nocaps" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/caption/evaluate_caption.py --checkpoint ${CHECKPOINT} --datasets nocaps "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-okvqa-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets okvqa_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-textvqa-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets textvqa_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-textvqa-val-ocr" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets textvqa_val_ocr "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-vizwiz-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets vizwiz_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-vizwiz-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets vizwiz_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-vqav2-testdev" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets vqav2_testdev "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-ai2d-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets ai2diagram_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-vqav2-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets vqav2_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-gqa-testdev" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets gqa_testdev_llava "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-docvqa-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets docvqa_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-docvqa-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets docvqa_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-mpdocvqa-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mpdocvqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets mpdocvqa_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-mpdocvqa-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mpdocvqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets mpdocvqa_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-chartqa-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets chartqa_test_human,chartqa_test_augmented "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-infovqa-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets infographicsvqa_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-infovqa-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets infographicsvqa_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-chartqa-test-human" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets chartqa_test_human "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-chartqa-test-augmented" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets chartqa_test_augmented "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-ocrvqa-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets ocrvqa_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "vqa-ocrvqa-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/vqa/evaluate_vqa.py --checkpoint ${CHECKPOINT} --datasets ocrvqa_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcoco" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcoco-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcoco_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcoco-testA" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcoco_testA "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcoco-testB" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcoco_testB "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcoco+-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcoco+_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcoco+-testA" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcoco+_testA "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcoco+-testB" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcoco+_testB "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcocog-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcocog_val "${ARGS[@]:2}"
fi
if [ ${DATASET} == "refcocog-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/refcoco/evaluate_grounding.py --checkpoint ${CHECKPOINT} --datasets refcocog_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "llava-bench" ]; then
rm -rf results/llava_bench_results_review.jsonl
python eval/llava_bench/evaluate_llava_bench.py --checkpoint ${CHECKPOINT} "${ARGS[@]:2}"
python -u eval/llava_bench/eval_gpt_review_bench.py \
--question data/llava-bench-in-the-wild/questions.jsonl \
--context data/llava-bench-in-the-wild/context.jsonl \
--rule eval/llava_bench/rule.json \
--answer-list \
data/llava-bench-in-the-wild/answers_gpt4.jsonl \
results/llava_bench_results.jsonl \
--output \
results/llava_bench_results_review.jsonl
python -u eval/llava_bench/summarize_gpt_review.py -f results/llava_bench_results_review.jsonl
fi
if [ ${DATASET} == "pope" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/pope/evaluate_pope.py --checkpoint ${CHECKPOINT} --datasets pope "${ARGS[@]:2}"
fi
if [ ${DATASET} == "tiny_lvlm" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/tiny_lvlm/evaluate_lvlm.py --checkpoint ${CHECKPOINT} --datasets updated_datasets "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmvet" ]; then
python eval/mmvet/evaluate_mmvet.py --checkpoint ${CHECKPOINT} --datasets mmvet "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmvetv2" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmvetv2/evaluate_mmvet_v2.py --checkpoint ${CHECKPOINT} --datasets mmvet-v2 "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmbench-dev-en" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmbench/evaluate_mmbench.py --checkpoint ${CHECKPOINT} --datasets mmbench_dev_20230712 "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmbench-dev-cn" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmbench/evaluate_mmbench.py --checkpoint ${CHECKPOINT} --datasets mmbench_dev_cn_20231003 "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmbench-test-en" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmbench/evaluate_mmbench.py --checkpoint ${CHECKPOINT} --datasets mmbench_test_en_20231003 "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmbench-test-cn" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmbench/evaluate_mmbench.py --checkpoint ${CHECKPOINT} --datasets mmbench_test_cn_20231003 "${ARGS[@]:2}"
fi
if [ ${DATASET} == "ccbench-dev" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmbench/evaluate_mmbench.py --checkpoint ${CHECKPOINT} --datasets ccbench_dev_cn "${ARGS[@]:2}"
fi
if [ ${DATASET} == "scienceqa" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/scienceqa/evaluate_scienceqa.py --checkpoint ${CHECKPOINT} --datasets sqa_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mantis" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mantis_eval/evaluate_mantis.py --checkpoint ${CHECKPOINT} --datasets Mantis-Eval "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mirb" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mirb/evaluate_mirb.py --checkpoint ${CHECKPOINT} --datasets MIRB "${ARGS[@]:2}"
fi
if [ ${DATASET} == "m3cot" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/scienceqa/evaluate_scienceqa.py --checkpoint ${CHECKPOINT} --datasets m3cot_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-dev" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmmu/evaluate_mmmu.py --checkpoint ${CHECKPOINT} --datasets MMMU_dev "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-val" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmmu/evaluate_mmmu.py --checkpoint ${CHECKPOINT} --datasets MMMU_validation "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmmu/evaluate_mmmu.py --checkpoint ${CHECKPOINT} --datasets MMMU_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-dev-cot" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmmu/evaluate_mmmu_cot.py --checkpoint ${CHECKPOINT} --datasets MMMU_dev "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-val-cot" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmmu/evaluate_mmmu_cot.py --checkpoint ${CHECKPOINT} --datasets MMMU_validation "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-test-cot" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmmu/evaluate_mmmu_cot.py --checkpoint ${CHECKPOINT} --datasets MMMU_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmvp" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmvp/evaluate_mmvp.py --checkpoint ${CHECKPOINT} --datasets MMVP "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mathvista-testmini" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mathvista/evaluate_mathvista.py --checkpoint ${CHECKPOINT} --datasets MathVista_testmini "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mathvista-test" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mathvista/evaluate_mathvista.py --checkpoint ${CHECKPOINT} --datasets MathVista_test "${ARGS[@]:2}"
fi
if [ ${DATASET} == "seed" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/seed/evaluate_seed.py --checkpoint ${CHECKPOINT} --datasets SEEDv1 "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mvbench" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mvbench/evaluate_mvbench.py --checkpoint ${CHECKPOINT} --num_segments 16 "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmiu" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmiu/evaluate_mmiu.py --checkpoint ${CHECKPOINT} "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmhal" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/mmhal/evaluate_mmhal.py --checkpoint ${CHECKPOINT} "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-pro" ]; then
python -u eval/mmmu_pro/evaluate_mmmu_pro.py --model ${CHECKPOINT} --mode direct --setting "standard (10 options)" "${ARGS[@]:2}"
python -u eval/mmmu_pro/evaluate_mmmu_pro.py --model ${CHECKPOINT} --mode cot --setting "standard (10 options)" "${ARGS[@]:2}"
python -u eval/mmmu_pro/evaluate_mmmu_pro.py --model ${CHECKPOINT} --mode direct --setting vision "${ARGS[@]:2}"
python -u eval/mmmu_pro/evaluate_mmmu_pro.py --model ${CHECKPOINT} --mode cot --setting vision "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-pro-std10" ]; then
python -u eval/mmmu_pro/evaluate_mmmu_pro.py --model ${CHECKPOINT} --mode direct --setting "standard (10 options)" "${ARGS[@]:2}"
python -u eval/mmmu_pro/evaluate_mmmu_pro.py --model ${CHECKPOINT} --mode cot --setting "standard (10 options)" "${ARGS[@]:2}"
fi
if [ ${DATASET} == "mmmu-pro-vision" ]; then
python -u eval/mmmu_pro/evaluate_mmmu_pro.py --model ${CHECKPOINT} --mode direct --setting vision "${ARGS[@]:2}"
python -u eval/mmmu_pro/evaluate_mmmu_pro.py --model ${CHECKPOINT} --mode cot --setting vision "${ARGS[@]:2}"
fi
if [ ${DATASET} == "drivelm" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/domain_specific/drivelm/evaluate.py --checkpoint ${CHECKPOINT} --datasets DriveLM_val --dynamic --max-num 12
fi
if [ ${DATASET} == "mme—realworld" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/domain_specific/mme_rw/evaluate.py --checkpoint ${CHECKPOINT} --datasets MME_RealWorld "${ARGS[@]:2}"
fi
if [ ${DATASET} == "dior-rsvg" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/domain_specific/rs_det/evaluate.py --checkpoint ${CHECKPOINT} --datasets DIOR_RSVG "${ARGS[@]:2}"
fi
if [ ${DATASET} == "rsvqa-lr" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
    eval/domain_specific/rs_vqa/evaluate.py --checkpoint ${CHECKPOINT} --datasets RSVQA_L "${ARGS[@]:2}"
fi
if [ ${DATASET} == "rsvqa-hr-test1" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
eval/domain_specific/rs_vqa/evaluate.py --checkpoint ${CHECKPOINT} --datasets RSVQA_H_TEST1 "${ARGS[@]:2}"
fi
if [ ${DATASET} == "rsvqa-hr-test2" ]; then
torchrun \
--nnodes=1 \
--node_rank=0 \
--master_addr=127.0.0.1 \
--nproc_per_node=${GPUS} \
--master_port=${MASTER_PORT} \
    eval/domain_specific/rs_vqa/evaluate.py --checkpoint ${CHECKPOINT} --datasets RSVQA_H_TEST2 "${ARGS[@]:2}"
fi