import pandas as pd
import io
import base64
import json
from PIL import Image
'''
This script converts the mmbench_dev tsv file to jsonl.
'''
datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t')
global_choices = ['A', 'B', 'C', 'D']
def decode_base64_to_image(base64_string):
    image_data = base64.b64decode(base64_string)
    image = Image.open(io.BytesIO(image_data))
    return image

with open('./data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.jsonl', 'w') as f:
    for idx in range(len(datas)):
        data = datas.iloc[idx]
        index = int(data['index'])
        question = data['question']
        hint = data['hint'] if not pd.isna(data['hint']) else 'N/A'

        choices = []
        for opt in global_choices:
            if pd.isna(data[opt]):
                continue
            choices.append(data[opt])
        answer = global_choices.index(data['answer'])

        image = decode_base64_to_image(data['image'])
        image.save("data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index)

        f.write(json.dumps({
            "index": index,
            "image": "data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index,
            "hint": hint,
            "question": question,
            "choices": choices,
            "answer": answer,
        }) + "\n")
import pandas as pd
import io
import base64
import json
from PIL import Image
'''
This script converts the mmbench_test tsv file to jsonl.
It is very similar to mmbench_converter_dev, except that there is no answer field for accuracy calculation.
'''
datas = pd.read_csv("data/mmbench/mmbench_test_20230712/mmbench_test_20230712.tsv", sep='\t')
global_choices = ['A', 'B', 'C', 'D']
def decode_base64_to_image(base64_string):
    image_data = base64.b64decode(base64_string)
    image = Image.open(io.BytesIO(image_data))
    return image

with open('./data/mmbench/mmbench_test_20230712/mmbench_test_20230712.jsonl', 'w') as f:
    for idx in range(len(datas)):
        data = datas.iloc[idx]
        index = int(data['index'])
        question = data['question']
        hint = data['hint'] if not pd.isna(data['hint']) else 'N/A'

        choices = []
        for opt in global_choices:
            if pd.isna(data[opt]):
                continue
            choices.append(data[opt])
        # answer = global_choices.index(data['answer'])

        image = decode_base64_to_image(data['image'])
        image.save("data/mmbench/mmbench_test_20230712/images/%d.jpg" % index)

        f.write(json.dumps({
            "index": index,
            "image": "data/mmbench/mmbench_test_20230712/images/%d.jpg" % index,
            "hint": hint,
            "question": question,
            "choices": choices,
            # "answer": answer,
        }) + "\n")
import pandas as pd
import json
'''
This script provides `global top-1 accuracy` metric calculation for mmbench_dev.
'''
predictions = json.load(open('mmbench_dev_20230712.json'))
index2predictions = {}
for pred in predictions:
    index2predictions[pred['index']] = pred['prediction']
datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t')
glb_opts = ['A', 'B', 'C', 'D']
index2answer = {}
for idx in range(len(datas)):
    data = datas.iloc[idx]
    index2answer[data['index']] = glb_opts.index(data['answer'])
# Each question is evaluated on up to 4 circular copies whose indices share the same value modulo 1e6.
identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()]))
correct = 0
total = 0
for index in identity_indexes:
    for _ in range(4):
        cycle_index = int(_ * 1e6 + index)
        if index2predictions.get(cycle_index, None) is not None:
            if index2predictions[cycle_index] == index2answer[cycle_index]:
                continue
            else:
                print(cycle_index)
                break
    else:
        # the question counts as correct only if every available copy is answered correctly
        correct += 1
    total += 1
print(correct, total)
import pandas as pd
import json
import random
'''
This script provides metric calculation for mmbench_dev with the same accuracy algorithm as the OpenCompass server.
'''
predictions = json.load(open('mmbench_dev_20230712.json'))
index2predictions = {}
for pred in predictions:
    index2predictions[pred['index']] = pred['prediction']
from collections import Counter
def most_common_elements(lst):
    counter = Counter(lst)
    max_count = max(counter.values())
    most_common = [element for element, count in counter.items() if count == max_count]
    return random.choice(most_common)  # break ties randomly among the most frequent answers
datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t')
glb_opts = ['A', 'B', 'C', 'D']
index2answer = {}
index2choices = {}
index2rawanswer = {}
for idx in range(len(datas)):
    data = datas.iloc[idx]

    choices = []
    for opt in glb_opts:
        if not pd.isna(data[opt]):
            choices.append(data[opt])
    index2choices[data['index']] = choices

    index2answer[data['index']] = glb_opts.index(data['answer'])
    index2rawanswer[data['index']] = choices[glb_opts.index(data['answer'])]
identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()]))
correct = 0
total = 0
for index in identity_indexes:
    raw_preds = []
    raw_answer = []
    for _ in range(4):
        cycle_index = int(_ * 1e6 + index)
        if index2predictions.get(cycle_index, None) is not None:
            raw_answer = index2rawanswer[cycle_index]
            raw_pred = index2choices[cycle_index][index2predictions[cycle_index]]
            raw_preds.append(raw_pred)

    if len(set(raw_preds)) == 1:
        if raw_preds[0] == raw_answer:
            correct += 1
    else:
        result = most_common_elements(raw_preds)
        if result == raw_answer:
            correct += 1
    total += 1
print(correct, total, correct / total * 100.)
import pandas as pd
import json
import random
'''
This script converts the output file of our inference processor to the target format of the OpenCompass evaluation server.
'''
predictions = json.load(open('mmbench_test_20230712.json'))
index2predictions = {}
for pred in predictions:
    index2predictions[pred['index']] = pred['prediction']
from collections import Counter
def most_common_elements(lst):
    counter = Counter(lst)
    max_count = max(counter.values())
    most_common = [element for element, count in counter.items() if count == max_count]
    print(most_common)
    return random.choice(most_common)
    # return most_common
datas = pd.read_csv("data/mmbench/mmbench_test_20230712/mmbench_test_20230712.tsv", sep='\t')
datas = datas.drop('image', axis=1)
glb_opts = ['A', 'B', 'C', 'D']
index2choices = {}
for idx in range(len(datas)):
    data = datas.iloc[idx]
    choices = []
    for opt in glb_opts:
        if not pd.isna(data[opt]):
            choices.append(data[opt])
    index2choices[data['index']] = choices
identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()]))
processed_index2predictions = {}
for index in identity_indexes:
    raw_preds = []
    for _ in range(4):
        cycle_index = int(_ * 1e6 + index)
        if index2predictions.get(cycle_index, None) is not None:
            raw_pred = index2choices[cycle_index][index2predictions[cycle_index]]
            raw_preds.append(raw_pred)

    if len(set(raw_preds)) == 1:
        pred_answer = raw_preds[0]
    else:
        pred_answer = most_common_elements(raw_preds)
    print(index, pred_answer)

    for _ in range(4):
        cycle_index = int(_ * 1e6 + index)
        if index2predictions.get(cycle_index, None) is not None:
            processed_index2predictions[cycle_index] = index2choices[cycle_index].index(pred_answer)
predictions = []
for idx in range(len(datas)):
    data = datas.iloc[idx]
    index = data['index']
    prediction = glb_opts[processed_index2predictions[index]]
    predictions.append(prediction)
datas['prediction'] = predictions
datas.to_excel("mmbench_test_20230712_230831_constrained.xlsx", index=False)
# "constrained" means we force the model to predict the same answer when the same question is tested multiple times
# MME Benchmark
[MME](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) is a comprehensive evaluation benchmark for multimodal large language models. It measures both perception and cognition abilities on a total of 14 subtasks, including existence, count, position, color, poster, celebrity, scene, landmark, artwork, OCR, commonsense reasoning, numerical calculation, text translation, and code reasoning.
Qwen-VL-Chat achieves SOTA results on both the perception and cognition evaluations.
### Perception Evaluation
| Rank | Model | Version | Score |
|:----:|:---------------:|:------------------------:|:-------:|
| 1 | **[Qwen-VL-Chat](https://github.com/QwenLM/Qwen-VL/)**| **[Qwen-7B](https://github.com/QwenLM/Qwen-7B)** | **1487.57** |
| 2 | Skywork-MM | Skywork-MM-13B | 1419.08 |
| 3 | MMICL | FlanT5xxl | 1376.00 |
| 4 | Lynx | vicuna-7b | 1373.23 |
| 5 | BLIVA | FlanT5xxl | 1337.73 |
### Cognition Evaluation
| Rank | Model | Version | Score |
|:----:|:----------------:|:--------------:|:----------:|
| 1 | **[Qwen-VL-Chat](https://github.com/QwenLM/Qwen-VL/)** | **[Qwen-7B](https://github.com/QwenLM/Qwen-7B)** | **360.71** |
| 2 | MMICL | FlanT5xxl | 360.36 |
| 3 | Skywork-MM | Skywork-MM-13B | 356.43 |
| 4 | BLIVA | FlanT5xxl | 331.43 |
| 5 | LRV-Instruction | LRV-7B | 328.21 |
### Full Metrics
```
=========== Perception ===========
total score: 1487.576330532213
existence score: 158.33333333333331
count score: 150.0
position score: 128.33333333333334
color score: 170.0
posters score: 178.57142857142856
celebrity score: 120.58823529411764
scene score: 152.25
landmark score: 164.0
artwork score: 125.5
OCR score: 140.0
=========== Cognition ===========
total score: 360.71428571428567
commonsense_reasoning score: 130.7142857142857
numerical_calculation score: 40.0
text_translation score: 147.5
code_reasoning score: 42.5
```
## How To Reproduce Results of MME Benchmark
1. Download MME images and eval_tool from the [MME repo](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/blob/Evaluation/README.md)
2. Rearrange images by executing `python get_images.py`
3. Evaluate Qwen-VL-Chat results by executing `python eval.py`
4. Calculate the MME results by executing `python calculation.py --results_dir Qwen-VL-Chat`; the calculation script comes from the MME eval_tool. A quick sanity check on the generated result files is sketched below.
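The result files written by `eval.py` (below) are plain tab-separated text with four fields per line: image name, question, ground-truth answer, and model response. Before running `calculation.py`, they can be spot-checked with a few lines of Python. This is only an illustrative sketch under that assumption (and assuming Yes/No ground truths, as in the `questions_answers_YN` files); the official scores come from the MME eval_tool.
```
import os

results_dir = 'Qwen-VL-Chat'  # same output directory name as used by eval.py
for filename in sorted(os.listdir(results_dir)):
    correct = total = 0
    with open(os.path.join(results_dir, filename)) as fin:
        for line in fin:
            # fields: image, question, ground truth, response (response kept as a single field)
            img, question, gt, response = line.rstrip('\n').split('\t', 3)
            # crude check: does the response start with the ground-truth Yes/No?
            correct += int(response.strip().lower().startswith(gt.strip().lower()))
            total += 1
    print(f'{filename}: {correct}/{total} naive yes/no matches')
```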
import os
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
checkpoint = 'Qwen/Qwen-VL-Chat'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint, device_map='cuda', trust_remote_code=True).eval()
model.generation_config = GenerationConfig.from_pretrained(checkpoint, trust_remote_code=True)
model.generation_config.top_p = 0.01
root = 'Your_Results'
output = 'Qwen-VL-Chat'
os.makedirs(output, exist_ok=True)
for filename in os.listdir(root):
    with open(os.path.join(root, filename), 'r') as fin, open(os.path.join(output, filename), 'w') as fout:
        lines = fin.read().splitlines()
        filename = filename.replace('.txt', '')
        for line in tqdm(lines):
            img, question, gt = line.strip().split('\t')
            img_path = os.path.join('images', filename, img)
            assert os.path.exists(img_path), img_path
            query = f'<img>{img_path}</img>\n{question}'
            response, _ = model.chat(tokenizer, query=query, history=None)
            print(img, question, gt, response, sep='\t', file=fout)
import os
from tqdm import tqdm
os.system('rm -rf images')
os.system('mkdir images')
os.system('cp -r ../MME_Benchmark_release/OCR images/')
os.system('mkdir images/artwork')
os.system('cp ../MME_Benchmark_release/artwork/questions_answers_YN/* images/artwork/')
with open('LaVIN/artwork.txt') as fin:
    paths = [line.strip().split('\t', 1)[0] for line in fin]
paths = list(set(paths))
for path in tqdm(paths):
    os.system(f'cp ../MME_Benchmark_release/artwork/images/toy_dataset/{path} images/artwork/{path}')
os.system('mkdir images/celebrity')
os.system('cp ../MME_Benchmark_release/celebrity/images/* images/celebrity/')
os.system('cp ../MME_Benchmark_release/celebrity/questions_answers_YN/* images/celebrity/')
os.system('cp -r ../MME_Benchmark_release/code_reasoning images/')
os.system('cp -r ../MME_Benchmark_release/color images/')
os.system('cp -r ../MME_Benchmark_release/commonsense_reasoning images/')
os.system('cp -r ../MME_Benchmark_release/count images/')
os.system('cp -r ../MME_Benchmark_release/existence images/')
os.system('mkdir images/landmark')
os.system('cp ../MME_Benchmark_release/landmark/images/* images/landmark/')
os.system('cp ../MME_Benchmark_release/landmark/questions_answers_YN/* images/landmark/')
os.system('cp -r ../MME_Benchmark_release/numerical_calculation images/')
os.system('cp -r ../MME_Benchmark_release/position images/')
os.system('mkdir images/posters')
os.system('cp ../MME_Benchmark_release/posters/images/* images/posters/')
os.system('cp ../MME_Benchmark_release/posters/questions_answers_YN/* images/posters/')
os.system('mkdir images/scene')
os.system('cp ../MME_Benchmark_release/scene/images/* images/scene/')
os.system('cp ../MME_Benchmark_release/scene/questions_answers_YN/* images/scene/')
os.system('cp -r ../MME_Benchmark_release/text_translation images/')
# Seed-Bench Evaluation
[SEED-Bench](https://huggingface.co/spaces/AILab-CVC/SEED-Bench_Leaderboard) is a multimodal benchmark of 19K multiple-choice questions with accurate human annotations for evaluating Multimodal LLMs, covering 12 evaluation dimensions including both **image** and **video** understanding.
Qwen-VL and Qwen-VL-Chat achieve SOTAs on this benchmark.
<p align="center">
<img src="leaderboard.jpg"/>
</p>
## How To Process Video by Qwen-VL
Qwen-VL and Qwen-VL-Chat were not trained on any video data or tasks, but they can understand some videos in a zero-shot manner. For the video question-answering task, we uniformly sample four frames per video. These frames are treated as separate images and are stitched into the context. For example:
```
{
"question_id": "v0",
"prompt": "<img>video_imgs_4/v0_0.jpg</img>\n<img>video_imgs_4/v0_1.jpg</img>\n<img>video_imgs_4/v0_2.jpg</img>\n<img>video_imgs_4/v0_3.jpg</img>\nQuestion: Can you identify the action taking place in the video?\nOptions: A. pretending to take something out of something\nB. pretending to take something from somewhere\nC. feigning to insert something into something\nD. simulating putting something onto something\nAnswer:"
}
```
The above JSON line can be used as input to `eval_mm/seed_bench/eval.py`, which outputs the following result:
```
{"question_id": "v0", "prediction": "B"}
```
Please see [eval_mm/seed_bench/eval.py](eval.py) for more inference details.
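To make the frame stitching concrete, below is a minimal, self-contained sketch of how four roughly evenly spaced frame indices can be chosen and formatted into the `<img>` prompt shown above. It mirrors the `get_index` helper in `eval_mm/seed_bench/trans.py`; the frame count, file names, and question id here are hypothetical, and the real pipeline extracts the frames with decord/pyav.
```
import numpy as np

def uniform_frame_indices(num_frames, num_segments=4):
    # pick `num_segments` roughly evenly spaced frame indices (same scheme as trans.py)
    if num_segments > num_frames:
        return np.arange(num_frames)
    seg_size = float(num_frames - 1) / num_segments
    start = int(seg_size / 2)
    return np.array([start + int(np.round(seg_size * i)) for i in range(num_segments)])

# hypothetical 120-frame clip for question id "v0"
indices = uniform_frame_indices(120, 4)
prompt = ''.join(f'<img>video_imgs_4/v0_{i}.jpg</img>\n' for i in range(len(indices)))
prompt += 'Question: ...\nOptions: A. ...\nB. ...\nC. ...\nD. ...\nAnswer:'
print(indices)  # four roughly evenly spaced frame indices
print(prompt)
```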
## How To Reproduce Results of Seed-Bench
1. Download all images and videos by following the [instruction](https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md). Then modify the root path in `eval_mm/seed_bench/trans.py` with your customized path.
```
# path of SEED-Bench.json, download from https://huggingface.co/datasets/AILab-CVC/SEED-Bench/blob/main/SEED-Bench.json
seed_bench_input_path = 'SEED-Bench.json'
# root directory of evaluation dimension 1-9, following https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md
cc3m_dir = "/YOUR_PATH_TO/seed_bench_image"
# root directory of evaluation dimension 10
dimension10_dir = "/YOUR_PATH_TO/SSV2/videos"
# root directory of evaluation dimension 11
dimension11_dir = "/YOUR_PATH_TO/EPIC-KITCHENS/3h91syskeag572hl6tvuovwv4d/videos/test"
# root directory of evaluation dimension 12
dimension12_dir = "/YOUR_PATH_TO/BreakfastII_15fps_qvga_sync"
```
2. Generate the Qwen-VL input files in JSONL format.
```
cd eval_mm/seed_bench/
python trans.py
```
This script outputs two JSONL files and one directory. `image_input.jsonl` is the input file for image evaluation, and `video_input_4.jsonl` is the input file for video evaluation with 4 frames per video. The directory `video_imgs_4` contains the frames extracted from the videos. We provide our [image_input.jsonl](http://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/seed_bench/image_input.jsonl) and [video_input_4.jsonl](http://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/seed_bench/video_input_4.jsonl) for reference. A small sanity check on the generated files is sketched below.
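If you want to verify the generated files before launching the evaluation, a check like the following can be used. It is only an illustrative sketch and assumes nothing beyond the JSONL layout produced by `trans.py`: one object per line with `question_id`, `prompt`, and `answer` fields.
```
import json

# assumes trans.py has been run in the current directory
with open('image_input.jsonl') as fin:
    records = [json.loads(line) for line in fin]

print(len(records), 'image questions')
print(records[0]['prompt'])            # should contain one <img>...</img> tag plus the options
print('answer:', records[0]['answer'])
```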
3. Produce the results of Seed-Bench.
```
# The number of available GPUs
export NPROC_PER_NODE=8
# Produce the Qwen-VL-Chat results of image understanding
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
eval.py \
--checkpoint Qwen/Qwen-VL-Chat \
--dataset image_input.jsonl \
--batch-size 4 \
--num-workers 2
# Collect the result files
cat result_?.jsonl >results_chat_img.jsonl
rm result_?.jsonl
# Produce the results of video understanding
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
eval.py \
--checkpoint Qwen/Qwen-VL-Chat \
--dataset video_input_4.jsonl \
--batch-size 2 \
--num-workers 1
# Collect the result files
cat result_?.jsonl >results_chat_vid.jsonl
rm result_?.jsonl
# The file `results_chat.jsonl` can be submitted to the leaderboard
cat results_chat_img.jsonl results_chat_vid.jsonl >results_chat.jsonl
```
You can reproduce the Seed-Bench results of Qwen-VL by replacing `Qwen/Qwen-VL-Chat` with `Qwen/Qwen-VL` in the script above.
import argparse
import itertools
import json
import os
from functools import partial
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
def collate_fn(batches, pad_token_id):
    input_tokens = [_['input_tokens'] for _ in batches]
    target_lengths = [_['target_lengths'] for _ in batches]
    answers = [_['answer'] for _ in batches]
    question_id = [_['question_id'] for _ in batches]

    chunk_sizes = [len(_) for _ in input_tokens]

    # flatten the per-sample candidate sequences into one batch
    input_tokens = [_ for _ in itertools.chain.from_iterable(input_tokens)]

    # left-pad every sequence to the longest one in the batch
    max_lengths = max([len(_) for _ in input_tokens])
    input_tokens = [[pad_token_id] * (max_lengths - len(_)) + _
                    for _ in input_tokens]
    input_tokens = torch.LongTensor(input_tokens)

    attention_mask = 1 - input_tokens.eq(pad_token_id).float()

    return input_tokens, attention_mask, target_lengths, answers, chunk_sizes, question_id
class MultipleChoiceDataset(torch.utils.data.Dataset):

    def __init__(self, test, tokenizer):
        self.datas = []
        with open(test) as fin:
            for line in tqdm(fin):
                self.datas.append(json.loads(line.strip()))
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.datas)

    def __getitem__(self, idx):
        data = self.datas[idx]
        prompt = data['prompt']

        prompt_tokens = self.tokenizer(prompt).input_ids
        # one candidate sequence per option letter, appended to the shared prompt
        target_tokens = [
            self.tokenizer(' ' + _).input_ids
            for _ in ['A', 'B', 'C', 'D']
        ]

        return {
            'input_tokens': [prompt_tokens + _ for _ in target_tokens],
            'target_lengths': [len(_) for _ in target_tokens],
            'answer': data['answer'],
            'question_id': data['question_id'],
        }
class InferenceSampler(torch.utils.data.sampler.Sampler):

    def __init__(self, size):
        self._size = int(size)
        assert size > 0
        self._rank = torch.distributed.get_rank()
        self._world_size = torch.distributed.get_world_size()
        self._local_indices = self._get_local_indices(size, self._world_size,
                                                      self._rank)

    @staticmethod
    def _get_local_indices(total_size, world_size, rank):
        shard_size = total_size // world_size
        left = total_size % world_size
        shard_sizes = [shard_size + int(r < left) for r in range(world_size)]

        begin = sum(shard_sizes[:rank])
        end = min(sum(shard_sizes[:rank + 1]), total_size)
        return range(begin, end)

    def __iter__(self):
        yield from self._local_indices

    def __len__(self):
        return len(self._local_indices)
if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--checkpoint', type=str, default='')
    parser.add_argument('--dataset', type=str, default='')
    parser.add_argument('--batch-size', type=int, default=1)
    parser.add_argument('--num-workers', type=int, default=1)
    args = parser.parse_args()

    torch.distributed.init_process_group(
        backend='nccl',
        world_size=int(os.getenv('WORLD_SIZE', '1')),
        rank=int(os.getenv('RANK', '0')),
    )
    torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))

    model = AutoModelForCausalLM.from_pretrained(
        args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
    tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
                                              trust_remote_code=True)
    model.generation_config = GenerationConfig.from_pretrained(args.checkpoint, trust_remote_code=True)
    model.generation_config.top_p = 0.01

    dataset = MultipleChoiceDataset(test=args.dataset, tokenizer=tokenizer)
    dataloader = torch.utils.data.DataLoader(
        dataset=dataset,
        # sampler=InferenceSampler(1000),
        sampler=InferenceSampler(len(dataset)),
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=partial(collate_fn, pad_token_id=tokenizer.eod_id),
    )

    results = []
    fout = open('result_{}.jsonl'.format(torch.distributed.get_rank()), 'w')
    with torch.no_grad():
        for _, (input_tokens, attention_mask, target_lengths, answers,
                chunk_sizes, question_ids) in tqdm(enumerate(dataloader)):
            outputs = model(
                input_ids=input_tokens[:, :-1].cuda(),
                attention_mask=attention_mask[:, :-1].cuda(),
                return_dict=True,
            )
            # per-token loss for every candidate sequence (labels shifted by one for next-token prediction)
            losses = torch.nn.functional.cross_entropy(
                outputs.logits.permute(0, 2, 1),
                input_tokens[:, 1:].cuda(),
                reduction='none')
            losses = losses.split(chunk_sizes, dim=0)

            for loss, target_length, answer, question_id in zip(losses, target_lengths,
                                                                answers, question_ids):
                # score each option by the mean loss over its own target tokens; the lowest loss wins
                target_loss = loss.mean(-1)
                for _ in range(len(target_length)):
                    target_loss[_] = loss[_, -target_length[_]:].mean()
                pred = target_loss.argmin().item()
                pred = chr(pred + 65)
                if pred == answer:
                    results.append(1)
                else:
                    results.append(0)
                answer_record = {
                    'question_id': question_id,
                    'prediction': pred
                }
                print(json.dumps(answer_record), file=fout)
    fout.close()

    torch.distributed.barrier()

    world_size = torch.distributed.get_world_size()
    merged_results = [None for _ in range(world_size)]
    torch.distributed.all_gather_object(merged_results, results)
    merged_results = [_ for _ in itertools.chain.from_iterable(merged_results)]

    if torch.distributed.get_rank() == 0:
        print(f"Evaluating {args.dataset} ...")
        print(f'Acc@1: {sum(merged_results) / len(merged_results)}')

    torch.distributed.barrier()
import os
import av
import json
import torch
import numpy as np
from PIL import Image
from tqdm import tqdm
from decord import VideoReader, cpu
# path of SEED-Bench.json, download from https://huggingface.co/datasets/AILab-CVC/SEED-Bench/blob/main/SEED-Bench.json
seed_bench_input_path = 'SEED-Bench.json'
# root directory of evaluation dimension 1-9, following https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md
cc3m_dir = "/YOUR_PATH_TO/seed_bench_image"
# root directory of evaluation dimension 10
dimension10_dir = "/YOUR_PATH_TO/SSV2/videos"
# root directory of evaluation dimension 11
dimension11_dir = "/YOUR_PATH_TO/EPIC-KITCHENS/3h91syskeag572hl6tvuovwv4d/videos/test"
# root directory of evaluation dimension 12
dimension12_dir = "/YOUR_PATH_TO/BreakfastII_15fps_qvga_sync"
def is_integer_string(s):
    try:
        int(s)
        return True
    except ValueError:
        return False


def filter_questions(data, task='all'):
    if task == "image":
        return [q for q in data if 1 <= q["question_type_id"] <= 9]
    elif task == "video":
        return [q for q in data if 10 <= q["question_type_id"] <= 12]
    elif task == "all":
        return data
    elif is_integer_string(task):
        return [q for q in data if q["question_type_id"] == int(task)]
    else:
        raise ValueError(f"Invalid task: {task}")


def get_index(num_frames, num_segments):
    if num_segments > num_frames:
        offsets = np.array([
            idx for idx in range(num_frames)
        ])
    else:
        # uniform sampling
        seg_size = float(num_frames - 1) / num_segments
        start = int(seg_size / 2)
        offsets = np.array([
            start + int(np.round(seg_size * idx)) for idx in range(num_segments)
        ])
    return offsets
with open(seed_bench_input_path) as fin:
    qa_anno = json.load(fin)['questions']

fout = open('image_input.jsonl', 'w')
i_anno = filter_questions(qa_anno, 'image')
for qa_item in tqdm(i_anno):
    data_path = cc3m_dir + qa_item['data_id']

    choices = [qa_item['choice_a'], qa_item['choice_b'], qa_item['choice_c'], qa_item['choice_d']]
    choice_list = []
    for i, c in enumerate(choices):
        choice_list.append('{}. {}'.format(chr(i + 65), c))
    choice_txt = '\n'.join(choice_list)

    prompt = '<img>{}</img>\nQuestion: {}\nOptions: {}\nAnswer:'.format(
        data_path, qa_item['question'], choice_txt)

    print(json.dumps({
        'question_id': qa_item['question_id'],
        'prompt': prompt,
        'answer': qa_item['answer'],
    }), file=fout)
fout.close()
n_frames = 8
os.system('rm -rf video_input_' + str(n_frames))
os.makedirs('video_imgs_' + str(n_frames), exist_ok=True)
fout = open('video_input_{}.jsonl'.format(n_frames), 'w')
v_anno = filter_questions(qa_anno, 'video')
for qa_item in tqdm(v_anno):
    if qa_item['question_type_id'] == 12:
        data_path = dimension12_dir + qa_item['data_id']
    elif qa_item['question_type_id'] == 11:
        data_path = dimension11_dir + qa_item['data_id'].split('/')[-1]
    elif qa_item['question_type_id'] == 10:
        data_path = dimension10_dir + qa_item['data_id']
    else:
        assert False, str(qa_item)
    print(data_path)

    use_pyav = False
    if 'segment' in qa_item.keys():
        segment = qa_item['segment']
        if isinstance(segment[0], int):
            # using pyav for decoding videos in evaluation dimension 12
            use_pyav = True
        start, end = segment[0], segment[1]
    else:
        start = 0.0
        end = 0.0

    if use_pyav:
        # using pyav for decoding videos in evaluation dimension 12
        reader = av.open(data_path)
        frames = [torch.from_numpy(f.to_rgb().to_ndarray()) for f in reader.decode(video=0)]
        video_len = len(frames)
        start_frame, end_frame = start, end
        end_frame = min(end_frame, video_len)
        offset = get_index(end_frame - start_frame, n_frames)
        frame_indices = offset + start_frame
        images = torch.stack([frames[idx] for idx in frame_indices]).numpy()
    else:
        # using decord for decoding videos in evaluation dimension 10-11
        try:
            vr = VideoReader(data_path, num_threads=1, ctx=cpu(0))
            video_len = len(vr)
            fps = vr.get_avg_fps()
            if 'segment' in qa_item.keys():
                # obtain start and end frame for the video segment in evaluation dimension 11
                start_frame = int(min(max(start * fps, 0), video_len - 1))
                end_frame = int(min(max(end * fps, 0), video_len - 1))
                tot_frames = int(end_frame - start_frame)
                offset = get_index(tot_frames, n_frames)
                frame_indices = offset + start_frame
            else:
                # sample frames of the video in evaluation dimension 10
                frame_indices = get_index(video_len - 1, n_frames)
            vr.seek(0)
            images = vr.get_batch(frame_indices).asnumpy()
        except Exception as e:
            print(json.dumps({
                'question_id': qa_item['question_id'],
                'prompt': "Error" + str(e),
                'answer': qa_item['answer'],
            }), file=fout)
            continue

    prompt = ''
    for i in range(images.shape[0]):
        data = Image.fromarray(images[i])
        img_path = 'video_imgs_{}/{}_{}.jpg'.format(n_frames, qa_item['question_id'], i)
        data.save(img_path)
        prompt += '<img>' + img_path + '</img>\n'

    choices = [qa_item['choice_a'], qa_item['choice_b'], qa_item['choice_c'], qa_item['choice_d']]
    choice_list = []
    for i, c in enumerate(choices):
        choice_list.append('{}. {}'.format(chr(i + 65), c))
    choice_txt = '\n'.join(choice_list)
    prompt += 'Question: {}\nOptions: {}\nAnswer:'.format(qa_item['question'], choice_txt)

    print(json.dumps({
        'question_id': qa_item['question_id'],
        'prompt': prompt,
        'answer': qa_item['answer'],
    }), file=fout)
fout.close()