mmbench_converter_dev.py 1.32 KB
Newer Older
wanglch's avatar
wanglch committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import base64
import io
import json
import os

import pandas as pd
from PIL import Image

'''
This script converts the mmbench_dev TSV file to JSONL.
'''

# Load the benchmark table; the source file is tab-separated.
datas = pd.read_csv(
    "data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv",
    sep="\t",
)

# Option column names, in order. The answer letter is later mapped to its
# position in this list.
global_choices = list("ABCD")

def decode_base64_to_image(base64_string):
    """Turn a base64-encoded image payload into a PIL image object.

    Args:
        base64_string: Base64 representation of an encoded image file.

    Returns:
        The image returned by ``Image.open`` for the decoded bytes.

    Raises:
        binascii.Error: If ``base64_string`` is not valid base64.
    """
    # Decode first so malformed input fails before any image parsing starts.
    raw_bytes = base64.b64decode(base64_string)
    buffer = io.BytesIO(raw_bytes)
    return Image.open(buffer)


# Convert every row of `datas` into one JSONL record and dump its image to disk.
#
# Decoded images are written into this folder. Create it up front so that
# image.save() does not raise FileNotFoundError on a fresh checkout.
images_dir = "data/mmbench/mmbench_dev_20230712/images"
os.makedirs(images_dir, exist_ok=True)

with open('./data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.jsonl', 'w',
          encoding="utf-8") as f:
    for _, data in datas.iterrows():
        index = int(data['index'])
        question = data['question']
        # A missing hint is recorded as the literal string 'N/A'.
        hint = data['hint'] if not pd.isna(data['hint']) else 'N/A'

        # Keep only the options that are actually present for this question.
        choices = [data[opt] for opt in global_choices if not pd.isna(data[opt])]

        # Answer letter -> zero-based position in global_choices.
        # Raises ValueError if the letter is not one of the known options.
        answer = global_choices.index(data['answer'])

        # Build the path once; it is used both for saving and in the record.
        image_path = "%s/%d.jpg" % (images_dir, index)

        image = decode_base64_to_image(data['image'])
        # JPEG cannot store alpha or palette modes; convert those so one such
        # image does not abort the whole conversion.
        if image.mode in ("RGBA", "LA", "P"):
            image = image.convert("RGB")
        image.save(image_path)

        f.write(json.dumps({
            "index": index,
            "image": image_path,
            "hint": hint,
            "question": question,
            "choices": choices,
            "answer": answer,
        }) + "\n")