# scienceqa_data_preprocess.py (committed by dongchy920)
# Builds the ScienceQA train/val/test annotation JSON files from the
# problems file and the problem-id split file.
import json
from tqdm import tqdm
    
# Prefix for each answer index that gets its own letter. Any other index
# falls back to "(e) ", matching the behaviour of the original if/elif chain.
ANSWER_LABELS = {0: "(a) ", 1: "(b) ", 2: "(c) ", 3: "(d) "}
DEFAULT_ANSWER_LABEL = "(e) "


def format_answer(choices, answer_index):
    """Return the chosen option's text prefixed with its letter label.

    Args:
        choices: Sequence of answer option strings.
        answer_index: Index into ``choices`` of the correct option.

    Returns:
        A string such as ``"(b) some choice"``. Indices 0-3 map to
        ``(a)``-``(d)``; every other index is labelled ``(e)``.

    Raises:
        IndexError: If ``answer_index`` is out of range for ``choices``.
    """
    label = ANSWER_LABELS.get(answer_index, DEFAULT_ANSWER_LABEL)
    return label + choices[answer_index]


def build_annotations(problems: dict, problem_ids, split: str) -> list:
    """Build the annotation records for one dataset split.

    Args:
        problems: Mapping from problem id (as a string) to a record holding
            the ``image``, ``question``, ``choices``, ``answer``, ``hint``
            and ``lecture`` fields.
        problem_ids: Iterable of problem ids belonging to the split. May be
            wrapped in a progress bar by the caller.
        split: Split name used in the image path (``train``/``val``/``test``).

    Returns:
        A list of annotation dicts, one per problem that has an image, in the
        order the ids were supplied.

    Raises:
        KeyError: If an id is missing from ``problems`` or a record lacks one
            of the fields listed above.
    """
    annotations = []
    for problem_id in problem_ids:
        problem = problems[str(problem_id)]
        # Only problems that come with an image are kept.
        if problem["image"] is None:
            continue
        annotations.append({
            "image": f"scienceqa/images/{split}/{problem_id}/image.png",
            "question": problem["question"],
            "answer": format_answer(problem["choices"], problem["answer"]),
            "choices": problem["choices"],
            "context": problem["hint"] + " " + problem["lecture"],
            "question_id": problem_id,
        })
    return annotations


def main():
    """Read the problem and split files and write one JSON file per split."""
    # JSON text is UTF-8; be explicit so the result does not depend on the
    # platform's default locale encoding.
    with open("scienceqa_problems_path.json", "r", encoding="utf-8") as file:
        problems = json.load(file)

    with open("scienceqa_pid_splits.json", encoding="utf-8") as file:
        pid_splits = json.load(file)

    # Resolve every split up front so a missing key fails before any work.
    split_ids = {split: pid_splits[split] for split in ("train", "val", "test")}

    # Build all splits before writing anything, so a failure while processing
    # a later split does not leave a partial set of output files behind.
    annotations = {
        split: build_annotations(problems, tqdm(ids), split)
        for split, ids in split_ids.items()
    }

    # Files are written in the same order as before: train, test, val.
    for split in ("train", "test", "val"):
        out_path = f"/input/scienceqa/scienceqa_{split}.json"
        with open(out_path, "w", encoding="utf-8") as file:
            json.dump(annotations[split], file)


if __name__ == "__main__":
    main()