# write_nlvr2.py
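"""Convert the NLVR2 dataset into Apache Arrow files.

For each split, the per-sentence JSON-lines annotations are grouped by
example identifier, the two images of every example are read in as raw
bytes, and the split is written to `<dataset_root>/nlvr2_<split>.arrow`.
"""
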
import json
import pandas as pd
import pyarrow as pa
import os

from tqdm import tqdm
from collections import defaultdict


def process(root, iden, row):
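    """Collect the sentences, labels, and raw image bytes for one example.

    `row` holds all per-sentence annotation dicts that share the base
    identifier `iden`.
    """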
    texts = [r["sentence"] for r in row]
    labels = [r["label"] for r in row]

    split = iden.split("-")[0]

    if iden.startswith("train"):
        # train images are sharded into subdirectories named in the annotation
        directory = row[0]["directory"]
        path = f"{root}/images/train/{directory}/{iden}"
    else:
        # dev/test1 images sit directly under <root>/<split>/
        path = f"{root}/{split}/{iden}"

    # every example pairs two images, stored as <iden>-img0.png and <iden>-img1.png
    with open(f"{path}-img0.png", "rb") as fp:
        img0 = fp.read()
    with open(f"{path}-img1.png", "rb") as fp:
        img1 = fp.read()

    return [img0, img1, texts, labels, iden]


def load_jsonl(path):
    """Read a JSON-lines annotation file into a list of dicts."""
    with open(path, "r") as fp:
        return [json.loads(line) for line in fp]


def make_arrow(root, dataset_root):
    """Convert every NLVR2 split under `root` into Arrow files in `dataset_root`."""
    train_data = load_jsonl(f"{root}/nlvr2/data/train.json")
    test1_data = load_jsonl(f"{root}/nlvr2/data/test1.json")
    dev_data = load_jsonl(f"{root}/nlvr2/data/dev.json")

    balanced_test1_data = load_jsonl(
        f"{root}/nlvr2/data/balanced/balanced_test1.json"
    )
    balanced_dev_data = load_jsonl(f"{root}/nlvr2/data/balanced/balanced_dev.json")

    unbalanced_test1_data = load_jsonl(
        f"{root}/nlvr2/data/unbalanced/unbalanced_test1.json"
    )
    unbalanced_dev_data = load_jsonl(
        f"{root}/nlvr2/data/unbalanced/unbalanced_dev.json"
    )

    splits = [
        "train",
        "dev",
        "test1",
        "balanced_dev",
        "balanced_test1",
        "unbalanced_dev",
        "unbalanced_test1",
    ]

    datas = [
        train_data,
        dev_data,
        test1_data,
        balanced_dev_data,
        balanced_test1_data,
        unbalanced_dev_data,
        unbalanced_test1_data,
    ]

    # bucket the per-sentence rows by base example identifier: the trailing
    # sentence index is dropped so each bucket covers one image pair
    annotations = dict()

    for split, data in zip(splits, datas):
        _annot = defaultdict(list)
        for row in tqdm(data):
            _annot["-".join(row["identifier"].split("-")[:-1])].append(row)
        annotations[split] = _annot

    for split in splits:
        # assemble one record per example, then write the split out as a
        # single Arrow file
        bs = [
            process(root, iden, row) for iden, row in tqdm(annotations[split].items())
        ]

        # column names ("questions"/"answers") follow the downstream loader's
        # convention, although NLVR2 pairs statements with true/false labels
        dataframe = pd.DataFrame(
            bs, columns=["image_0", "image_1", "questions", "answers", "identifier"],
        )

        table = pa.Table.from_pandas(dataframe)

        os.makedirs(dataset_root, exist_ok=True)
        with pa.OSFile(f"{dataset_root}/nlvr2_{split}.arrow", "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)
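

if __name__ == "__main__":
    # Minimal usage sketch; the paths below are placeholders, not part of the
    # original script. `root` should hold the nlvr2 repo checkout (for the
    # JSON annotations) plus the downloaded images; `dataset_root` receives
    # the .arrow files.
    make_arrow("/path/to/data", "/path/to/data/nlvr2_arrow")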