# preprocess_glianorex.py
import datasets


def doc_to_text(doc) -> str:
    """Render a multiple-choice document as a prompt string.

    Args:
        doc: Mapping with a ``"question"`` entry and an ``"options"`` entry;
            ``"options"`` must itself be a mapping (its ``items()`` are
            iterated) from option label to option text.

    Returns:
        The question line, then one ``"<label>. <text>"`` line per option in
        the mapping's iteration order, then a trailing ``"Answer:"`` cue.
    """
    # One newline-terminated line per option.
    option_lines = []
    for label, text in doc["options"].items():
        option_lines.append(f"{label}. {text}\n")

    pieces = [f"Question: {doc['question']}\n", *option_lines, "Answer:"]
    return "".join(pieces)


10
11
def doc_to_target(doc) -> str:
    """Return the expected answer label for a document.

    Args:
        doc: Mapping that contains an ``"answer_idx"`` entry.

    Returns:
        The value stored under ``"answer_idx"``, unchanged.

    Raises:
        KeyError: If ``doc`` has no ``"answer_idx"`` key.
    """
    # answer_idx is `A`, `B`, `C`, `D` etc.
    return doc["answer_idx"]


def filter_dataset(dataset: datasets.Dataset, lang: str) -> datasets.Dataset:
    """Keep only the examples whose ``"language"`` value starts with ``lang``.

    Args:
        dataset: Object exposing ``filter(predicate)``; annotated as a
            ``datasets.Dataset``.
        lang: Prefix that each example's ``"language"`` string must start
            with for the example to be kept.

    Returns:
        Whatever ``dataset.filter`` returns for the prefix predicate.
    """

    def _has_language_prefix(example) -> bool:
        # Prefix match, so a longer tag that begins with ``lang`` also passes.
        return example["language"].startswith(lang)

    return dataset.filter(_has_language_prefix)


def filter_french(dataset: datasets.Dataset) -> datasets.Dataset:
    """Return ``filter_dataset(dataset, "fr")``: examples whose language starts with ``"fr"``."""
    language_prefix = "fr"
    return filter_dataset(dataset, lang=language_prefix)


def filter_english(dataset: datasets.Dataset) -> datasets.Dataset:
    """Return ``filter_dataset(dataset, "en")``: examples whose language starts with ``"en"``."""
    language_prefix = "en"
    return filter_dataset(dataset, lang=language_prefix)