utils.py 901 Bytes
Newer Older
Uanu's avatar
Uanu committed
1
2
import re

3
4
import datasets

Uanu's avatar
Uanu committed
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21

def preprocess(text):
    """Clean a raw text field before it is placed into a prompt.

    Args:
        text: The string to clean, or ``None``.

    Returns:
        A single space when ``text`` is ``None``. Otherwise the input with
        surrounding whitespace stripped, every ``" [title]"`` marker turned
        into ``". "``, every remaining ``[...]`` span removed, and double
        spaces replaced by single spaces.
    """
    if text is None:
        return " "

    cleaned = text.strip().replace(" [title]", ". ")
    # Non-greedy so each bracketed tag is removed on its own; "." does not
    # match newlines, so a tag spanning several lines is left untouched.
    cleaned = re.sub(r"\[.*?\]", "", cleaned)
    # Single left-to-right pass: runs of three or more spaces get shorter
    # but are not necessarily reduced to exactly one space.
    return cleaned.replace("  ", " ")


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Convert raw multiple-choice rows into query/choices/gold form.

    Each row is expected to provide the keys ``id``, ``instruction``,
    ``option_a`` through ``option_e`` and ``answer``.

    Args:
        dataset: The dataset whose rows should be converted.

    Returns:
        The result of ``dataset.map`` applied with the per-row converter.
        Each converted row carries ``id``, ``query`` (the cleaned instruction
        wrapped as ``"Question: ...\\nAnswer:"``), ``choices`` (the five
        cleaned options in A-E order) and ``gold`` (zero-based index of the
        correct option).

    Raises:
        ValueError: If a row's ``answer`` is not one of "A" through "E".
    """
    option_fields = ["option_a", "option_b", "option_c", "option_d", "option_e"]
    answer_letters = ["A", "B", "C", "D", "E"]

    def _process_doc(doc):
        doc_id = doc["id"]
        question = preprocess(doc["instruction"])
        choices = [preprocess(doc[field]) for field in option_fields]
        # The position of the answer letter doubles as the index into choices.
        gold = answer_letters.index(doc["answer"])
        return {
            "id": doc_id,
            "query": "Question: " + question + "\nAnswer:",
            "choices": choices,
            "gold": gold,
        }

    return dataset.map(_process_doc)