utils.py 932 Bytes
Newer Older
Uanu's avatar
Uanu committed
1
2
import re

3
4
import datasets

Uanu's avatar
Uanu committed
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

def preprocess(text):
    """Normalize a piece of text that carries bracketed markers.

    The steps run in this order:

    1. Strip leading and trailing whitespace.
    2. Turn every " [title]" marker (with its leading space) into ". ".
    3. Delete every remaining "[...]" tag, matching the shortest span.
    4. Replace each pair of consecutive spaces with one space.

    Step 4 is a single left-to-right pass, so a run of three or more spaces
    is shortened but not necessarily reduced to one space. Whitespace exposed
    at the edges by step 3 is not stripped again.
    """
    trimmed = text.strip()
    # Splitting on the marker and re-joining is the same substitution as
    # replacing " [title]" with ". ".
    sentence_split = ". ".join(trimmed.split(" [title]"))
    without_tags = re.sub("\\[.*?\\]", "", sentence_split)
    return without_tags.replace("  ", " ")


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Apply the multiple-choice row conversion to every row of ``dataset``.

    Each row is read through the keys ``id``, ``instruction``, ``option_a``
    through ``option_e`` and ``answer``. The per-row function returns a dict
    with:

    * ``id``: copied from the row unchanged.
    * ``query``: ``"Question: "`` + the preprocessed instruction +
      ``"\\nAnswer:"``.
    * ``choices``: the preprocessed options in a-to-e order, with falsy
      options (e.g. ``None`` or ``""``) left out.
    * ``gold``: the zero-based position of ``answer`` in ``A``..``E``.

    Args:
        dataset: The dataset whose rows are converted.

    Returns:
        The result of ``dataset.map`` with the per-row conversion.

    Raises:
        ValueError: If a row's ``answer`` is not one of ``"A"``..``"E"``.
        KeyError: If a row lacks one of the keys listed above.
    """

    def _process_doc(doc):
        return {
            "id": doc["id"],
            "query": "Question: " + preprocess(doc["instruction"]) + "\nAnswer:",
            "choices": [
                preprocess(option)
                for option in [
                    doc["option_a"],
                    doc["option_b"],
                    doc["option_c"],
                    doc["option_d"],
                    doc["option_e"],
                ]
                # Rows may carry fewer than five options; skip the missing ones.
                if option
            ],
            # NOTE(review): "gold" indexes the full A-E list, while "choices"
            # drops falsy options. This presumably relies on missing options
            # only ever being trailing ones -- confirm against the data.
            "gold": ["A", "B", "C", "D", "E"].index(doc["answer"]),
        }

    return dataset.map(_process_doc)