preprocess_hellaswag.py 427 Bytes
Newer Older
nikuya3's avatar
nikuya3 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
import re

def preprocess(text):
    """Clean bracketed markup out of a HellaSwag text snippet.

    Steps, applied in order:
      1. Strip leading and trailing whitespace.
      2. Turn every " [title]" marker into a sentence break (". ").
      3. Delete every remaining "[...]" tag (shortest match).
      4. Replace each pair of consecutive spaces with a single space.

    Step 4 is a single pass of ``str.replace``, so a run of three or more
    spaces is shortened but not necessarily reduced to one space.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    stripped = text.strip()
    # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
    with_sentence_breaks = stripped.replace(" [title]", ". ")
    without_tags = re.sub(r"\[.*?\]", "", with_sentence_breaks)
    return without_tags.replace("  ", " ")

def doc_to_text(doc):
    """Build the query string for a single document.

    The result has the form "<activity_label>: <ctx_a> <Ctx_b>", passed
    through ``preprocess`` to remove bracketed markup.

    Args:
        doc: A mapping with string values under the keys "ctx_a",
            "ctx_b" and "activity_label".

    Returns:
        The preprocessed query string.
    """
    # str.capitalize() upper-cases the first character of ctx_b and
    # lower-cases the rest of it.
    context = " ".join((doc["ctx_a"], doc["ctx_b"].capitalize()))
    labelled = ": ".join((doc["activity_label"], context))
    return preprocess(labelled)