# preprocess_wsc.py
from lm_eval.utils import general_detokenize


def default_doc_to_text(x):
    """Build the WSC prompt asking whether the pronoun refers to the noun.

    The pronoun occurrence (``span2_text``) is located in ``x["text"]`` via
    the word index ``x["span2_index"]`` and wrapped in asterisks. The marked
    passage is passed through ``general_detokenize`` and followed by the
    question and an ``Answer:`` cue.

    Args:
        x: Mapping with the keys ``"text"``, ``"span1_text"`` (the noun),
            ``"span2_text"`` (the pronoun) and ``"span2_index"``.

    Returns:
        The prompt string, ending in ``"Answer:"``.
    """
    raw_passage = x["text"]
    noun = x["span1_text"]
    pronoun = x["span2_text"]

    # NOTE: HuggingFace span indices are word-based not character-based.
    pre = " ".join(raw_passage.split()[: x["span2_index"]])

    # A separating space precedes the pronoun only when there is text before
    # it. Skipping one character unconditionally would, for a passage-initial
    # pronoun (span2_index == 0), drop the space *after* the pronoun and glue
    # it to the following word.
    separator = " " if pre else ""

    # NOTE(review): the character offset assumes the words of the raw passage
    # are separated by single spaces with no leading whitespace, so that
    # ``pre`` has the same length as the corresponding prefix of the passage.
    post = raw_passage[len(pre) + len(separator) + len(pronoun) :]
    passage = general_detokenize(pre + separator + "*{}*".format(pronoun) + post)

    text = (
        f"Passage: {passage}\n"
        + f'Question: In the passage above, does the pronoun "*{pronoun}*" refer to "*{noun}*"?\n'
        + "Answer:"
    )
    return text