utils.py 1005 Bytes
Newer Older
lintangsutawika's avatar
lintangsutawika committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Capitalized words that may open an option. `__normalize_option` lowercases
# the leading word of an option when it appears in this list and the option
# is not being placed at the start of a sentence.
upper_pronouns = [
    # Articles.
    "A", "An", "The",
    # Subject pronouns.
    "She", "He", "It", "They",
    # Possessive determiners.
    "My", "His", "Her", "Their",
]


lintangsutawika's avatar
format  
lintangsutawika committed
16
def process_doc(dataset):
    """Normalize the text and both options of every document in `dataset`.

    Args:
        dataset: Object exposing a `map(fn)` method that applies `fn` to each
            document (presumably a Hugging Face `datasets` split -- confirm
            against the caller). Each document must provide a "text" string
            and an "options" sequence with at least two entries, plus the
            keys read by `__normalize_option`.

    Returns:
        Whatever `dataset.map` returns for the per-document cleaner.
    """

    def _clean(doc):
        # The HF implementation of `wsc273` is not `partial evaluation` friendly.
        doc["text"] = doc["text"].replace("  ", " ")
        # Normalize both candidate options in place, first then second.
        for idx in (0, 1):
            doc["options"][idx] = __normalize_option(doc, doc["options"][idx])
        return doc

    return dataset.map(_clean)

lintangsutawika's avatar
format  
lintangsutawika committed
26

lintangsutawika's avatar
lintangsutawika committed
27
28
29
30
31
32
33
34
35
def __normalize_option(doc, option):
    """Adapt `option` so it can stand in for the pronoun of `doc`.

    Two adjustments are applied:

    * If the document's pronoun is a possessive determiner, `'s` is appended
      to the option.
    * If the option's first word is listed in `upper_pronouns` and the pronoun
      does not start a sentence, only that first word is lowercased.

    Args:
        doc: Mapping with the keys "pronoun" (str), "text" (str) and
            "pronoun_loc" (int used as an index into "text"; presumably the
            character offset of the pronoun -- confirm against the dataset).
        option: Candidate replacement string for the pronoun.

    Returns:
        The normalized option string.
    """
    # Append `'s` to possessive determiner based options.
    if doc["pronoun"].lower() in ("my", "his", "her", "our", "their"):
        option += "'s"

    # An empty or whitespace-only option has no leading word to normalize.
    words = option.split()
    if not words:
        return option
    first_word = words[0]

    # Appropriately lowercase the leading word in the option.
    # A pronoun within the first two characters has no preceding sentence;
    # treat it as sentence-initial instead of letting `pronoun_loc - 2` wrap
    # around to the end of the text via negative indexing.
    pronoun_loc = doc["pronoun_loc"]
    start_of_sentence = pronoun_loc < 2 or doc["text"][pronoun_loc - 2] == "."
    if not start_of_sentence and first_word in upper_pronouns:
        # Limit the replacement to the first occurrence so later words that
        # merely contain the same letters (e.g. "The Theater") stay intact.
        return option.replace(first_word, first_word.lower(), 1)
    return option