Unverified commit 19b0f529, authored by Leo Gao and committed by GitHub
Browse files

Merge pull request #111 from jon-tow/wsc273-evaluation

Implement `WSC273` evaluation and data processing
parents e12d0078 bc5495d2
...@@ -57,7 +57,7 @@ TASK_REGISTRY = { ...@@ -57,7 +57,7 @@ TASK_REGISTRY = {
"race": race.RACE, "race": race.RACE,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet # "naturalqs": naturalqs.NaturalQs, # not implemented yet
"webqs": webqs.WebQs, "webqs": webqs.WebQs,
# "wsc273": wsc273.WinogradSchemaChallenge273, # not implemented yet "wsc273": wsc273.WinogradSchemaChallenge273,
# "winogrande": winogrande.Winogrande, # not implemented yet # "winogrande": winogrande.Winogrande, # not implemented yet
"anli_r1": anli.ANLIRound1, "anli_r1": anli.ANLIRound1,
"anli_r2": anli.ANLIRound2, "anli_r2": anli.ANLIRound2,
......
import json import numpy as np
import random import random
import os from lm_eval.base import rf, mean
from lm_eval.base import Task from . common import HFTask
from ..utils import sh
"""
NOTE: This evaluation of Winograd Schema Challenge is based on `partial evaluation`
as described by Trinh & Le in Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.02847
"""
class WinogradSchemaChallenge273(HFTask):
DATASET_PATH = "winograd_wsc"
DATASET_NAME = "wsc273"
upper_pronouns = ["A", "An", "The", "She", "He",
"It", "They", "My", "His", "Her", "Their"]
class WinogradSchemaChallenge273(Task):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.data = self.__clean_data()
def download(self):
if not os.path.exists('data/wsc273'): def __clean_data(self):
sh(""" # The HF implementation of `wsc273` is not `partial evaluation` friendly.
mkdir -p data/wsc273 data = []
wget https://git.cse.msu.edu/bakerb15/nlp-final-project/raw/master/Winogard/reproduce/commonsense_test/wsc273.json -O data/wsc273/wsc273.json for doc in self.data["test"]:
""") doc["text"] = doc["text"].replace(" ", " ")
doc["options"][0] = self.__normalize_option(doc["options"][0], doc)
doc["options"][1] = self.__normalize_option(doc["options"][1], doc)
data.append(doc)
return {"test": data}
def __normalize_option(self, option, doc):
    """Rewrite a candidate option so it can stand in for the pronoun.

    Args:
        option: Candidate referent string (one entry of ``doc["options"]``).
        doc: Document dict; this method reads ``doc["pronoun"]``,
            ``doc["text"]`` and ``doc["pronoun_loc"]``.

    Returns:
        The option with ``'s`` appended when the pronoun is a possessive
        determiner, and with its leading word lowercased when that word is
        listed in ``self.upper_pronouns`` and the pronoun does not open a
        sentence. An empty or whitespace-only option is returned as is.
    """
    # Append `'s` to possessive determiner based options.
    if doc["pronoun"].lower() in ("my", "his", "her", "our", "their"):
        option += "'s"
    words = option.split()
    if not words:
        # Nothing to lowercase; avoids an IndexError on `words[0]`.
        return option
    leading_word = words[0]
    # The pronoun opens a sentence when it sits at the start of the text or
    # directly after ". ". Offsets below 2 are handled explicitly because a
    # negative index would silently wrap around to the end of the text.
    pronoun_loc = doc["pronoun_loc"]
    start_of_sentence = pronoun_loc < 2 or doc["text"][pronoun_loc - 2] == "."
    if not start_of_sentence and leading_word in self.upper_pronouns:
        # Lowercase only the leading word; an unbounded `replace` would also
        # rewrite later occurrences such as the "The" inside "Theater".
        return option.replace(leading_word, leading_word.lower(), 1)
    return option
def has_training_docs(self): def has_training_docs(self):
return False return False
...@@ -25,60 +51,35 @@ class WinogradSchemaChallenge273(Task): ...@@ -25,60 +51,35 @@ class WinogradSchemaChallenge273(Task):
def has_test_docs(self): def has_test_docs(self):
return True return True
def training_docs(self): def fewshot_examples(self, k):
return [] # NOTE: `super().fewshot_examples` samples from training docs which are
# not available for this test-set-only dataset.
def validation_docs(self): return random.sample(list(self.test_docs()), k)
return []
def test_docs(self):
myjson = json.load(open('data/wsc273/wsc273.json'))
return self.load_doc(myjson)
def fewshot_description(self): def fewshot_description(self):
# TODO: redo description # TODO: redo description
return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False." return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."
def load_doc(self, myjson): @classmethod
docs = [] def partial_context(cls, doc):
for i in range(0, 273 * 2, 2): # Substitute the pronoun in the original text with each candidate
item1 = myjson[i] # choice and ignore everything after.
item2 = myjson[i+1] context1 = doc["text"][:doc["pronoun_loc"]] + doc["options"][0]
context2 = doc["text"][:doc["pronoun_loc"]] + doc["options"][1]
if item1['question_id'] != item2['question_id']: return context1, context2
raise ValueError("WSC273 has missing completion pair.")
question_id = item1['question_id']
if item1['correctness'] == True: @classmethod
doc = { def partial_target(cls, doc):
'id': question_id, # The target is everything after the document specified pronoun.
'completions': { start_index = doc["pronoun_loc"] + len(doc["pronoun"])
'T': item1['substitution'], return doc["text"][start_index:].strip()
'F': item2['substitution'],
},
}
if item2['correctness'] == True:
doc = {
'id': question_id,
'completions': {
'F': item1['substitution'],
'T': item2['substitution'],
},
}
docs.append(doc)
return docs
def doc_to_text(self, doc): def doc_to_text(self, doc):
# TODO: implement context1, context2 = self.partial_context(doc)
pass return context1 + '\n' + context2 + '\n'
def doc_to_target(self, doc): def doc_to_target(self, doc):
# TODO: implement return self.partial_target(doc)
pass
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of """ Uses RequestFactory to construct Requests and returns an iterable of
...@@ -91,8 +92,11 @@ class WinogradSchemaChallenge273(Task): ...@@ -91,8 +92,11 @@ class WinogradSchemaChallenge273(Task):
language description, as well as the few shot examples, and the question language description, as well as the few shot examples, and the question
part of the document for `doc`. part of the document for `doc`.
""" """
# TODO: implement evaluation. target = self.partial_target(doc)
raise NotImplementedError('Evaluation not implemented') context1, context2 = self.partial_context(doc)
ll_context1, _ = rf.loglikelihood(context1, " " + target)
ll_context2, _ = rf.loglikelihood(context2, " " + target)
return ll_context1, ll_context2
def process_results(self, doc, results): def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a """Take a single document and the LM results and evaluates, returning a
...@@ -104,8 +108,9 @@ class WinogradSchemaChallenge273(Task): ...@@ -104,8 +108,9 @@ class WinogradSchemaChallenge273(Task):
:param results: :param results:
The results of the requests created in construct_requests. The results of the requests created in construct_requests.
""" """
# TODO: implement evaluation. return {
raise NotImplementedError('Evaluation not implemented') "acc": np.argmax(results) == doc["label"]
}
def aggregation(self): def aggregation(self):
""" """
...@@ -113,8 +118,9 @@ class WinogradSchemaChallenge273(Task): ...@@ -113,8 +118,9 @@ class WinogradSchemaChallenge273(Task):
A dictionary where keys are the names of submetrics and values are A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics functions that aggregate a list of metrics
""" """
# TODO: implement evaluation. return {
raise NotImplementedError('Evaluation not implemented') "acc": mean
}
def higher_is_better(self): def higher_is_better(self):
""" """
...@@ -122,5 +128,6 @@ class WinogradSchemaChallenge273(Task): ...@@ -122,5 +128,6 @@ class WinogradSchemaChallenge273(Task):
A dictionary where keys are the names of submetrics and values are A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better whether a higher value of the submetric is better
""" """
# TODO: implement evaluation. return {
raise NotImplementedError('Evaluation not implemented') "acc": True
}
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment