# If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with more memory,
# we can always modify this plumbing to support that, but I didn't want to include it just yet because overengineering is bad.
# (Alternatively, we could write the requests to disk and read them back out again - probably using an sqlite db because of all the moving parts we have.)
# TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
docs={}
# get lists of each type of request
fortask_name,taskintask_dict_items:
# Default to the validation docs; fall back to the test docs if validation is unavailable.
# TODO: the val-fallback-to-test system isn't final; we should revisit it at some point.
This evaluation of Winogrande uses partial evaluation as described by
Trinh & Le in Simple Method for Commonsense Reasoning (2018).
Reference: https://arxiv.org/abs/1806.02847
"""
classWinogrande(HFTask):
classWinogrande(HFTask):
DATASET_PATH="winogrande"
DATASET_PATH="winogrande"
...
@@ -17,35 +22,31 @@ class Winogrande(HFTask):
...
@@ -17,35 +22,31 @@ class Winogrande(HFTask):
def has_test_docs(self):
    """Report whether test documents are available.

    Returns:
        bool: Always ``True``.
    """
    return True
def training_docs(self):
    """Return the training documents, if there are any.

    Returns:
        ``self.data["train"]`` when ``self.has_training_docs()`` is truthy,
        otherwise ``None``.
    """
    if self.has_training_docs():
        return self.data["train"]
    # Make the "no such split" result explicit instead of falling off the end.
    return None
def validation_docs(self):
    """Return the validation documents, if there are any.

    Returns:
        ``self.data["validation"]`` when ``self.has_validation_docs()`` is
        truthy, otherwise ``None``.
    """
    if self.has_validation_docs():
        return self.data["validation"]
    # Make the "no such split" result explicit instead of falling off the end.
    return None
deftest_docs(self):
ifself.has_test_docs():
returnself.data["test"]
def fewshot_description(self):
    """Return the fixed task description string.

    Returns:
        str: A constant, human-readable description of the task.
    """
    # TODO: redo description
    return "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in."
@classmethod
defpartial_context(cls,doc):
# Substitute the pronoun in the sentence with each candidate choice