return"Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in."
return"Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in."
defdoc_to_text(self,doc):
defdoc_to_text(self,doc):
...
@@ -48,8 +49,47 @@ class Winogrande(HFTask):
...
@@ -48,8 +49,47 @@ class Winogrande(HFTask):
raiseValueError("Winogrande from HF datasets contained an invalid answer key")
raiseValueError("Winogrande from HF datasets contained an invalid answer key")
returntext.replace("_",answer)
returntext.replace("_",answer)
# TODO: Implement evaluation code
def construct_requests(self, doc, ctx):
    """Uses RequestFactory to construct Requests and returns an iterable of
    Requests which will be sent to the LM.

    :param doc:
        The document as returned from training_docs, validation_docs, or test_docs.
    :param ctx: str
        The context string, generated by fewshot_context. This includes the natural
        language description, as well as the few shot examples, and the question
        part of the document for `doc`.
    :raises NotImplementedError:
        Always; the evaluation code for this task has not been written yet.
    """
    # TODO: implement evaluation. This function still needs to be written for
    # the new framework; for more info, check out the interface in base.py and
    # the example BoolQ implementation in superglue.py.
    raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
    """Take a single document and the LM results and evaluates, returning a
    dict where keys are the names of submetrics and values are the values of
    the metric for that one document

    :param doc:
        The document as returned from training_docs, validation_docs, or test_docs.
    :param results:
        The results of the requests created in construct_requests.
    :raises NotImplementedError:
        Always; the evaluation code for this task has not been written yet.
    """
    # TODO: implement evaluation.
    raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
    """
    :returns: {str: [float] -> float}
        A dictionary where keys are the names of submetrics and values are
        functions that aggregate a list of metrics
    :raises NotImplementedError:
        Always; the evaluation code for this task has not been written yet.
    """
    # TODO: implement evaluation.
    raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
    """
    :returns: {str: bool}
        A dictionary where keys are the names of submetrics and values are
        whether a higher value of the submetric is better
    :raises NotImplementedError:
        Always; the evaluation code for this task has not been written yet.
    """
    # TODO: implement evaluation.
    raise NotImplementedError('Evaluation not implemented')