Commit e31b4b31 authored by Leo Gao
Browse files

Clean up code, remove some footguns

parent e723d3d5
...@@ -32,6 +32,7 @@ class Winogrande(HFTask): ...@@ -32,6 +32,7 @@ class Winogrande(HFTask):
return self.data["test"] return self.data["test"]
def fewshot_description(self): def fewshot_description(self):
# TODO: redo description
return "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in." return "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in."
def doc_to_text(self, doc): def doc_to_text(self, doc):
...@@ -48,8 +49,47 @@ class Winogrande(HFTask): ...@@ -48,8 +49,47 @@ class Winogrande(HFTask):
raise ValueError("Winogrande from HF datasets contained an invalid answer key") raise ValueError("Winogrande from HF datasets contained an invalid answer key")
return text.replace("_", answer) return text.replace("_", answer)
# TODO: Implement evaluation code def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
# ***IMPORTANT***: this evaluation function needs to be written for the new framework. :param doc:
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. The document as returned from training_docs, validation_docs, or test_docs.
# Remove this comment when the evaluation code is implemented. :param ctx: str
\ No newline at end of file The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
    """Evaluate a single document against the LM results for its requests.

    Once implemented, this should return a dict where keys are the names of
    submetrics and values are the values of the metric for that one
    document. Evaluation is not implemented yet, so this stub always raises.

    :param doc:
        The document as returned from training_docs, validation_docs, or test_docs.
    :param results:
        The results of the requests created in construct_requests.
    :raises NotImplementedError:
        Always, until evaluation is implemented for this task.
    """
    # TODO: implement evaluation.
    raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
    """Describe how per-document submetric values are combined.

    Evaluation is not implemented yet, so this stub always raises.

    :returns: {str: [float] -> float}
        A dictionary where keys are the names of submetrics and values are
        functions that aggregate a list of metrics
    :raises NotImplementedError:
        Always, until evaluation is implemented for this task.
    """
    # TODO: implement evaluation.
    raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
    """Describe, per submetric, whether larger values indicate better results.

    Evaluation is not implemented yet, so this stub always raises.

    :returns: {str: bool}
        A dictionary where keys are the names of submetrics and values are
        whether a higher value of the submetric is better
    :raises NotImplementedError:
        Always, until evaluation is implemented for this task.
    """
    # TODO: implement evaluation.
    raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
...@@ -38,8 +38,7 @@ class WinogradSchemaChallenge273(Dataset): ...@@ -38,8 +38,7 @@ class WinogradSchemaChallenge273(Dataset):
return self.load_doc(myjson) return self.load_doc(myjson)
def fewshot_description(self): def fewshot_description(self):
# This format is ONLY for the purposes of deduplication. For the task evaluation, we'll need to find a new strategy, # TODO: redo description
# to meet the needs of this particular task.
return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False." return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."
def load_doc(self, myjson): def load_doc(self, myjson):
...@@ -80,8 +79,48 @@ class WinogradSchemaChallenge273(Dataset): ...@@ -80,8 +79,48 @@ class WinogradSchemaChallenge273(Dataset):
text = doc['completions']['T'] + ' True. ' + doc['completions']['F'] + ' False.' text = doc['completions']['T'] + ' True. ' + doc['completions']['F'] + ' False.'
return text return text
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework. def construct_requests(self, doc, ctx):
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. """ Uses RequestFactory to construct Requests and returns an iterable of
# Remove this comment when the evaluation code is implemented. Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
    """Score one document using the LM results produced for it.

    The intended return value is a dict where keys are the names of
    submetrics and values are the values of the metric for that one
    document. Evaluation has not been written yet, so this stub always
    raises.

    :param doc:
        The document as returned from training_docs, validation_docs, or test_docs.
    :param results:
        The results of the requests created in construct_requests.
    :raises NotImplementedError:
        Always, until evaluation is implemented for this task.
    """
    # TODO: implement evaluation.
    raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
    """Report the aggregation function to use for each submetric.

    Evaluation has not been written yet, so this stub always raises.

    :returns: {str: [float] -> float}
        A dictionary where keys are the names of submetrics and values are
        functions that aggregate a list of metrics
    :raises NotImplementedError:
        Always, until evaluation is implemented for this task.
    """
    # TODO: implement evaluation.
    raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
    """Report, for each submetric, whether a higher value is better.

    Evaluation has not been written yet, so this stub always raises.

    :returns: {str: bool}
        A dictionary where keys are the names of submetrics and values are
        whether a higher value of the submetric is better
    :raises NotImplementedError:
        Always, until evaluation is implemented for this task.
    """
    # TODO: implement evaluation.
    raise NotImplementedError('Evaluation not implemented')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment