Commit e31b4b31 authored by Leo Gao

Clean up code, remove some footguns

parent e723d3d5
@@ -38,7 +38,6 @@ class GPT2LM(LM):
             logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [batch, seq]
 
-            # TODO: implement isgreedy
             res.append((float(logits.sum()), bool(max_equal)))
 
         return res
...
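The removed TODO referred to the per-token greedy check that `max_equal` already stands for in the line above. Below is a minimal, self-contained sketch of that check; it is not part of this commit, and the tensor names and shapes are illustrative only.

import torch

# Hedged sketch of the "isgreedy" check behind max_equal: a continuation counts
# as greedy iff the model's argmax token matches the actual continuation token
# at every position. The dummy tensors below are illustrative, not from the diff.
vocab_size = 50257
full_logits = torch.randn(1, 4, vocab_size)        # [batch, seq, vocab], before the gather above
cont_toks = torch.randint(0, vocab_size, (1, 4))   # [batch, seq] continuation token ids

greedy_tokens = full_logits.argmax(dim=-1)          # [batch, seq]
max_equal = (greedy_tokens == cont_toks).all()      # True iff the continuation is the greedy decode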
@@ -45,11 +45,50 @@ class ANLIBase(HFTask):
     def doc_to_target(self, doc):
         return " " + ["True", "Neither", "False"][doc['label']]
 
-    # TODO: Implement evaluation code
-
-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-    # Remove this comment when the evaluation code is implemented.
+    def construct_requests(self, doc, ctx):
+        """ Uses RequestFactory to construct Requests and returns an iterable of
+        Requests which will be sent to the LM.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param ctx: str
+            The context string, generated by fewshot_context. This includes the natural
+            language description, as well as the few shot examples, and the question
+            part of the document for `doc`.
+        """
+        # TODO: implement evaluation.
+        raise NotImplementedError('Evaluation not implemented')
+
+    def process_results(self, doc, results):
+        """Take a single document and the LM results and evaluates, returning a
+        dict where keys are the names of submetrics and values are the values of
+        the metric for that one document
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param results:
+            The results of the requests created in construct_requests.
+        """
+        # TODO: implement evaluation.
+        raise NotImplementedError('Evaluation not implemented')
+
+    def aggregation(self):
+        """
+        :returns: {str: [float] -> float}
+            A dictionary where keys are the names of submetrics and values are
+            functions that aggregate a list of metrics
+        """
+        # TODO: implement evaluation.
+        raise NotImplementedError('Evaluation not implemented')
+
+    def higher_is_better(self):
+        """
+        :returns: {str: bool}
+            A dictionary where keys are the names of submetrics and values are
+            whether a higher value of the submetric is better
+        """
+        # TODO: implement evaluation.
+        raise NotImplementedError('Evaluation not implemented')
 
 class ANLIRound1(ANLIBase):
     SPLIT = 1
...
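For orientation, here is a minimal sketch of how a task such as ANLIBase could eventually fill in these four stubs for its three-way True / Neither / False choice, in the spirit of the BoolQ example the removed comments pointed at. It is not part of this commit: the `rf` import path, the behaviour of `rf.loglikelihood`, the assumption that each request comes back in `results` as a single log-probability float, and the class name are all assumptions.

# Hedged sketch only, not part of this commit. Assumes the module-level
# RequestFactory `rf` (rf.loglikelihood(context, continuation)) described in
# the docstrings above, with each request's result arriving in `results`
# as one log-probability float. Import path and class name are illustrative.
from lm_eval.base import rf  # assumed import location


class ANLISketch(ANLIBase):
    LABELS = ["True", "Neither", "False"]  # same order as doc_to_target above

    def construct_requests(self, doc, ctx):
        # One loglikelihood request per candidate label continuation.
        return [rf.loglikelihood(ctx, " " + label) for label in self.LABELS]

    def process_results(self, doc, results):
        # Pick the label whose continuation the model scored most likely.
        pred = max(range(len(self.LABELS)), key=lambda i: results[i])
        return {"acc": 1.0 if pred == doc['label'] else 0.0}

    def aggregation(self):
        # Average the per-document accuracies into one corpus-level number.
        return {"acc": lambda items: sum(items) / len(items)}

    def higher_is_better(self):
        return {"acc": True}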
@@ -25,9 +25,50 @@ class ARCEasy(HFTask):
     def doc_to_target(self, doc):
         return " " + doc['choices']['text'][doc['choices']['label'].index(doc['answerKey'])]
 
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: implement
-        raise NotImplementedError()
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
 
 class ARCChallenge(ARCEasy):
     DATASET_PATH = "ai2_arc"
...
@@ -33,22 +33,61 @@ class CoQA(Dataset):
         return json.load(open('data/coqa/coqa-dev-v1.0.json'))['data']
 
     def test_docs(self):
         pass
 
     def fewshot_description(self):
-        pass
+        # TODO: figure out description
+        return ""
 
-    def doc_to_text(self, doc, include_target=True):
-        text = [doc['story']]
-        for pair in zip(doc['questions'], doc['answers']):
-            text.append('\n\n')
-            text.append(''.join(['Q: ',pair[0]['input_text'], '\n\n']))
-            if include_target:
-                text.append(''.join(['A: ',pair[1]['input_text']]))
-            else:
-                text.append('A: ')
-
-        return ''.join(text)
-
-    def evaluate(self, docs, lm):
-        pass
+    def doc_to_text(self, doc):
+        # TODO: implement.
+        raise NotImplementedError('doc_to_text not implemented')
+
+    def doc_to_target(self, doc):
+        # TODO: implement.
+        raise NotImplementedError('doc_to_target not implemented')
+
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
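The removed include_target-style doc_to_text above suggests one natural split for the new interface: keep every question/answer turn except the last as context, and hold the final answer out as the target. A hedged sketch follows; it is not part of this commit, the class name is hypothetical, and treating only the last turn as the prediction target is an assumption.

# Hedged sketch only: one possible split of the removed CoQA doc_to_text
# logic across the new doc_to_text / doc_to_target pair. Field names follow
# the CoQA JSON used above; everything else is illustrative.
class CoQASketch(CoQA):
    def doc_to_text(self, doc):
        text = [doc['story']]
        # Earlier question/answer turns stay in the context...
        for q, a in zip(doc['questions'][:-1], doc['answers'][:-1]):
            text.append('\n\nQ: ' + q['input_text'] + '\n\nA: ' + a['input_text'])
        # ...and the final question is left open for the model to answer.
        text.append('\n\nQ: ' + doc['questions'][-1]['input_text'] + '\n\nA:')
        return ''.join(text)

    def doc_to_target(self, doc):
        return ' ' + doc['answers'][-1]['input_text']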
@@ -60,10 +60,50 @@ class DROP(Dataset):
         return ''.join([doctext, '\n'.join(qa_texts)])
 
     def fewshot_description(self):
-        return "Read the passage and answer the questions "
-
-    # TODO: Implement evaluation code
-
-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-    # Remove this comment when the evaluation code is implemented.
+        # TODO: figure out description
+        return ""
+
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
\ No newline at end of file
@@ -453,37 +453,47 @@ class STSB(HFTask):
     def doc_to_target(self, doc):
         return " {}".format(doc["label"])
 
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: Implement evaluation code using new framework
-
-        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
-        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-        # Remove this comment when the evaluation code is implemented.
-        golds = [doc["label"] for doc in docs]
-        preds = []
-        for doc in tqdm_lib.tqdm(docs):
-            ctx = self.fewshot_context(
-                doc=doc,
-                provide_description=provide_description,
-                num_fewshot=num_fewshot,
-            )
-            output = lm.generate(context=ctx, max_gen_length=5).strip()
-            first_element = output.split()[0]
-            if first_element.isnumeric():
-                pred = max(min(float(first_element), 5.0), 0.0)
-            else:
-                pred = 2.5
-                import pdb; pdb.set_trace()
-            preds.append(pred)
-        pearson_corr = float(pearsonr(preds, golds)[0])
-        spearman_corr = float(spearmanr(preds, golds)[0])
-        minor = {
-            "pearson": pearson_corr,
-            "spearmanr": spearman_corr,
-            "corr": (pearson_corr + spearman_corr) / 2,
-        }
-        return {
-            "major": minor["corr"],
-            "minor": minor,
-            "higher_is_better": True,
-        }
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
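The evaluate() body removed above already contains both halves the new interface asks for: per-document parsing of the model's numeric guess, and corpus-level Pearson/Spearman correlations. Below is a hedged sketch of how they could be redistributed. It is not part of this commit: the generation-style request rf.greedy_until, its result format, the import paths, and the class name are assumptions, and the aggregation callables here receive (prediction, gold) pairs rather than the bare floats the docstring describes.

# Hedged sketch only: reuses the parsing and correlation logic from the
# removed evaluate(); the request type, result format, imports, and class
# name are assumptions rather than part of this commit.
from scipy.stats import pearsonr, spearmanr

from lm_eval.base import rf  # assumed import location


class STSBSketch(STSB):
    def construct_requests(self, doc, ctx):
        # Ask the LM to continue the context, stopping at a newline (assumed request type).
        return rf.greedy_until(ctx, ['\n'])

    def process_results(self, doc, results):
        output = results[0].strip()
        try:
            # Clamp a numeric prediction into the 0-5 similarity range, as the old code did.
            pred = max(min(float(output.split()[0]), 5.0), 0.0)
        except (IndexError, ValueError):
            pred = 2.5  # fallback the removed code used for non-numeric output
        # Correlations need (prediction, gold) pairs, so pass both through.
        return {"pearson": (pred, doc["label"]), "spearmanr": (pred, doc["label"])}

    def aggregation(self):
        # Corpus-level correlations over the collected (prediction, gold) pairs.
        return {
            "pearson": lambda pairs: float(pearsonr(*zip(*pairs))[0]),
            "spearmanr": lambda pairs: float(spearmanr(*zip(*pairs))[0]),
        }

    def higher_is_better(self):
        return {"pearson": True, "spearmanr": True}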
@@ -51,8 +51,47 @@ class HellaSwag(HFTask):
             raise ValueError("HellaSwag from HF datasets contained an invalid answer key")
         return doc['endings'][index]
 
-    # TODO: Implement evaluation code
-
-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-    # Remove this comment when the evaluation code is implemented.
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
\ No newline at end of file
@@ -42,12 +42,53 @@ class Lambada(Dataset):
         return self.load_doc(myjson)
 
-    def doc_to_text(self, doc, include_target=True):
-        #TODO: check if this is how OA does it
-        #label = doc[]
-        return doc
-
-    # TODO: Implement evaluation code
-
-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-    # Remove this comment when the evaluation code is implemented.
+    def doc_to_text(self, doc):
+        # TODO: implement.
+
+    def fewshot_description(self):
+        # TODO: figure out description
+        return ""
+
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
\ No newline at end of file
@@ -46,8 +46,47 @@ class NaturalQs(HFTask):
         long_answer = " ".join(long_answer_chars)
         return long_answer  # Replace with short_answer[0] for short answer
 
-    # TODO: Implement evaluation code
-
-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-    # Remove this comment when the evaluation code is implemented.
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
\ No newline at end of file
@@ -53,8 +53,47 @@ class OpenBookQA(HFTask):
             raise ValueError("OpenBookQA from HF datasets contained an invalid answer key")
         return doc['choices']['text'][index] + '.'
 
-    # TODO: Implement evaluation code
-
-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-    # Remove this comment when the evaluation code is implemented.
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
\ No newline at end of file
@@ -45,7 +45,8 @@ class PiQA(Dataset):
         return self.load_docs('data/piqa/piqa-test.jsonl', None)
 
     def fewshot_description(self):
-        pass
+        # TODO: figure out fewshot description
+        return ""
 
     def doc_to_text(self, doc):
         #TODO: check if oa uses newline
@@ -55,8 +56,47 @@ class PiQA(Dataset):
         rightanswer = int(doc[1][0]) + 1
         return ''.join([doc[0]['goal'],' ',doc[0]['sol'+str(rightanswer)]])
 
-    # TODO: Implement evaluation code
-
-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-    # Remove this comment when the evaluation code is implemented.
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
\ No newline at end of file
@@ -40,6 +40,7 @@ class QuAC(Dataset):
         raise NotImplementedError("QuAC has no test docs.")
 
     def fewshot_description(self):
+        # TODO: figure out fewshot description
         desc = "TITLE: Title of the context passage - subtitle of the passage\nPARAGRAPH: Passage describing the relevant information for answering questions.\n\nQ: Text of a question.\n\nA: Answer to the question, based on the passage. If it cannot be answered based on the passage, write CANNOTANSWER"
         return desc
@@ -61,8 +62,47 @@ class QuAC(Dataset):
     def doc_to_target(self, doc):
         return doc['answer']
 
-    # TODO: Implement evaluation code
-
-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-    # Remove this comment when the evaluation code is implemented.
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
\ No newline at end of file
@@ -67,8 +67,47 @@ class RACE(HFTask):
         return r
 
-    # TODO: Implement evaluation code
-
-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-    # Remove this comment when the evaluation code is implemented.
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
\ No newline at end of file
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import json
 import random
 import os
...
@@ -42,8 +42,47 @@ class SQuAD(HFTask):
             answer = 'unanswerable'
         return answer
 
-    # TODO: Implement evaluation code
-
-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-    # Remove this comment when the evaluation code is implemented.
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
\ No newline at end of file
@@ -39,7 +39,8 @@ class StoryCloze(Dataset):
     def fewshot_description(self):
-        pass
+        # TODO: figure out fewshot description
+        return ""
 
     def doc_to_text(self, doc):
         return ' '.join([*doc[1:5]])
@@ -47,9 +48,47 @@ class StoryCloze(Dataset):
     def doc_to_target(self, doc):
         return " " + doc[int(doc[-1]) - 4]
 
-    # TODO: Implement evaluation code
-
-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-    # Remove this comment when the evaluation code is implemented.
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
\ No newline at end of file
@@ -356,9 +356,47 @@ class RTE(HFTask):
     def doc_to_target(self, doc):
         return 'True' if doc['label'] == 0 else 'False'
 
-    # TODO: Implement evaluation code
-
-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-    # Remove this comment when the evaluation code is implemented.
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
\ No newline at end of file
@@ -37,7 +37,8 @@ class TriviaQA(Dataset):
         return json.load(open('data/triviaqa/triviaqa-unfiltered/unfiltered-web-test.json'))['Data']
 
     def fewshot_description(self):
-        pass
+        # TODO: figure out fewshot description
+        return ""
 
     def doc_to_text(self, doc):
         return ''.join(['Q: ', doc['Question'], '\n\n','A: '])
@@ -45,8 +46,47 @@ class TriviaQA(Dataset):
     def doc_to_target(self, doc):
         return doc['Answer']['Aliases'][0]
 
-    # TODO: Implement evaluation code
-
-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-    # Remove this comment when the evaluation code is implemented.
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
\ No newline at end of file
@@ -29,8 +29,47 @@ class WebQs(HFTask):
         # TODO: make sure we're actually handling multi-answer correctly
         return " " + doc['answers'][0]
 
-    # TODO: Implement evaluation code
-
-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-    # Remove this comment when the evaluation code is implemented.
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
\ No newline at end of file
@@ -11,16 +11,59 @@ class WikiText103(NLP_TASK):
     NLP_NAME = "wikitext-103-raw-v1"
 
     def fewshot_description(self):
+        # TODO: figure out fewshot description
         return ""
 
-    def doc_to_text(self, doc, include_target=True):
-        return doc['text']
-
-    # TODO: Implement evaluation code
-
-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-    # Remove this comment when the evaluation code is implemented.
+    def doc_to_text(self, doc):
+        # TODO: implement
+
+    def doc_to_target(self, doc):
+        # TODO: implement
+
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
 
 class WikiText2(NLP_TASK):
@@ -28,13 +71,56 @@ class WikiText2(NLP_TASK):
     NLP_NAME = "wikitext-2-raw-v1"
 
    def fewshot_description(self):
+        # TODO: figure out fewshot description
         return ""
 
-    def doc_to_text(self, doc, include_target=True):
-        return doc['text']
-
-    # TODO: Implement evaluation code
-
-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-    # Remove this comment when the evaluation code is implemented.
+    def doc_to_text(self, doc):
+        # TODO: implement
+
+    def doc_to_target(self, doc):
+        # TODO: implement
+
+    [... identical construct_requests / process_results / aggregation / higher_is_better stubs as added to ANLIBase above ...]
\ No newline at end of file