Commit 2d2dbf96 authored by Leo Gao
Browse files

Add comments to remind that evaluation needs to be written for the new framework

parent 6803e647
...@@ -45,9 +45,11 @@ class ANLIBase(HFTask): ...@@ -45,9 +45,11 @@ class ANLIBase(HFTask):
a = "True, False, or Neither?" + ((" " + ["True", "Neither", "False"][doc['label']]) if include_target else '') a = "True, False, or Neither?" + ((" " + ["True", "Neither", "False"][doc['label']]) if include_target else '')
return q + a return q + a
def evaluate(self, docs, lm, provide_description, num_fewshot): # TODO: Implement evaluation code
# TODO: implement
raise NotImplementedError() # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
class ANLIRound1(ANLIBase): class ANLIRound1(ANLIBase):
SPLIT = 1 SPLIT = 1
......
...@@ -58,23 +58,12 @@ class DROP(Dataset): ...@@ -58,23 +58,12 @@ class DROP(Dataset):
text = ''.join([text, get_answer(pair['answer'])]) text = ''.join([text, get_answer(pair['answer'])])
qa_texts.append(text) qa_texts.append(text)
return ''.join([doctext, '\n'.join(qa_texts)]) return ''.join([doctext, '\n'.join(qa_texts)])
def evaluate(self, docs, lm, provide_description, num_fewshot):
"""Take iterable of docs and evaluates, returning a dict with the following format:
{
"major": float,
"minor": dict,
"higher_is_better": bool,
}
* `major` should be a single, representative number, for programmatic comparison
* `minor` should be a dictionary containing all relevant sub-metrics
* `higher_is_better` determines whether a higher metric is better
"""
pass
def fewshot_description(self): def fewshot_description(self):
return "Read the passage and answer the questions " return "Read the passage and answer the questions "
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
...@@ -46,6 +46,12 @@ class CoLA(HFTask): ...@@ -46,6 +46,12 @@ class CoLA(HFTask):
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs] golds = [doc["label"] for doc in docs]
preds = [] preds = []
for doc in tqdm_lib.tqdm(docs): for doc in tqdm_lib.tqdm(docs):
...@@ -99,6 +105,11 @@ class MNLI(HFTask): ...@@ -99,6 +105,11 @@ class MNLI(HFTask):
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs] golds = [doc["label"] for doc in docs]
preds = [] preds = []
for doc in tqdm_lib.tqdm(docs): for doc in tqdm_lib.tqdm(docs):
...@@ -153,6 +164,11 @@ class MRPC(HFTask): ...@@ -153,6 +164,11 @@ class MRPC(HFTask):
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs] golds = [doc["label"] for doc in docs]
preds = [] preds = []
for doc in tqdm_lib.tqdm(docs): for doc in tqdm_lib.tqdm(docs):
...@@ -190,6 +206,11 @@ class RTE(HFTask): ...@@ -190,6 +206,11 @@ class RTE(HFTask):
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs] golds = [doc["label"] for doc in docs]
preds = [] preds = []
for doc in tqdm_lib.tqdm(docs): for doc in tqdm_lib.tqdm(docs):
...@@ -227,6 +248,11 @@ class QNLI(HFTask): ...@@ -227,6 +248,11 @@ class QNLI(HFTask):
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs] golds = [doc["label"] for doc in docs]
preds = [] preds = []
for doc in tqdm_lib.tqdm(docs): for doc in tqdm_lib.tqdm(docs):
...@@ -265,6 +291,11 @@ class QQP(HFTask): ...@@ -265,6 +291,11 @@ class QQP(HFTask):
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs] golds = [doc["label"] for doc in docs]
preds = [] preds = []
for doc in tqdm_lib.tqdm(docs): for doc in tqdm_lib.tqdm(docs):
...@@ -304,6 +335,11 @@ class STSB(HFTask): ...@@ -304,6 +335,11 @@ class STSB(HFTask):
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs] golds = [doc["label"] for doc in docs]
preds = [] preds = []
for doc in tqdm_lib.tqdm(docs): for doc in tqdm_lib.tqdm(docs):
...@@ -359,6 +395,11 @@ class SST(HFTask): ...@@ -359,6 +395,11 @@ class SST(HFTask):
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs] golds = [doc["label"] for doc in docs]
preds = [] preds = []
for doc in tqdm_lib.tqdm(docs): for doc in tqdm_lib.tqdm(docs):
...@@ -397,6 +438,11 @@ class WNLI(HFTask): ...@@ -397,6 +438,11 @@ class WNLI(HFTask):
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs] golds = [doc["label"] for doc in docs]
preds = [] preds = []
for doc in tqdm_lib.tqdm(docs): for doc in tqdm_lib.tqdm(docs):
......
...@@ -51,6 +51,8 @@ class HellaSwag(HFTask): ...@@ -51,6 +51,8 @@ class HellaSwag(HFTask):
text += doc['endings'][index] text += doc['endings'][index]
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): # TODO: Implement evaluation code
# TODO: Write evaluation function
raise NotImplementedError() # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
...@@ -46,5 +46,8 @@ class Lambada(Dataset): ...@@ -46,5 +46,8 @@ class Lambada(Dataset):
#label = doc[] #label = doc[]
return doc return doc
def evaluate(self, docs, lm, provide_description, num_fewshot): # TODO: Implement evaluation code
pass
\ No newline at end of file # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
...@@ -50,6 +50,8 @@ class NaturalQs(HFTask): ...@@ -50,6 +50,8 @@ class NaturalQs(HFTask):
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): # TODO: Implement evaluation code
# TODO: implement
raise NotImplementedError() # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
\ No newline at end of file # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
...@@ -53,6 +53,8 @@ class OpenBookQA(HFTask): ...@@ -53,6 +53,8 @@ class OpenBookQA(HFTask):
text += doc['choices']['text'][index] + '.' text += doc['choices']['text'][index] + '.'
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): # TODO: Implement evaluation code
# TODO: Write evaluation function
raise NotImplementedError() # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
...@@ -54,6 +54,8 @@ class PiQA(Dataset): ...@@ -54,6 +54,8 @@ class PiQA(Dataset):
#TODO: check if oa uses newline #TODO: check if oa uses newline
return doc['goal'] + ' ' return doc['goal'] + ' '
def evaluate(self, docs, lm): # TODO: Implement evaluation code
pass
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
...@@ -61,5 +61,8 @@ class QuAC(Dataset): ...@@ -61,5 +61,8 @@ class QuAC(Dataset):
text += doc['answer'] text += doc['answer']
return text return text
def evaluate(self, docs, lm): # TODO: Implement evaluation code
pass
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
...@@ -67,6 +67,8 @@ class RACE(HFTask): ...@@ -67,6 +67,8 @@ class RACE(HFTask):
return r return r
def evaluate(self, docs, lm, provide_description, num_fewshot): # TODO: Implement evaluation code
# TODO: implement
raise NotImplementedError() # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
\ No newline at end of file # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
...@@ -93,6 +93,8 @@ class SATAnalogies(Dataset): ...@@ -93,6 +93,8 @@ class SATAnalogies(Dataset):
return text return text
def evaluate(self, docs, lm): # TODO: Implement evaluation code
# TODO: Write evaluation function
raise NotImplementedError() # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
...@@ -42,6 +42,8 @@ class SQuAD(HFTask): ...@@ -42,6 +42,8 @@ class SQuAD(HFTask):
text += answer text += answer
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): # TODO: Implement evaluation code
# TODO: Write evaluation function
raise NotImplementedError() # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
\ No newline at end of file # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
...@@ -47,6 +47,9 @@ class StoryCloze(Dataset): ...@@ -47,6 +47,9 @@ class StoryCloze(Dataset):
else: else:
return ' '.join([*doc[1:5]]) return ' '.join([*doc[1:5]])
def evaluate(self, docs, lm): # TODO: Implement evaluation code
pass
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
...@@ -76,6 +76,11 @@ class CommitmentBank(HFTask): ...@@ -76,6 +76,11 @@ class CommitmentBank(HFTask):
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs] golds = [doc["label"] for doc in docs]
preds = [] preds = []
for doc in tqdm_lib.tqdm(docs): for doc in tqdm_lib.tqdm(docs):
...@@ -120,6 +125,11 @@ class Copa(HFTask): ...@@ -120,6 +125,11 @@ class Copa(HFTask):
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs] golds = [doc["label"] for doc in docs]
preds = [] preds = []
for doc in tqdm_lib.tqdm(docs): for doc in tqdm_lib.tqdm(docs):
...@@ -165,6 +175,11 @@ class MultiRC(HFTask): ...@@ -165,6 +175,11 @@ class MultiRC(HFTask):
return f"[{label_str}] {answer}" return f"[{label_str}] {answer}"
def evaluate(self, docs, lm, provide_description, num_fewshot): def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
preds = [] preds = []
for doc in docs: for doc in docs:
ctx = self.fewshot_context( ctx = self.fewshot_context(
...@@ -220,6 +235,11 @@ class WordsInContext(HFTask): ...@@ -220,6 +235,11 @@ class WordsInContext(HFTask):
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs] golds = [doc["label"] for doc in docs]
preds = [] preds = []
for doc in tqdm_lib.tqdm(docs): for doc in tqdm_lib.tqdm(docs):
...@@ -280,6 +300,11 @@ class SGWinogradSchemaChallenge(HFTask): ...@@ -280,6 +300,11 @@ class SGWinogradSchemaChallenge(HFTask):
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs] golds = [doc["label"] for doc in docs]
preds = [] preds = []
for doc in tqdm_lib.tqdm(docs): for doc in tqdm_lib.tqdm(docs):
...@@ -314,7 +339,10 @@ class RTE(HFTask): ...@@ -314,7 +339,10 @@ class RTE(HFTask):
return ''.join([doc['premise'], '\nquestion: ',doc['hypothesis'], ' True or False?\nanswer: ', answer]) return ''.join([doc['premise'], '\nquestion: ',doc['hypothesis'], ' True or False?\nanswer: ', answer])
else: else:
return ''.join([doc['premise'], '\nquestion: ',doc['hypothesis'], ' True or False?\nanswer: ']) return ''.join([doc['premise'], '\nquestion: ',doc['hypothesis'], ' True or False?\nanswer: '])
def evaluate(self, docs, lm, provide_description, num_fewshot):
#TODO: # TODO: Implement evaluation code
pass
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
...@@ -44,6 +44,9 @@ class TriviaQA(Dataset): ...@@ -44,6 +44,9 @@ class TriviaQA(Dataset):
return ''.join(['Q: ', doc['Question'], '\n\n','A: ', doc['Answer']['Aliases'][0]]) return ''.join(['Q: ', doc['Question'], '\n\n','A: ', doc['Answer']['Aliases'][0]])
else: else:
return ''.join(['Q: ', doc['Question'], '\n\n','A: ']) return ''.join(['Q: ', doc['Question'], '\n\n','A: '])
def evaluate(self, docs, lm):
pass
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
...@@ -29,6 +29,8 @@ class WebQs(HFTask): ...@@ -29,6 +29,8 @@ class WebQs(HFTask):
a = "A:" + ((" " + doc['answers'][0]) if include_target else '') a = "A:" + ((" " + doc['answers'][0]) if include_target else '')
return q + a return q + a
def evaluate(self, docs, lm, provide_description, num_fewshot): # TODO: Implement evaluation code
# TODO: implement
raise NotImplementedError() # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
\ No newline at end of file # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
...@@ -15,8 +15,12 @@ class WikiText103(NLP_TASK): ...@@ -15,8 +15,12 @@ class WikiText103(NLP_TASK):
def doc_to_text(self, doc, include_target=True): def doc_to_text(self, doc, include_target=True):
return doc['text'] return doc['text']
def evaluate(self, docs, lm, provide_description, num_fewshot):
pass # TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
class WikiText2(NLP_TASK): class WikiText2(NLP_TASK):
...@@ -28,5 +32,9 @@ class WikiText2(NLP_TASK): ...@@ -28,5 +32,9 @@ class WikiText2(NLP_TASK):
def doc_to_text(self, doc, include_target=True): def doc_to_text(self, doc, include_target=True):
return doc['text'] return doc['text']
def evaluate(self, docs, lm, provide_description, num_fewshot):
pass # TODO: Implement evaluation code
\ No newline at end of file
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
...@@ -47,6 +47,8 @@ class Winogrande(HFTask): ...@@ -47,6 +47,8 @@ class Winogrande(HFTask):
text = text.replace("_", answer) text = text.replace("_", answer)
return text return text
def evaluate(self, docs, lm, provide_description, num_fewshot): # TODO: Implement evaluation code
# TODO: Write evaluation function
raise NotImplementedError() # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
\ No newline at end of file # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
...@@ -80,6 +80,8 @@ class WinogradSchemaChallenge273(Dataset): ...@@ -80,6 +80,8 @@ class WinogradSchemaChallenge273(Dataset):
text = doc['completions']['T'] + ' True. ' + doc['completions']['F'] + ' False.' text = doc['completions']['T'] + ' True. ' + doc['completions']['F'] + ' False.'
return text return text
def evaluate(self, docs, lm): # TODO: Implement evaluation code
# TODO: Write evaluation function
raise NotImplementedError() # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment