Commit 2d2dbf96 authored by Leo Gao's avatar Leo Gao
Browse files

Add comments to remind that evaluation needs to be written for the new framework

parent 6803e647
......@@ -45,9 +45,11 @@ class ANLIBase(HFTask):
a = "True, False, or Neither?" + ((" " + ["True", "Neither", "False"][doc['label']]) if include_target else '')
return q + a
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: implement
raise NotImplementedError()
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
class ANLIRound1(ANLIBase):
SPLIT = 1
......
......@@ -58,23 +58,12 @@ class DROP(Dataset):
text = ''.join([text, get_answer(pair['answer'])])
qa_texts.append(text)
return ''.join([doctext, '\n'.join(qa_texts)])
def evaluate(self, docs, lm, provide_description, num_fewshot):
"""Take iterable of docs and evaluates, returning a dict with the following format:
{
"major": float,
"minor": dict,
"higher_is_better": bool,
}
* `major` should be a single, representative number, for programmatic comparison
* `minor` should be a dictionary containing all relevant sub-metrics
* `higher_is_better` determines whether a higher metric is better
"""
pass
def fewshot_description(self):
return "Read the passage and answer the questions "
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
......@@ -46,6 +46,12 @@ class CoLA(HFTask):
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
......@@ -99,6 +105,11 @@ class MNLI(HFTask):
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
......@@ -153,6 +164,11 @@ class MRPC(HFTask):
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
......@@ -190,6 +206,11 @@ class RTE(HFTask):
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
......@@ -227,6 +248,11 @@ class QNLI(HFTask):
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
......@@ -265,6 +291,11 @@ class QQP(HFTask):
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
......@@ -304,6 +335,11 @@ class STSB(HFTask):
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
......@@ -359,6 +395,11 @@ class SST(HFTask):
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
......@@ -397,6 +438,11 @@ class WNLI(HFTask):
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
......
......@@ -51,6 +51,8 @@ class HellaSwag(HFTask):
text += doc['endings'][index]
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Write evaluation function
raise NotImplementedError()
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
......@@ -46,5 +46,8 @@ class Lambada(Dataset):
#label = doc[]
return doc
def evaluate(self, docs, lm, provide_description, num_fewshot):
pass
\ No newline at end of file
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
......@@ -50,6 +50,8 @@ class NaturalQs(HFTask):
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: implement
raise NotImplementedError()
\ No newline at end of file
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
......@@ -53,6 +53,8 @@ class OpenBookQA(HFTask):
text += doc['choices']['text'][index] + '.'
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Write evaluation function
raise NotImplementedError()
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
......@@ -54,6 +54,8 @@ class PiQA(Dataset):
#TODO: check if oa uses newline
return doc['goal'] + ' '
def evaluate(self, docs, lm):
pass
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
......@@ -61,5 +61,8 @@ class QuAC(Dataset):
text += doc['answer']
return text
def evaluate(self, docs, lm):
pass
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
......@@ -67,6 +67,8 @@ class RACE(HFTask):
return r
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: implement
raise NotImplementedError()
\ No newline at end of file
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
......@@ -93,6 +93,8 @@ class SATAnalogies(Dataset):
return text
def evaluate(self, docs, lm):
# TODO: Write evaluation function
raise NotImplementedError()
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
......@@ -42,6 +42,8 @@ class SQuAD(HFTask):
text += answer
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Write evaluation function
raise NotImplementedError()
\ No newline at end of file
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
......@@ -47,6 +47,9 @@ class StoryCloze(Dataset):
else:
return ' '.join([*doc[1:5]])
def evaluate(self, docs, lm):
pass
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
......@@ -76,6 +76,11 @@ class CommitmentBank(HFTask):
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
......@@ -120,6 +125,11 @@ class Copa(HFTask):
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
......@@ -165,6 +175,11 @@ class MultiRC(HFTask):
return f"[{label_str}] {answer}"
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
preds = []
for doc in docs:
ctx = self.fewshot_context(
......@@ -220,6 +235,11 @@ class WordsInContext(HFTask):
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
......@@ -280,6 +300,11 @@ class SGWinogradSchemaChallenge(HFTask):
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
......@@ -314,7 +339,10 @@ class RTE(HFTask):
return ''.join([doc['premise'], '\nquestion: ',doc['hypothesis'], ' True or False?\nanswer: ', answer])
else:
return ''.join([doc['premise'], '\nquestion: ',doc['hypothesis'], ' True or False?\nanswer: '])
def evaluate(self, docs, lm, provide_description, num_fewshot):
#TODO:
pass
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
......@@ -44,6 +44,9 @@ class TriviaQA(Dataset):
return ''.join(['Q: ', doc['Question'], '\n\n','A: ', doc['Answer']['Aliases'][0]])
else:
return ''.join(['Q: ', doc['Question'], '\n\n','A: '])
def evaluate(self, docs, lm):
pass
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
......@@ -29,6 +29,8 @@ class WebQs(HFTask):
a = "A:" + ((" " + doc['answers'][0]) if include_target else '')
return q + a
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: implement
raise NotImplementedError()
\ No newline at end of file
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
......@@ -15,8 +15,12 @@ class WikiText103(NLP_TASK):
def doc_to_text(self, doc, include_target=True):
return doc['text']
def evaluate(self, docs, lm, provide_description, num_fewshot):
pass
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
class WikiText2(NLP_TASK):
......@@ -28,5 +32,9 @@ class WikiText2(NLP_TASK):
def doc_to_text(self, doc, include_target=True):
return doc['text']
def evaluate(self, docs, lm, provide_description, num_fewshot):
pass
\ No newline at end of file
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
......@@ -47,6 +47,8 @@ class Winogrande(HFTask):
text = text.replace("_", answer)
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Write evaluation function
raise NotImplementedError()
\ No newline at end of file
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
......@@ -80,6 +80,8 @@ class WinogradSchemaChallenge273(Dataset):
text = doc['completions']['T'] + ' True. ' + doc['completions']['F'] + ' False.'
return text
def evaluate(self, docs, lm):
# TODO: Write evaluation function
raise NotImplementedError()
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment