Commit 0b3b7251 authored by Leo Gao

Merge branch 'bmk_refactor2' of github.com:EleutherAI/lm_evaluation_harness into bmk_refactor2

Parents: a18104a4, 5ce42fc0
@@ -45,9 +45,11 @@ class ANLIBase(HFTask):
         a = "True, False, or Neither?" + ((" " + ["True", "Neither", "False"][doc['label']]) if include_target else '')
         return q + a
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: implement
-        raise NotImplementedError()
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
 
 class ANLIRound1(ANLIBase):
     SPLIT = 1
@@ -58,23 +58,12 @@ class DROP(Dataset):
             text = ''.join([text, get_answer(pair['answer'])])
             qa_texts.append(text)
         return ''.join([doctext, '\n'.join(qa_texts)])
 
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
-        """Take iterable of docs and evaluates, returning a dict with the following format:
-        {
-            "major": float,
-            "minor": dict,
-            "higher_is_better": bool,
-        }
-        * `major` should be a single, representative number, for programmatic comparison
-        * `minor` should be a dictionary containing all relevant sub-metrics
-        * `higher_is_better` determines whether a higher metric is better
-        """
-        pass
 
     def fewshot_description(self):
         return "Read the passage and answer the questions "
+
+    # TODO: Implement evaluation code
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+    # Remove this comment when the evaluation code is implemented.
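The docstring removed above is the clearest statement of the evaluate() contract that these TODOs refer to. For reference, a minimal sketch of an implementation satisfying that contract. This is a sketch only, not the actual framework code: the doc["label"] field, lm.generate(), and the fewshot_context() argument order are assumptions, and the real interface is defined in base.py.

def evaluate(self, docs, lm, provide_description, num_fewshot):
    golds = [doc["label"] for doc in docs]        # assumed gold-label field
    preds = []
    for doc in docs:
        # Build the few-shot prompt for this doc (argument order assumed).
        ctx = self.fewshot_context(doc, provide_description, num_fewshot)
        preds.append(lm.generate(ctx))            # assumed LM completion call
    acc = sum(p == g for p, g in zip(preds, golds)) / len(golds)
    return {
        "major": acc,                 # single representative number
        "minor": {"acc": acc},        # dict of all relevant sub-metrics
        "higher_is_better": True,     # higher accuracy is better
    }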
@@ -46,6 +46,12 @@ class CoLA(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -99,6 +105,11 @@ class MNLI(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -153,6 +164,11 @@ class MRPC(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -190,6 +206,11 @@ class RTE(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -227,6 +248,11 @@ class QNLI(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -265,6 +291,11 @@ class QQP(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -304,6 +335,11 @@ class STSB(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -359,6 +395,11 @@ class SST(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -397,6 +438,11 @@ class WNLI(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -51,6 +51,8 @@ class HellaSwag(HFTask):
             text += doc['endings'][index]
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: Write evaluation function
-        raise NotImplementedError()
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -46,5 +46,8 @@ class Lambada(Dataset):
         #label = doc[]
         return doc
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        pass
\ No newline at end of file
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -50,6 +50,8 @@ class NaturalQs(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: implement
-        raise NotImplementedError()
\ No newline at end of file
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -53,6 +53,8 @@ class OpenBookQA(HFTask):
             text += doc['choices']['text'][index] + '.'
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: Write evaluation function
-        raise NotImplementedError()
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -54,6 +54,8 @@ class PiQA(Dataset):
         #TODO: check if oa uses newline
         return doc['goal'] + ' '
 
     def evaluate(self, docs, lm):
-        pass
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -61,5 +61,8 @@ class QuAC(Dataset):
         text += doc['answer']
         return text
 
     def evaluate(self, docs, lm):
-        pass
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -67,6 +67,8 @@ class RACE(HFTask):
         return r
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: implement
-        raise NotImplementedError()
\ No newline at end of file
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -18,6 +18,7 @@ class SATAnalogies(Dataset):
         # We should be using a checksum here.
         # The canonical sha256 hash is below:
         # 9dece377d8d57253ef8c78370ff15de0bb1d9e90a82c815a67ba1e621e921bfc
         if not os.path.exists('data/sat/SAT-package-V3.txt'):
             raise NotImplementedError('SAT Analogies dataset is not provided. Follow instructions on https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art) to locate.')
@@ -32,7 +33,6 @@ class SATAnalogies(Dataset):
     def training_docs(self):
         return []
 
     def test_docs(self):
         return []
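The checksum comment in the SATAnalogies hunk above calls for a verification step. A minimal sketch of that check, using Python's standard hashlib: the file path and expected hash are taken from the source comments, while the helper name verify_sat_data is hypothetical.

import hashlib

EXPECTED_SHA256 = "9dece377d8d57253ef8c78370ff15de0bb1d9e90a82c815a67ba1e621e921bfc"

def verify_sat_data(path='data/sat/SAT-package-V3.txt'):
    # Hypothetical helper: compare the file's sha256 digest against the
    # canonical hash quoted above, and fail loudly if the data is corrupt.
    with open(path, 'rb') as f:
        actual = hashlib.sha256(f.read()).hexdigest()
    if actual != EXPECTED_SHA256:
        raise ValueError(f"Checksum mismatch for {path}: got {actual}")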
@@ -42,6 +42,8 @@ class SQuAD(HFTask):
         text += answer
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: Write evaluation function
-        raise NotImplementedError()
\ No newline at end of file
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -47,6 +47,9 @@ class StoryCloze(Dataset):
         else:
             return ' '.join([*doc[1:5]])
 
     def evaluate(self, docs, lm):
-        pass
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
@@ -82,6 +82,11 @@ class CommitmentBank(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -126,6 +131,11 @@ class Copa(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -171,6 +181,11 @@ class MultiRC(HFTask):
         return f"[{label_str}] {answer}"
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         preds = []
         for doc in docs:
             ctx = self.fewshot_context(
@@ -226,6 +241,11 @@ class WordsInContext(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -286,6 +306,11 @@ class SGWinogradSchemaChallenge(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -320,7 +345,10 @@ class RTE(HFTask):
             return ''.join([doc['premise'], '\nquestion: ', doc['hypothesis'], ' True or False?\nanswer: ', answer])
         else:
             return ''.join([doc['premise'], '\nquestion: ', doc['hypothesis'], ' True or False?\nanswer: '])
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        #TODO:
-        pass
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
@@ -44,6 +44,9 @@ class TriviaQA(Dataset):
             return ''.join(['Q: ', doc['Question'], '\n\n', 'A: ', doc['Answer']['Aliases'][0]])
         else:
             return ''.join(['Q: ', doc['Question'], '\n\n', 'A: '])
 
     def evaluate(self, docs, lm):
-        pass
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -29,6 +29,8 @@ class WebQs(HFTask):
         a = "A:" + ((" " + doc['answers'][0]) if include_target else '')
         return q + a
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: implement
-        raise NotImplementedError()
\ No newline at end of file
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -15,8 +15,12 @@ class WikiText103(NLP_TASK):
     def doc_to_text(self, doc, include_target=True):
         return doc['text']
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        pass
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
 
 class WikiText2(NLP_TASK):
@@ -28,5 +32,9 @@ class WikiText2(NLP_TASK):
     def doc_to_text(self, doc, include_target=True):
         return doc['text']
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        pass
\ No newline at end of file
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -47,6 +47,8 @@ class Winogrande(HFTask):
         text = text.replace("_", answer)
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: Write evaluation function
-        raise NotImplementedError()
\ No newline at end of file
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -80,6 +80,8 @@ class WinogradSchemaChallenge273(Dataset):
         text = doc['completions']['T'] + ' True. ' + doc['completions']['F'] + ' False.'
         return text
 
     def evaluate(self, docs, lm):
-        # TODO: Write evaluation function
-        raise NotImplementedError()
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.