This list keeps track of which tasks' implementations have been ported to YAML / v2.0 of the Eval Harness.
Boxes should be checked iff tasks are implemented in v2.0 of the Eval Harness and tested for regression. Tasks should be struck through if they were checked *against the original introducing paper's implementation* or a popularizing implementation.
- [ ] Glue (WIP)
- [x] SuperGlue
- [ ] CoQA
- [ ] DROP
- [x] ~~Lambada~~
...
...
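As a minimal sketch of what "tested for regression" means here (the helper name and tolerance are assumptions for illustration, not the harness's actual test suite): run the ported task, then compare each reported metric against values recorded from the pre-refactor implementation.

```python
import math


def check_regression(new_results: dict, reference: dict, rel_tol: float = 0.01):
    """Compare a ported task's metrics against a recorded reference run.

    `reference` holds metric values from the original (pre-refactor)
    implementation, e.g. {"ppl": ..., "acc": ...} for LAMBADA.
    """
    for metric, expected in reference.items():
        got = new_results[metric]
        assert math.isclose(got, expected, rel_tol=rel_tol), (
            f"{metric}: got {got}, expected {expected}"
        )
```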
author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
title={The LAMBADA dataset},
DOI={10.5281/zenodo.2630551},
publisher={Zenodo},
year={2016},
month={Aug}
}
"""
# NOTE: import paths below assume the v2.0 refactor's `lm_eval.api` layout.
from lm_eval.api.instance import Instance
from lm_eval.api.metrics import mean, perplexity
from lm_eval.api.registry import register_task
from lm_eval.api.task import Task


class LambadaBase(Task):
    VERSION = None
    OUTPUT_TYPE = "loglikelihood"

    def training_docs(self):
        if self.has_training_docs():
            return self.dataset["train"]

    def validation_docs(self):
        if self.has_validation_docs():
            return self.dataset["validation"]

    def test_docs(self):
        if self.has_test_docs():
            return self.dataset["test"]

    def doc_to_text(self, doc):
        # Everything up to (but not including) the final word of the passage.
        return doc["text"].rsplit(" ", 1)[0]

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["text"]

    def doc_to_target(self, doc):
        # The final word, with a leading space so it tokenizes as a continuation.
        return " " + doc["text"].rsplit(" ", 1)[1]

    def construct_requests(self, doc, ctx, **kwargs):
        return Instance(
            request_type=self.OUTPUT_TYPE,
            doc=doc,
            arguments=(ctx, self.doc_to_target(doc)),
            **kwargs,
        )

    def process_results(self, doc, results):
        # TODO: indexing into results[0] is a hack; filters should ensure we
        # only have one response per request to score.
        results = results[0]  # TODO: recheck this. currently a list of [(ll, is_greedy)] is passed in
        ll, is_greedy = results

        return {"ppl": ll, "acc": int(is_greedy)}

    def aggregation(self):
        return {"ppl": perplexity, "acc": mean}

    def higher_is_better(self):
        return {"ppl": False, "acc": True}
@register_task("lambada_standard")
classLambadaStandard(LambadaBase):
"""The LAMBADA task using the standard original LAMBADA dataset."""
VERSION="2.0"
DATASET_PATH="lambada"
defhas_training_docs(self):
returnFalse
defhas_validation_docs(self):
returnTrue
defhas_test_docs(self):
returnTrue
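

# Hypothetical usage sketch (the exact CLI flags are an assumption about the
# refactored harness, not guaranteed by this file): once registered, the task
# should be runnable by its string name, e.g.
#
#   lm_eval --model hf --model_args pretrained=gpt2 --tasks lambada_standard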
@register_task("lambada_openai")
classLambadaOpenAI(LambadaBase):
"""The LAMBADA task using the LAMBADA OpenAI dataset, a modified version of the
original LAMBADA dataset created by OpenAI for evaluating their GPT-2 model.
title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},
author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},