"Evaluating Large Language Models Trained on Code"
https://arxiv.org/abs/2107.03374
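HumanEval is a benchmark of 164 hand-written Python programming problems,
each with a function signature, docstring, canonical solution and unit tests,
used to measure the functional correctness of model-generated code.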
Homepage: https://github.com/openai/human-eval
"""
importos
importjson
fromlm_eval.baseimportTask,rf
fromlm_eval.metricsimportmean
# BibTeX entry for the paper "Evaluating Large Language Models Trained on Code"
# (arXiv:2107.03374).
_CITATION="""
@article{chen2021codex,
title={Evaluating Large Language Models Trained on Code},
author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
year={2021},
eprint={2107.03374},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
"""
classHumanEval(Task):
VERSION=0
DATASET_PATH="openai_humaneval"
DATASET_NAME=None
def has_training_docs(self):
    """Report whether this task exposes training documents.

    :return: bool
        Always ``False``.
    """
    return False
def has_validation_docs(self):
    """Report whether this task exposes validation documents.

    :return: bool
        Always ``False``.
    """
    return False
def has_test_docs(self):
    """Report whether this task exposes test documents.

    :return: bool
        Always ``True``.
    """
    return True
def training_docs(self):
    """Unsupported accessor for training documents.

    :raises NotImplementedError:
        Always; no training documents are provided by this method.
    """
    raise NotImplementedError
def validation_docs(self):
    """Unsupported accessor for validation documents.

    :raises NotImplementedError:
        Always; no validation documents are provided by this method.
    """
    raise NotImplementedError
def test_docs(self):
    """Return the test documents.

    :return:
        Whatever ``self.dataset`` stores under the ``"test"`` key.
    """
    return self.dataset["test"]
def doc_to_text(self, doc):
    """Return the text presented to the model for ``doc``.

    :param doc:
        A document supporting ``doc["prompt"]`` lookup.
    :return:
        The value of the document's ``"prompt"`` field, unmodified.
    """
    return doc["prompt"]
def doc_to_target(self, doc):
    """Return the reference answer for ``doc``.

    :param doc:
        A document supporting ``doc["canonical_solution"]`` lookup.
    :return:
        The value of the document's ``"canonical_solution"`` field, unmodified.
    """
    return doc["canonical_solution"]
def construct_requests(self, doc, ctx):
    """Uses RequestFactory to construct Requests and returns an iterable of
    Requests which will be sent to the LM.

    :param doc:
        The document as returned from training_docs, validation_docs, or test_docs.
        Not read by this method.
    :param ctx: str
        The context string, generated by fewshot_context. This includes the natural
        language description, as well as the few shot examples, and the question
        part of the document for `doc`.
    :return: list
        A list holding the same ``greedy_until`` request 100 times.
    """
    # Number of completions requested per document.
    num_samples = 100
    # Generation stops at the first blank line ("\n\n").
    request = rf.greedy_until(ctx, ["\n\n"])
    # NOTE(review): one request object is repeated rather than rebuilt, and the
    # request is greedy, so the completions are presumably identical -- confirm
    # whether sampled generations were intended here.
    return [request] * num_samples
def _is_correct(self, completion, doc):
    """Judge whether ``completion`` is a correct answer for ``doc``.

    Placeholder: neither argument is inspected and nothing is executed, so
    every completion is reported as correct.

    :param completion:
        The generated completion (ignored).
    :param doc:
        The document the completion belongs to (ignored).
    :return: bool
        Always ``True``.
    """
    return True
defprocess_results(self,doc,results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# log outputs to a jsonl file, for use with the official evaluation + execution script.