"""Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation
https://openreview.net/forum?id=1qvx610Cu7

The HumanEval+ dataset is created by the EvalPlus framework which extends the original
HumanEval dataset by adding more automatically generated test cases to each problem.

Homepage: https://github.com/evalplus/evalplus
"""
from warnings import warn

from bigcode_eval.tasks.humaneval import GeneralHumanEval

_CITATION = """
@inproceedings{evalplus,
  title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
  author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
  booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
  year = {2023},
  url = {https://openreview.net/forum?id=1qvx610Cu7},
}
"""


class GeneralHumanEvalPlus(GeneralHumanEval):
    """A task represents an entire benchmark including its dataset, problems,
    answers, generation settings and evaluation methods.

    HumanEval+ reuses the HumanEval evaluation logic but points at the
    EvalPlus-extended dataset, which carries many more test cases per problem.
    """

    DATASET_PATH = "evalplus/humanevalplus"

    def __init__(self, strip_prompt, k=None, num_workers=16, timeout=10.0):
        """Configure the HumanEval+ task.

        :param strip_prompt: whether to strip whitespace from each prompt
        :param k: list of k values for pass@k; defaults to [1, 10, 100]
        :param num_workers: number of parallel workers used for execution
        :param timeout: per-problem execution timeout in seconds; values below
            10s trigger a warning because HumanEval+ runs many tests per problem
        """
        if timeout < 10.0:
            warn(
                "It is suggested to have a longer timeout as HumanEval+ has lots of tests. "
                f"The current timeout is {timeout}s while the suggested timeout is 10s.",
                stacklevel=2,  # attribute the warning to the caller, not this module
            )
        # Avoid a shared mutable default argument: resolve the default here
        # instead of in the signature (None means "use the standard k values").
        if k is None:
            k = [1, 10, 100]
        super().__init__(strip_prompt, k, num_workers, timeout)


def create_task(strip_prompt):
    """Return a HumanEvalPlus task class with ``strip_prompt`` baked in."""

    class HumanEvalPlus(GeneralHumanEvalPlus):
        def __init__(self, **kwargs):
            super().__init__(strip_prompt, **kwargs)

    return HumanEvalPlus


def create_all_tasks():
    """Creates a dictionary of tasks from a list of levels
    :return: {task_name: task}
        e.g. {humanevalplus: Task, humanevalplus-unstripped: Task}
    """
    return {
        "humanevalplus": create_task(True),
        "humanevalplus-unstripped": create_task(False),
    }