# humanevalplus.py

"""Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation
https://openreview.net/forum?id=1qvx610Cu7

The HumanEval+ dataset is created by the EvalPlus framework which extends the original HumanEval dataset
by adding more automatically generated test cases to each problem.

Homepage: https://github.com/evalplus/evalplus
"""

from warnings import warn

from bigcode_eval.tasks.humaneval import GeneralHumanEval

# BibTeX entry for the EvalPlus paper that introduced HumanEval+.
_CITATION = """
@inproceedings{evalplus,
  title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
  author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
  booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
  year = {2023},
  url = {https://openreview.net/forum?id=1qvx610Cu7},
}
"""


class GeneralHumanEvalPlus(GeneralHumanEval):
    """HumanEval+ variant of the ``GeneralHumanEval`` task.

    Everything is inherited from ``GeneralHumanEval``; this subclass only
    points ``DATASET_PATH`` at the ``evalplus/humanevalplus`` dataset, raises
    the default timeout to 10 seconds, and warns when a shorter timeout is
    requested.
    """

    DATASET_PATH = "evalplus/humanevalplus"

    def __init__(self, strip_prompt, k=None, num_workers=16, timeout=10.0):
        """Forward the task settings to ``GeneralHumanEval``.

        :param strip_prompt: forwarded unchanged to the base class
            (presumably controls whether prompts are stripped -- confirm
            against ``GeneralHumanEval``)
        :param k: forwarded to the base class; defaults to ``[1, 10, 100]``
            when omitted or ``None``
        :param num_workers: forwarded unchanged to the base class
        :param timeout: forwarded unchanged to the base class; a warning is
            emitted when it is below 10.0
        """
        if k is None:
            # Build the default here rather than in the signature: a list
            # literal default is created once and shared by every call, so a
            # mutation through one instance would leak into all later ones.
            k = [1, 10, 100]
        if timeout < 10.0:
            warn(
                "It is suggested to have a longer timeout as HumanEval+ has lots of tests. "
                f"The current timeout is {timeout}s while the suggested timeout is 10s."
            )
        super().__init__(strip_prompt, k, num_workers, timeout)


def create_task(strip_prompt):
    """Build a HumanEval+ task class with ``strip_prompt`` already bound.

    :param strip_prompt: value passed as the first positional argument to
        ``GeneralHumanEvalPlus.__init__`` by every instance of the returned
        class
    :return: a ``GeneralHumanEvalPlus`` subclass named ``HumanEvalPlus`` whose
        constructor takes keyword arguments only
    """

    class HumanEvalPlus(GeneralHumanEvalPlus):
        # strip_prompt comes from the enclosing call, so callers of this class
        # supply only the remaining settings, by keyword.
        def __init__(self, **task_options):
            super().__init__(strip_prompt, **task_options)

    return HumanEvalPlus


def create_all_tasks():
    """Create the HumanEval+ task classes exposed by this module.

    :return: {task_name: task class}
        e.g. {"humanevalplus": Task, "humanevalplus-unstripped": Task}
    """
    # (task name, strip_prompt flag) for each registered variant.
    variants = (
        ("humanevalplus", True),
        ("humanevalplus-unstripped", False),
    )
    return {task_name: create_task(strip) for task_name, strip in variants}