"""Evaluating Large Language Models Trained on Code
https://arxiv.org/abs/2107.03374

The HumanEval dataset released by OpenAI includes 164 programming problems, each with a
function signature, docstring, body, and several unit tests.
The problems were handwritten to ensure they are not included in the training sets of
code generation models.

Homepage: https://github.com/openai/human-eval
"""

from bigcode_eval.base import Task
from bigcode_eval.utils import remove_after_return
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval

_CITATION = ""


def create_all_tasks():
    """Creates a dictionary of tasks corresponding for the 2 settings currently available
    - instruction with code completion: we provide function signature/imports.. to the model after the instruction
    - instruction to code generation: we only give the instruction without the function signature/imports..
    """
    return {
        "instruct-humaneval": InstructHumanEvalWithContext,
        "instruct-humaneval-nocontext": InstructHumanEvalWithoutContext,
    }
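
# A minimal usage sketch (hypothetical driver code; in practice the harness builds its
# task registry from these factories rather than calling the module directly):
#
#   task = create_all_tasks()["instruct-humaneval"]()
#   doc = task.get_dataset()[0]
#   prompt = task.get_prompt(doc)        # {"instruction": ..., "context": ...}
#   reference = task.get_reference(doc)  # test code followed by check(<entry_point>)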


class InstructHumanEval(Task):
    """A task represents an entire benchmark including its dataset, problems,
    answers, generation settings and evaluation methods.
    """

    DATASET_PATH = "codeparrot/instructhumaneval"

    DATASET_NAME = None

    def __init__(self):
        super().__init__(
            # The stop words mark text that typically follows a complete solution (a new
            # class, a print call, or a __main__ guard); generations are truncated there.
            stop_words=["if __name__", "\nprint", "\nclass"],
            requires_execution=True,
        )

    def get_dataset(self):
        """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
        return self.dataset["test"]

    def get_prompt(self, doc):
        """Builds the prompt for the LM to generate from."""
        pass

    def get_reference(self, doc):
        """Builds the reference solution for the doc (sample from the test dataset)."""
        test_func = doc["test"]
        entry_point = f"check({doc['entry_point']})"
        return "\n" + test_func + "\n" + entry_point


    def process_results(self, generations, references):
        """Takes the list of LM generations and evaluates them against ground truth references,
        returning the metric for the generations.
        :param generations: list(list(str))
            list of lists containing generations
        :param references: list(str)
            list of str containing references
        """
        results, _ = compute_code_eval(
            references=references,
            predictions=generations,
        )
        return results


class InstructHumanEvalWithContext(InstructHumanEval):
    def __init__(self):
        super().__init__()

    def get_prompt(self, doc):
        """Builds the prompt for the LM to generate from."""
        return {"instruction": doc["instruction"], "context": doc["context"]}

    def postprocess_generation(self, generation, idx):
        """Defines the postprocessing for a LM generation.
        :param generation: str
            code generation from LM
        :param idx: int
            index of doc in the dataset to which the generation belongs
            (not used for Humaneval-Task)
        """
        generation = self._stop_at_stop_token(generation, self.stop_words)

        # Keep any code preceding the function definition (e.g. imports) untouched and
        # truncate the function after its return statement.
        function_name = self.get_dataset()["entry_point"][idx]
        func_index = generation.find(f"def {function_name}")
        return generation[0:func_index] + remove_after_return(generation[func_index:])


class InstructHumanEvalWithoutContext(InstructHumanEval):
    def __init__(self):
        super().__init__()

    def get_prompt(self, doc):
        """Builds the prompt for the LM to generate from."""
        return {"instruction": doc["instruction"], "context": ""}

    def postprocess_generation(self, generation, idx):
        """Defines the postprocessing for a LM generation.
        :param generation: str
            code generation from LM
        :param idx: int
            index of doc in the dataset to which the generation belongs
            (not used for Humaneval-Task)
        """
        example = self.get_dataset()[idx]
        prompt, function_name = example["context"], example["entry_point"]
        # Everything in the context before the function signature (e.g. imports) is
        # re-used as a prefix for the extracted solution.
        prefix = prompt[0 : prompt.find(f"def {function_name}")]

        # Strip everything up to and including the first markdown code fence
        # (``` or ```python), if the model emitted one.
        sep_index = generation.find("```")
        if sep_index != -1:
            if (
                generation[sep_index + len("```") : sep_index + len("```python")]
                == "python"
            ):
                generation = generation[sep_index + len("```python") :]
            else:
                generation = generation[sep_index + len("```") :]

        generation = self._stop_at_stop_token(generation, self.stop_words)

        # Locate the function definition and the last return statement after it.
        func_index = generation.find(f"def {function_name}")
        if func_index == -1:
            func_index = 0
        return_index = generation[func_index:].rfind("  return ")
        if return_index == -1:
            return_index = 0

        # Advance j to the end of the line containing that return statement.
        j = func_index + return_index
        n = len(generation)
        while j < n and generation[j] != "\n":
            j += 1

        # Cut at the closing code fence if it appears before the end of the return line.
        sep_index_2 = generation.find("```")
        if sep_index_2 == -1:
            return prefix.strip() + "\n" + generation[func_index:j]
        return prefix.strip() + "\n" + generation[func_index : min(j, sep_index_2)]
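
# Worked example of the parsing above (illustrative generation, not real model output).
# For entry_point "add" and a generation such as
#   'Here is the solution:\n```python\ndef add(a, b):\n    return a + b\n```'
# the leading text and the ```python fence are stripped, the stop words are applied, and
# the function is kept from "def add" up to the end of its last return line (or up to the
# closing fence, whichever comes first). The context's prefix (any imports before the
# signature) is prepended, giving roughly:
#   def add(a, b):
#       return a + b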