"""Evaluating Large Language Models Trained on Code
https://arxiv.org/abs/2107.03374

The HumanEval dataset released by OpenAI includes 164 programming problems, each with a
function signature, docstring, body, and several unit tests.
The problems were handwritten to ensure they are not included in the training sets of
code generation models.

Homepage: https://github.com/openai/human-eval
"""

from bigcode_eval.base import Task
from bigcode_eval.utils import remove_after_return
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval

_CITATION = ""


def create_all_tasks():
    """Creates a dictionary of tasks corresponding for the 2 settings currently available
    - instruction with code completion: we provide function signature/imports.. to the model after the instruction
    - instruction to code generation: we only give the instruction without the function signature/imports..
    """
    return {
        "instruct-humaneval": InstructHumanEvalWithContext,
        "instruct-humaneval-nocontext": InstructHumanEvalWithoutContext,
    }
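
# A minimal usage sketch (hypothetical driver code; in practice the harness builds its
# task registry from these factories rather than calling the module directly):
#
#   task = create_all_tasks()["instruct-humaneval"]()
#   doc = task.get_dataset()[0]
#   prompt = task.get_prompt(doc)        # {"instruction": ..., "context": ...}
#   reference = task.get_reference(doc)  # test code followed by check(<entry_point>)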


class InstructHumanEval(Task):
    """A task represents an entire benchmark including its dataset, problems,
    answers, generation settings and evaluation methods.
    """

    DATASET_PATH = "codeparrot/instructhumaneval"

    DATASET_NAME = None

    def __init__(self):
        super().__init__(
            # The stop words mark text that typically follows a complete solution (a new
            # class, a print call, or a __main__ guard); generations are truncated there.
            stop_words=["if __name__", "\nprint", "\nclass"],
            requires_execution=True,
        )

    def get_dataset(self):
        """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
        return self.dataset["test"]

    def get_prompt(self, doc):
        """Builds the prompt for the LM to generate from."""
        pass

    def get_reference(self, doc):
        """Builds the reference solution for the doc (sample from the test dataset)."""
        test_func = doc["test"]
        entry_point = f"check({doc['entry_point']})"
        return "\n" + test_func + "\n" + entry_point


    def process_results(self, generations, references):
        """Takes the list of LM generations and evaluates them against ground truth references,
        returning the metric for the generations.
        :param generations: list(list(str))
            list of lists containing generations
        :param references: list(str)
            list of str containing references
        """
        results, _ = compute_code_eval(
            references=references,
            predictions=generations,
        )
        return results


class InstructHumanEvalWithContext(InstructHumanEval):
    def __init__(self):
        super().__init__()

    def get_prompt(self, doc):
        """Builds the prompt for the LM to generate from."""
        return {"instruction": doc["instruction"], "context": doc["context"]}

    def postprocess_generation(self, generation, idx):
        """Defines the postprocessing for a LM generation.
        :param generation: str
            code generation from LM
        :param idx: int
            index of doc in the dataset to which the generation belongs
            (not used for Humaneval-Task)
        """
        generation = self._stop_at_stop_token(generation, self.stop_words)

        # Keep any code preceding the function definition (e.g. imports) untouched and
        # truncate the function after its return statement.
        function_name = self.get_dataset()["entry_point"][idx]
        func_index = generation.find(f"def {function_name}")
        return generation[0:func_index] + remove_after_return(generation[func_index:])


class InstructHumanEvalWithoutContext(InstructHumanEval):
    def __init__(self):
        super().__init__()

    def get_prompt(self, doc):
        """Builds the prompt for the LM to generate from."""
        return {"instruction": doc["instruction"], "context": ""}

    def postprocess_generation(self, generation, idx):
        """Defines the postprocessing for a LM generation.
        :param generation: str
            code generation from LM
        :param idx: int
            index of doc in the dataset to which the generation belongs
            (not used for Humaneval-Task)
        """
        example = self.get_dataset()[idx]
        prompt, function_name = example["context"], example["entry_point"]
        # Everything in the context before the function signature (e.g. imports) is
        # re-used as a prefix for the extracted solution.
        prefix = prompt[0 : prompt.find(f"def {function_name}")]

        # Strip everything up to and including the first markdown code fence
        # (``` or ```python), if the model emitted one.
        sep_index = generation.find("```")
        if sep_index != -1:
            if (
                generation[sep_index + len("```") : sep_index + len("```python")]
                == "python"
            ):
                generation = generation[sep_index + len("```python") :]
            else:
                generation = generation[sep_index + len("```") :]

        generation = self._stop_at_stop_token(generation, self.stop_words)

        # Locate the function definition and the last return statement after it.
        func_index = generation.find(f"def {function_name}")
        if func_index == -1:
            func_index = 0
        return_index = generation[func_index:].rfind("  return ")
        if return_index == -1:
            return_index = 0

        # Advance j to the end of the line containing that return statement.
        j = func_index + return_index
        n = len(generation)
        while j < n and generation[j] != "\n":
            j += 1

        # Cut at the closing code fence if it appears before the end of the return line.
        sep_index_2 = generation.find("```")
        if sep_index_2 == -1:
            return prefix.strip() + "\n" + generation[func_index:j]
        return prefix.strip() + "\n" + generation[func_index : min(j, sep_index_2)]
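
# Worked example of the parsing above (illustrative generation, not real model output).
# For entry_point "add" and a generation such as
#   'Here is the solution:\n```python\ndef add(a, b):\n    return a + b\n```'
# the leading text and the ```python fence are stripped, the stop words are applied, and
# the function is kept from "def add" up to the end of its last return line (or up to the
# closing fence, whichever comes first). The context's prefix (any imports before the
# signature) is prepended, giving roughly:
#   def add(a, b):
#       return a + b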