add prompted and unprompted infilling

cc10eee6 · haileyschoelkopf · 887b0edf · cc10eee6 · cc10eee6 · cc10eee6
Commit cc10eee6 authored Dec 07, 2022 by haileyschoelkopf
5 changed files
--- a/lm_eval/datasets/humaneval/humaneval.py
+++ b/lm_eval/datasets/humaneval/humaneval.py
+import json
+
+import datasets
+
+
+# Human-readable summary passed to both the BuilderConfig and DatasetInfo below.
+_DESCRIPTION = """\
+The HumanEval dataset released by OpenAI contains 164 handcrafted programming challenges together with unit tests to verify the viability of a proposed solution.
+"""
+# Gzipped JSONL data file (random-span infilling) from the human-eval-infilling
+# repository; handed to dl_manager.download_and_extract in _split_generators.
+_URL = "https://raw.githubusercontent.com/openai/human-eval-infilling/master/data/HumanEval-RandomSpanInfilling.jsonl.gz"
+
+_CITATION = """\
+@misc{chen2021evaluating,
+      title={Evaluating Large Language Models Trained on Code},
+      author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
+      year={2021},
+      eprint={2107.03374},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}"""
+
+# Upstream repository, reported as the dataset homepage in DatasetInfo.
+_HOMEPAGE = "https://github.com/openai/human-eval-infilling"
+
+# License string reported in DatasetInfo.
+_LICENSE = "MIT"
+
+
+class OpenaiHumanevalInfilling(datasets.GeneratorBasedBuilder):
+    """HumanEval infilling: dataset builder for the random-span infilling data.
+
+    Each example carries the code before the gap (``prompt``), the code after
+    it (``suffix``), the reference fill (``canonical_solution``) and the
+    associated ``test`` / ``entry_point`` strings. Only a test split is
+    produced.
+    """
+
+    VERSION = datasets.Version("1.0.0")
+
+    BUILDER_CONFIGS = [
+        datasets.BuilderConfig(
+            name="humaneval_infilling",
+            version=datasets.Version("1.0.0"),
+            description=_DESCRIPTION,
+        )
+    ]
+
+    def _info(self):
+        """Return the DatasetInfo: six string features plus module metadata."""
+        features = datasets.Features(
+            {
+                "task_id": datasets.Value("string"),
+                "prompt": datasets.Value("string"),
+                "suffix": datasets.Value("string"),
+                "canonical_solution": datasets.Value("string"),
+                "test": datasets.Value("string"),
+                "entry_point": datasets.Value("string"),
+            }
+        )
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            supervised_keys=None,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        """Download and extract the data file and expose it as the test split.
+
+        :param dl_manager:
+            Download manager supplied by ``datasets``; used to fetch ``_URL``.
+        :returns:
+            A one-element list holding the ``TEST`` SplitGenerator, whose
+            ``filepath`` kwarg is forwarded to ``_generate_examples``.
+        """
+        data_dir = dl_manager.download_and_extract(_URL)
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": data_dir,
+                },
+            )
+        ]
+
+    def _generate_examples(self, filepath):
+        """Yield ``(index, example)`` pairs, one per JSON line of ``filepath``.
+
+        :param filepath:
+            Path to the extracted JSONL file.
+        """
+        with open(filepath, encoding="utf-8") as file:
+            # Stream the file instead of materialising every record first;
+            # blank lines are skipped so stray whitespace cannot break parsing.
+            records = (json.loads(line) for line in file if line.strip())
+            yield from enumerate(records)
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -51,6 +51,7 @@ from . import asdiv
 from . import gsm8k
 from . import storycloze
 from . import humaneval
+from . import humaneval_infilling

 ########################################
 # Translation tasks
@@ -282,6 +283,8 @@ TASK_REGISTRY = {
    "blimp_wh_vs_that_with_gap_long_distance": blimp.BlimpWhVsThatWithGapLongDistance,
    # HumanEval
    "human_eval": humaneval.HumanEval,
+    "human_eval_infilling": humaneval_infilling.HumanEvalInfilling,
+    "prompted_infilling": humaneval_infilling.PromptedInfilling,
    # Requires manual download of data.
    # "storycloze_2016": storycloze.StoryCloze2016,
    # "storycloze_2018": storycloze.StoryCloze2018,

--- a/lm_eval/tasks/humaneval.py
+++ b/lm_eval/tasks/humaneval.py
@@ -64,7 +64,7 @@ class HumanEval(Task):
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
-        completion = [rf.greedy_until(ctx, ["\n\n"])] * 100
+        completion = [rf.greedy_until(ctx, ["\n\n"]) for i in range(1)]
        return completion

    def _is_correct(self, completion, doc):
@@ -82,12 +82,13 @@ class HumanEval(Task):
        """
        
        # log outputs to a jsonl file, for use with the official evaluation + execution script.
-        if os.environ['CODE_DUMP_PATH'] is not None:
+        if os.environ.get('CODE_DUMP_PATH', None) is not None:
            with open(f"{os.environ['CODE_DUMP_PATH']}", "a") as f:
                for completion in results:
                    f.write(json.dumps({"task_id": doc["task_id"], "completion": completion}) + "\n")
-        
-        return {"pass@1": self._is_correct(completion, doc), "pass@10": self._is_correct(completion, doc)}
+       
+        # execution code would go here if we allowed it, but we don't
+        return {"pass@1": self._is_correct(results, doc), "pass@10": self._is_correct(results, doc)}

    def aggregation(self):
        """

--- a/lm_eval/tasks/humaneval_infilling.py
+++ b/lm_eval/tasks/humaneval_infilling.py
+"""
+"Evaluating Large Language Models Trained on Code"
+https://arxiv.org/abs/2107.03374
+
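+HumanEval infilling variant: given the code before (`prompt`) and after
+(`suffix`) a removed span, the model must generate the missing middle.
+Data comes from OpenAI's human-eval-infilling repository (random-span file).
+
+Homepage: https://github.com/openai/human-eval-infilling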
+"""
+import os
+import json
+
+import lm_eval.datasets.humaneval.humaneval
+import inspect
+
+from lm_eval.base import Task, rf
+from lm_eval.metrics import mean
+
+
+_CITATION = """
+@article{chen2021codex,
+  title={Evaluating Large Language Models Trained on Code},
+  author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
+  year={2021},
+  eprint={2107.03374},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+"""
+
+
+class HumanEvalInfilling(Task):
+    """HumanEval infilling task using sentinel-token (SUF/PRE/MID) prompts.
+
+    The model is shown the suffix and then the prefix of a function and must
+    generate the missing middle span. Generated code is never executed here;
+    completions can be dumped to a JSONL file (see ``process_results``) for
+    use with the official evaluation + execution script.
+    """
+
+    VERSION = 0
+    # Path of the local dataset loading script module.
+    DATASET_PATH = inspect.getfile(lm_eval.datasets.humaneval.humaneval)
+    DATASET_NAME = None
+
+    def has_training_docs(self):
+        return False
+
+    def has_validation_docs(self):
+        return False
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        raise NotImplementedError
+
+    def validation_docs(self):
+        raise NotImplementedError
+
+    def test_docs(self):
+        """Return the test split of the loaded dataset."""
+        return self.dataset["test"]
+
+    def doc_to_text(self, doc):
+        """Build the prompt: suffix, then prefix, then the MID sentinel."""
+        return "<|SUF|>" + doc["suffix"] + "<|PRE|>" + doc["prompt"] + "<|MID|>"
+
+    def doc_to_target(self, doc):
+        """Return the reference text for the missing span."""
+        return doc["canonical_solution"]
+
+    def construct_requests(self, doc, ctx):
+        """Uses RequestFactory to construct Requests and returns an iterable of
+        Requests which will be sent to the LM.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param ctx: str
+            The context string, generated by fewshot_context. This includes the natural
+            language description, as well as the few shot examples, and the question
+            part of the document for `doc`.
+        :returns:
+            A one-element list: a single greedy generation request with an
+            empty list of stop sequences.
+        """
+        return [rf.greedy_until(ctx, [])]
+
+    def _is_correct(self, completion, doc):
+        """Placeholder scorer: always returns True because code is not executed here."""
+        return True
+
+    def process_results(self, doc, results):
+        """Take a single document and the LM results and evaluates, returning a
+        dict where keys are the names of submetrics and values are the values of
+        the metric for that one document
+
+        If the ``CODE_DUMP_INFILL_PATH`` environment variable is set to a
+        non-empty path, one JSON line per completion is appended to that file.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param results:
+            The results of the requests created in construct_requests.
+        """
+        # Log outputs to a jsonl file, for use with the official evaluation + execution script.
+        dump_path = os.environ.get("CODE_DUMP_INFILL_PATH")
+        if dump_path:
+            # Record the prompt this task actually builds, so subclasses that
+            # override ``doc_to_text`` log their own format rather than the
+            # sentinel-token one.
+            model_input = self.doc_to_text(doc)
+            with open(dump_path, "a", encoding="utf-8") as f:
+                for completion in results:
+                    record = {
+                        "task_id": doc["task_id"],
+                        "completion": completion,
+                        "input": model_input,
+                        "canonical_solution": doc["canonical_solution"],
+                    }
+                    f.write(json.dumps(record) + "\n")
+
+        # Execution code would go here if we allowed it, but we don't.
+        is_correct = self._is_correct(results, doc)
+        return {"pass@1": is_correct, "pass@10": is_correct}
+
+    def aggregation(self):
+        """
+        :returns: {str: [float] -> float}
+            A dictionary where keys are the names of submetrics and values are
+            functions that aggregate a list of metrics
+        """
+        return {"pass@1": mean, "pass@10": mean}
+
+    def higher_is_better(self):
+        """
+        :returns: {str: bool}
+            A dictionary where keys are the names of submetrics and values are
+            whether a higher value of the submetric is better
+        """
+        # Derive the keys from aggregation() so the two can never drift apart.
+        return {metric: True for metric in self.aggregation()}
+
+
+class PromptedInfilling(HumanEvalInfilling):
+    """Infilling variant that poses the task as a natural-language instruction."""
+
+    def doc_to_text(self, doc):
+        """Build the prompt: instruction line, code with the gap shown as '____', then a cue line."""
+        header = "# Please complete the blank given by '____' in the following function:\n"
+        footer = "\n# Write the missing code below:\n"
+        pieces = [header, doc["prompt"], "____", doc["suffix"], footer]
+        return "".join(pieces)
+
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -128,7 +128,6 @@ class Reorderer:
        arr.sort(key=lambda x: fn(x[1]))

        self.arr = arr
-        [print(x) for x in arr[:3]]

    def get_reordered(self):
        return [x[1] for x in self.arr]