"tools/dataset_converters/nuscenes_converter.py" did not exist on "8f88914da76edf6bea58980d57a64f077bb4fae3"
Commit 3041681f authored by silencealiang's avatar silencealiang
Browse files

init

parent 291fc518
Pipeline #2557 canceled with stages
"""
DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation
https://arxiv.org/pdf/2211.11501.pdf
DS-1000 is a code generation benchmark with a thousand data science questions spanning seven Python libraries that (1) reflects diverse, realistic, and practical use cases, (2) has a reliable metric, and (3) defends against memorization by perturbing questions.
Homepage: https://ds1000-code-gen.github.io/
"""
import fcntl
import functools
import io
import itertools
import pathlib
import warnings
import zipfile
import requests
import tqdm
from bigcode_eval.base import Task
_CITATION = """
@article{Lai2022DS1000,
title={DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation},
author={Yuhang Lai and Chengxi Li and Yiming Wang and Tianyi Zhang and Ruiqi Zhong and Luke Zettlemoyer and Scott Wen-tau Yih and Daniel Fried and Sida Wang and Tao Yu},
journal={ArXiv},
year={2022},
volume={abs/2211.11501}
}
"""
def create_all_tasks():
def create_task(key, mode):
class DS1000(GeneralDS1000):
def __init__(self, **kwargs):
super().__init__(key, mode, **kwargs)
return DS1000
return {
f"ds1000-{key.lower()}-{mode.lower()}": create_task(key, mode)
for key in [
"All",
"Numpy",
"Pandas",
"Scipy",
"Matplotlib",
"Sklearn",
"Tensorflow",
"Pytorch",
]
for mode in ["Completion", "Insertion"]
}
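# Illustrative note (not part of the original module): the registry above produces lowercase
# task names such as "ds1000-numpy-completion", "ds1000-pandas-insertion" or "ds1000-all-completion".
# Assuming the harness resolves tasks by name, instantiation would look roughly like:
#     task_cls = create_all_tasks()["ds1000-pandas-completion"]
#     task = task_cls()  # downloads the DS-1000 source and data on first use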
class GeneralDS1000(Task):
DATASET_PATH = None
DATASET_NAME = None
def __init__(self, key, mode):
super().__init__(
stop_words=["</code>", "# SOLUTION END"], requires_execution=True
)
self._key = key
self._mode = mode
if self._key == "Matplotlib" and self._mode == "Insertion":
warnings.warn("Insertion not supported for Matplotlib. Running Completion.")
self._mode = "Completion"
self._dir = pathlib.Path(__file__).parent / "ds"
self._dir.mkdir(parents=True, exist_ok=True)
self._src = self._dir / "ds1000.py"
self._data = self._dir / "ds1000_data"
self._download_source()
self._download_dataset()
def _download_source(self):
url = "https://github.com/HKUNLP/DS-1000/blob/49c1c543ada8b58138181333cdc62e613204efcf/ds1000.py?raw=true"
lock = self._src.with_suffix(".lock")
with open(lock, "w") as f_lock:
fcntl.flock(f_lock, fcntl.LOCK_EX)
if not self._src.exists():
warnings.warn(f"DS-1000 source is being saved to {self._src}.")
print("Downloading source code...")
r = requests.get(url, stream=True)
with open(self._src, "wb") as f_src:
f_src.write(r.content)
open(self._src.parent / "__init__.py", "w").close()
print("Done.")
fcntl.flock(f_lock, fcntl.LOCK_UN)
def _download_dataset(self):
url = "https://github.com/HKUNLP/DS-1000/blob/49c1c543ada8b58138181333cdc62e613204efcf/ds1000_data.zip?raw=true"
lock = self._data.with_suffix(".lock")
with open(lock, "w") as f_lock:
fcntl.flock(f_lock, fcntl.LOCK_EX)
if not self._data.exists():
warnings.warn(f"DS-1000 data is being saved to {self._data}.")
print("Downloading dataset...")
r = requests.get(url, stream=True)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall(self._dir)
print("Done.")
fcntl.flock(f_lock, fcntl.LOCK_UN)
@functools.lru_cache()
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
from .ds.ds1000 import DS1000Dataset
data = DS1000Dataset(self._data, mode=self._mode).data
if self._key == "All":
if self._mode == "Insertion":
warnings.warn(
"Insertion not supported for Matplotlib. Only running others."
)
data = {k: v for k, v in data.items() if k != "Matplotlib"}
dataset = list(itertools.chain(*data.values()))
else:
dataset = data[self._key]
return dataset
def get_prompt(self, doc):
"""
Builds the prompt for the LM to generate from.
:param doc: dict[str: str]
sample from the test dataset
:return: str | dict[str: str]
"""
if self._mode == "Completion":
return doc["prompt"]
elif self._mode == "Insertion":
prefix, suffix = doc["prompt"].split("[insert]")
prefix = f"{prefix.strip()}\n"
suffix = f"\n{suffix.strip()}\n"
return {"prefix": prefix, "suffix": suffix}
else:
raise ValueError(f"Invalid mode: {self._mode}")
def get_reference(self, doc):
"""
Builds the reference solution for the doc (sample from the test dataset).
:param doc: dict[str: str]
sample from the test dataset
:return: str
"""
return doc["reference_code"]
def postprocess_generation(self, generation, idx):
"""
Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int (if needed)
index of doc in the dataset to which the generation belongs
:return: str
"""
if self._mode == "Completion":
for start in ["BEGIN SOLUTION\n<code>", "# SOLUTION START"]:
try:
generation = generation.split(start, 1)[-1]
except IndexError:
pass
for stop in self.stop_words:
generation = generation.split(stop)[0]
return generation.strip()
def process_results(self, generations, references):
"""
Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations as in {"metric_name": result}.
We encourage directly loading the metric from the `evaluate` library to keep the code concise.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
list of str containing references
:return: dict[str: float]
"""
dataset = self.get_dataset()
num_correct = 0
print("Scoring generations...")
for i, ref in tqdm.tqdm(enumerate(references), total=len(references)):
test = [doc for doc in dataset if doc["reference_code"] == ref][0]
for gen in generations[i]:
is_correct = test.test(gen)
if is_correct:
num_correct += 1
accuracy = num_correct / len(references) / len(generations[0])
return {f"mean pass@1 accuracy ({len(generations[0])} samples)": accuracy}
{"danish":{"source1":"2 . Udfyld felterne i hvert trin i vejledningen . ","source2":"* Vise rapporter med finansposter og saldi . ","target1":"2 . Fill in the fields in each step of the guide . ","target2":"* View reports that show general ledger entries and balances . "},"chinese":{"source1":"返回 与 筛选器 初始化 由 平台 的 MCDRemoteSystemPlatformFilter 对象 。 ","source2":"用于 将 本地 的 ( 调用 ) 应用 程序 可 见性 首选 项 设置 发现 远程 系统 时 的 类 。 ","target1":"Returns an MCDRemoteSystemPlatformFilter object initialized with a filter by platform . ","target2":"A class used to set the local ( calling ) application visibility preference when discovering remote systems ."},"norwegian":{"source1":"Kosttypesaldo = Kostsentersaldo + Kostobjektsaldo ","source2":"* Vise en liste over bokføringsgrupper som du posterer til kontoen . ","target1":"Cost Type Balance = Cost Center Balance + Cost Object Balance ","target2":"* See a list of posting groups that post to that account . "},"latvian":{"source1":"# # &lt; a name = &quot; 6-change-the-status-of-the-conversion-record-to-ready &quot; &gt; &lt; / a &gt; 6 . Mainiet pārveidošanas ieraksta statusu uz Gatavs ","source2":"title : Preču saņemšanas reģistrēšana pirkšanas pasūtījumā ","target1":"# # 6 . Change the status of the conversion record to Ready ","target2":"title : Record the receipt of goods on the purchase order "}}
{"instruction1": "convert a list of integers into a single integer", "instruction2": "how to convert a datetime string back to datetime object?", "solution1": "r = int(''.join(map(str, x)))", "solution2": "datetime.datetime.strptime(str, '%m/%d/%Y')"}
{"instruction1": "get the distance of map coordinates to the center ", "instruction2": "check if details are parsed", "solution1": "float function ( int arg0 , int arg1 ) { int loc0 = arg0 - cx ; int loc1 = arg1 - cy ; return getSquaredDistance ( loc0 , loc1 ) ; }", "solution2": "boolean function ( ) { return isParsed ; }"}
{
"questions": ["Olivia has $23. She bought five bagels for $3 each. How much money does she have left?",
"Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?",
"There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?",
"Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?",
"Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?",
"Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?",
"If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?",
"There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?"],
"solutions": [" money_initial = 23\n bagels = 5\n bagel_cost = 3\n money_spent = bagels * bagel_cost\n money_left = money_initial - money_spent\n result = money_left\n return result",
" golf_balls_initial = 58\n golf_balls_lost_tuesday = 23\n golf_balls_lost_wednesday = 2\n golf_balls_left = golf_balls_initial - golf_balls_lost_tuesday - golf_balls_lost_wednesday\n result = golf_balls_left\n return result",
" computers_initial = 9\n computers_per_day = 5\n num_days = 4 # 4 days between monday and thursday\n computers_added = computers_per_day * num_days\n computers_total = computers_initial + computers_added\n result = computers_total\n return result",
" toys_initial = 5\n mom_toys = 2\n dad_toys = 2\n total_received = mom_toys + dad_toys\n total_toys = toys_initial + total_received\n result = total_toys\n return result",
" jason_lollipops_initial = 20\n jason_lollipops_after = 12\n denny_lollipops = jason_lollipops_initial - jason_lollipops_after\n result = denny_lollipops\n return result",
" leah_chocolates = 32\n sister_chocolates = 42\n total_chocolates = leah_chocolates + sister_chocolates\n chocolates_eaten = 35\n chocolates_left = total_chocolates - chocolates_eaten\n result = chocolates_left\n return result",
" cars_initial = 3\n cars_arrived = 2\n total_cars = cars_initial + cars_arrived\n result = total_cars\n return result",
" trees_initial = 15\n trees_after = 21\n trees_added = trees_after - trees_initial\n result = trees_added\n return result"]
}
"""PAL: Program-aided Language Models
https://arxiv.org/abs/2211.10435
GSM-8k: Training Verifiers to Solve Math Word Problems
https://arxiv.org/abs/2110.14168
In PAL, a Large Language Model solves reasoning problems that involve complex arithmetic and procedural tasks by generating
reasoning chains of text and code. This offloads the execution of the code to a program runtime, in our case a Python interpreter.
This task implements PAL methodology to evaluate GSM-8k and GSM-Hard benchmarks.
"""
import json
import os
import re
from enum import Enum
from typing import Union
from evaluate import load
from bigcode_eval.base import Task
from bigcode_eval.tasks.custom_metrics.pal_metric.pal_code_exec import compute
_CITATION = """
@article{gao2022pal,
title={PAL: Program-aided Language Models},
author={Gao, Luyu and Madaan, Aman and Zhou, Shuyan and Alon, Uri and Liu, Pengfei and Yang, Yiming and Callan, Jamie and Neubig, Graham},
journal={arXiv preprint arXiv:2211.10435},
year={2022}
}
@article{cobbe2021gsm8k,
title={Training Verifiers to Solve Math Word Problems},
author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
journal={arXiv preprint arXiv:2110.14168},
year={2021}
}
"""
# Number of few shot examples to consider
NUM_SHOTS = 8
class EvaluationType(str, Enum):
"""Possible values for evaluation type argument"""
GREEDY = "greedy"
MAJORITY_VOTING = "majority_voting"
def create_all_tasks():
"""Creates a dictionary of tasks for all evalution type
:return: {task_name: task}
e.g. {pal-gsm8k-greedy: Task, pal-gsm8k-majority_voting: Task}
"""
tasks = [Gsm8k, GsmHard]
eval_types = [et.value for et in EvaluationType]
return {
f"pal-{task.__name__.lower()}-{eval_type}": create_task(task, eval_type)
for eval_type in eval_types
for task in tasks
}
def create_task(cls, evaluation_type):
class Gsm(cls):
def __init__(self, **kwargs):
super().__init__(evaluation_type, **kwargs)
return Gsm
class Gsm8k(Task):
DATASET_PATH = "gsm8k"
DATASET_NAME = "main"
POST_SCRIPT = "print(solution())"
SPLIT = "test"
def __init__(
self, evaluation_type: Union[str, EvaluationType] = EvaluationType.GREEDY
):
"""
:param evaluation_type: Union[str,EvaluationType]
Type of evaluation to perform. The authors of PAL originally evaluated generations using greedy decoding and majority voting.
Values can be `greedy` or `majority_voting`
greedy: one generation is sampled using greedy decoding and evaluated against the references
majority_voting: the predicted answer is selected from multiple generations by majority voting and then evaluated.
"""
stop_words = ["\n\n\n"]
requires_execution = True
if evaluation_type == EvaluationType.MAJORITY_VOTING:
self.majority_voting = True
else:
self.majority_voting = False
super().__init__(stop_words, requires_execution)
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
if self.SPLIT:
return self.dataset[self.SPLIT]
return self.dataset
def fewshot_examples(self):
"""Loads and returns the few-shot examples for the task if they exist."""
with open(
"bigcode_eval/tasks/few_shot_examples/gsm8k_few_shot_prompts.json",
"r",
) as file:
examples = json.load(file)
return examples
@staticmethod
def few_shot_prompt(entry, text, examples):
"""Two shot prompt format as source & target language documentation"""
prompt = ""
for question, solution in zip(
examples["questions"][:NUM_SHOTS], examples["solutions"][:NUM_SHOTS]
):
prompt += f'''Q: {question}\n\n# solution in Python:\n\n\ndef solution():\n """{question}"""\n{solution}\n\n\n\n\n\n'''
prompt += f"""Q: {text}\n\n# solution in Python:\n\n\n"""
return entry + prompt
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
text = doc["question"]
entry = f""
examples = self.fewshot_examples()
prompt = self.few_shot_prompt(entry, text, examples)
return prompt
@staticmethod
def parse_target(txt):
def _is_num(txt):
try:
txt = txt.replace(",", "")
float(txt)
except ValueError:
return False
return True
txt = txt.strip()
if _is_num(txt):
txt = txt.replace(",", "")
try:
num = int(txt)
except ValueError:
num = float(txt)
return num
return txt
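# Hedged examples of parse_target's behaviour (inputs are illustrative):
#     parse_target("1,234")  -> 1234     (comma stripped, parsed as int)
#     parse_target("3.50")   -> 3.5      (falls back to float)
#     parse_target("eight")  -> "eight"  (non-numeric text is returned unchanged)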
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
_answer_delim = "#### "
target = doc["answer"].split(_answer_delim)[-1]
return self.parse_target(target)
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for this task)
"""
output = generation.split("# solution in Python:", NUM_SHOTS + 1)[-1].strip()
if "Q:" in output:
output = output.split("Q:")[0]
output += "\n" + self.POST_SCRIPT
return output
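# For illustration (assuming the generation echoes the few-shot format): everything after the
# final "# solution in Python:" marker is kept, anything from a newly started "Q:" onward is
# dropped, and "print(solution())" is appended so the program prints its answer when executed.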
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(float)
list of references
"""
results = compute(
references=references,
predictions=generations,
majority_voting=self.majority_voting,
)
return results
class GsmHard(Gsm8k):
DATASET_PATH = "reasoning-machines/gsm-hard"
DATASET_NAME = None
# the default split of GSMHARD - actually taken from test split of GSM dataset
SPLIT = "train"
def __init__(self, evaluation_type: str = EvaluationType.GREEDY):
"""
:param evaluation_type: str
Type of evaluation to perform. The authors of PAL originally evaluated generations using greedy decoding and majority voting.
Values can be `greedy` or `majority_voting`
greedy: one generation is sampled using greedy decoding and evaluated against the references
majority_voting: the predicted answer is selected from multiple generations by majority voting and then evaluated.
"""
super().__init__(evaluation_type)
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
text = doc["input"]
entry = ""
examples = self.fewshot_examples()
prompt = self.few_shot_prompt(entry, text, examples)
return prompt
def get_reference(self, doc):
return doc["target"]
"""Evaluating Large Language Models Trained on Code
https://arxiv.org/abs/2107.03374
The HumanEval dataset released by OpenAI includes 164 programming problems with a function signature,
docstring, body, and several unit tests.
They were handwritten to ensure they are not included in the training set of code generation models.
Homepage: https://github.com/openai/human-eval
"""
from bigcode_eval.base import Task
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval
_CITATION = """
@misc{chen2021evaluating,
title={Evaluating Large Language Models Trained on Code},
author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
year={2021},
eprint={2107.03374},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
"""
def create_all_tasks():
"""Creates a dictionary of tasks from a list of levels
:return: {task_name: task}
e.g. {multiple-py: Task, multiple-java: Task}
"""
return {"humaneval": create_task(True), "humaneval-unstripped": create_task(False)}
def create_task(strip_prompt):
class HumanEval(GeneralHumanEval):
def __init__(self, **kwargs):
super().__init__(strip_prompt, **kwargs)
return HumanEval
class GeneralHumanEval(Task):
"""A task represents an entire benchmark including its dataset, problems,
answers, generation settings and evaluation methods.
"""
DATASET_PATH = "openai_humaneval"
def __init__(self, strip_prompt, k=[1, 10, 100], num_workers=16, timeout=3.0):
super().__init__(
stop_words=["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif", "\n```", "<file_sep>"],
requires_execution=True,
)
self.strip_prompt = strip_prompt
self.k = k
self.num_workers = num_workers
self.timeout = timeout
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
return self.dataset["test"]
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
if self.strip_prompt:
return doc["prompt"].strip()
else:
return doc["prompt"]
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
test_func = doc["test"]
entry_point = f"check({doc['entry_point']})"
return "\n" + test_func + "\n" + entry_point
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for Humaneval-Task)
"""
prompt = self.get_prompt(self.dataset["test"][idx])
generation = generation[len(prompt) :]
return prompt + self._stop_at_stop_token(generation, self.stop_words)
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
list of str containing references
"""
results, _ = compute_code_eval(
references=references,
predictions=generations,
k=self.k,
num_workers=self.num_workers,
timeout=self.timeout,
)
return results
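# Example of the returned metrics (illustrative values; the exact pass@k keys depend on `k` and
# on how many generations per problem are available): {"pass@1": 0.37, "pass@10": 0.61}.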
import json
import re
from evaluate import load
from bigcode_eval.base import Task
_CITATION = """
@article{muennighoff2023octopack,
title={OctoPack: Instruction Tuning Code Large Language Models},
author={Niklas Muennighoff and Qian Liu and Armel Zebaze and Qinkai Zheng and Binyuan Hui and Terry Yue Zhuo and Swayam Singh and Xiangru Tang and Leandro von Werra and Shayne Longpre},
journal={arXiv preprint arXiv:2308.07124},
year={2023}
}
"""
LANGUAGES = ["python", "cpp", "js", "java", "go", "rust"]
LANGUAGE_TO_NAME = {
"python": "Python",
"cpp": "C++",
"js": "JavaScript",
"java": "Java",
"go": "Go",
"rust": "Rust",
}
LANGUAGE_TO_EXTENSION = {
"python": "py",
"cpp": "cpp",
"js": "js",
"java": "java",
"go": "go",
"rust": "rs",
}
# Taken from https://huggingface.co/datasets/nuprl/MultiPL-E/ & https://github.com/THUDM/CodeGeeX
LANGUAGE_TO_STOP_WORDS = {
# https://github.com/THUDM/CodeGeeX/blob/23ee51505a2bcd34d59d2e271b22e5bd91475462/codegeex/benchmark/utils.py#L164
"python": ["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif", "\nassert"],
# https://github.com/THUDM/CodeGeeX/blob/23ee51505a2bcd34d59d2e271b22e5bd91475462/codegeex/benchmark/utils.py#L185
"cpp": [],
# https://github.com/THUDM/CodeGeeX/blob/23ee51505a2bcd34d59d2e271b22e5bd91475462/codegeex/benchmark/utils.py#L188
"js": [],
# https://github.com/THUDM/CodeGeeX/blob/23ee51505a2bcd34d59d2e271b22e5bd91475462/codegeex/benchmark/utils.py#L177
"go": ["\n//", "\nfunc main(", "struct", "\nfunc"],
# https://github.com/THUDM/CodeGeeX/blob/23ee51505a2bcd34d59d2e271b22e5bd91475462/codegeex/benchmark/utils.py#L169
"java": [],
"rust": [],
}
LANGUAGE_TO_TIMEOUT = {
"python": 10,
"cpp": 60,
"js": 10,
"java": 10,
"go": 20,
"rust": 300, # Necessary for first-time compilation of cargo
}
# Java sometimes fails with more workers; For JS it's twice as fast with 4 workers
LANGUAGE_TO_NUM_WORKERS = {
"python": 4,
"cpp": 4,
"js": 4,
"java": 1,
"go": 4,
"rust": 1,
}
# https://github.com/THUDM/CodeGeeX/blob/23ee51505a2bcd34d59d2e271b22e5bd91475462/codegeex/benchmark/utils.py#L6
IMPORT_HELPER = {
"python": [
"import math",
"import re",
"import sys",
"import copy",
"import datetime",
"import itertools",
"import collections",
"import heapq",
"import statistics",
"import functools",
"import hashlib",
"import numpy",
"import numpy as np",
"import string",
"from typing import *",
"from collections import *",
],
"go": [
"math",
"strings",
"fmt",
"strconv",
"time",
"bytes",
"regexp",
"sort",
"math/rand",
"crypto/md5",
],
"cpp": [
"using namespace std;",
"#include<stdlib.h>",
"#include<algorithm>",
"#include<cmath>",
"#include<math.h>",
"#include<numeric>",
"#include<stdio.h>",
"#include<vector>",
"#include<set>",
"#include<map>",
"#include<queue>",
"#include<stack>",
"#include<list>",
"#include<deque>",
"#include<boost/any.hpp>",
"#include<string>",
"#include<climits>",
"#include<cstring>",
"#include<iostream>",
"#include<sstream>",
"#include<fstream>",
],
}
def create_all_tasks():
fix = {f"humanevalfix{mode}-{language}": create_task(language, "fix" + mode) for language in LANGUAGES for mode in ["tests", "docs"]}
explain = {f"humanevalexplain{mode}-{language}": create_task(language, "explain" + mode) for language in LANGUAGES for mode in ["describe", "synthesize"]}
synthesize = {f"humanevalsynthesize-{language}": create_task(language, "synthesize") for language in LANGUAGES}
return {**fix, **explain, **synthesize}
def create_task(language, name):
class HumanEvalFixTests(HumanEvalFixBase):
def __init__(self, language=language, prompt="instruct"):
super().__init__(language=language, prompt=prompt, with_docs=False)
class HumanEvalFixDocs(HumanEvalFixBase):
def __init__(self, language=language, prompt="instruct"):
super().__init__(language=language, prompt=prompt, with_docs=True)
class HumanEvalExplainDescribe(HumanEvalExplainDescribeBase):
def __init__(self, language=language, prompt="instruct"):
super().__init__(language=language, prompt=prompt, with_docs=False)
class HumanEvalExplainSynthesize(HumanEvalExplainSynthesizeBase):
def __init__(self, language=language, prompt="instruct", load_data_path=None):
super().__init__(language=language, prompt=prompt, with_docs=False, load_data_path=load_data_path)
class HumanEvalSynthesize(HumanEvalSynthesizeBase):
def __init__(self, language=language, prompt="instruct"):
super().__init__(language=language, prompt=prompt, with_docs=True)
if name == "fixtests": return HumanEvalFixTests
elif name == "fixdocs": return HumanEvalFixDocs
elif name == "explaindescribe": return HumanEvalExplainDescribe
elif name == "explainsynthesize": return HumanEvalExplainSynthesize
elif name == "synthesize": return HumanEvalSynthesize
class HumanEvalPack(Task):
"""Parent class for all HumanEvalPack tasks"""
DATASET_PATH = "bigcode/humanevalpack"
DATASET_NAME = None
def __init__(self, prompt="instruct", language="python", with_docs=True):
self.DATASET_NAME = language
self.prompt = prompt
stop_words = LANGUAGE_TO_STOP_WORDS[language]
if self.prompt.startswith("edit"):
stop_words.extend([
"<commit_before>",
"<commit_msg>",
"<commit_after>",
])
elif self.prompt == "starchat":
stop_words.append("<|end|>")
elif self.prompt == "diff":
stop_words = ["<commit_before>", "<commit_msg>", "<commit_after>"]
elif self.prompt == "diff-carper":
stop_words = ["<BEF>", "<MSG>", "<DFF>", "\ No newline at end of file"]
elif self.prompt == "issue":
stop_words.append("```")
stop_words.append("<|endoftext|>")
self.with_docs = with_docs
super().__init__(stop_words=stop_words, requires_execution=True)
def get_dataset(self):
return self.dataset["test"]
def get_prompt_base(self, doc):
if self.with_docs: return doc["prompt"] # Already includes fn main for rust
else:
if self.DATASET_NAME == "rust":
# See
# https://github.com/roG0d/CodeGeeX/blob/f66205b5f615a4eead9c26d7ec297e14738ea18d/codegeex/benchmark/evaluate_humaneval_x.py#L78
# https://github.com/THUDM/CodeGeeX/pull/76#issuecomment-1500653190
return "fn main(){}\n" + doc["declaration"]
else: return doc["declaration"]
def get_prompt(self, prompt_base, instruction, context=None):
if context is None:
inp = instruction
# `Context first then instruction` methods
elif self.prompt in ["continue", "instruct"]:
inp = context + "\n" + instruction
else:
inp = instruction + "\n" + context
if self.prompt == "continue":
assert context is None, "The `continue` prompt should only be used for HumanEvalSynthesize. Use `instruct` for HumanEvalFix and HumanEvalExplain."
prompt = prompt_base
elif self.prompt == "instruct":
prompt = inp + "\n\n" + prompt_base
elif self.prompt == "octocoder":
prompt = f'Question: {inp}\n\nAnswer:\n{prompt_base}'
elif self.prompt == "octogeex":
prompt = f'Question: {inp.strip()}\n\nAnswer:\n{prompt_base}'
elif self.prompt == "starchat":
# https://hf.co/HuggingFaceH4/starchat-beta
prompt = f'<|system|>\n<|end|>\n<|user|>\n{inp}<|end|>\n<|assistant|>\n{prompt_base}'
elif self.prompt == "starcodercommit":
prompt = f'<commit_before><commit_msg>{inp}<commit_after>{prompt_base}'
elif self.prompt == "instructcodet5p":
# https://github.com/salesforce/CodeT5/blob/main/CodeT5%2B/humaneval/generate_codet5p.py#L89
prompt = f'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{inp}\n\n### Response:{prompt_base}'
elif self.prompt == "wizardcoder":
# https://github.com/nlpxucan/WizardLM/blob/main/WizardCoder/src/humaneval_gen.py#L37
prompt = f'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{inp}\n\n### Response:\n{prompt_base}'
elif self.prompt == "codellama":
# https://hf.co/codellama
prompt = f"[INST] {inp.strip()} [/INST] {prompt_base}"
elif self.prompt in ["tulu", "gritlm"]:
# https://hf.co/GritLM/GritLM-7B
prompt = f"<|user|>\n{inp}\n<|assistant|>\n{prompt_base}"
elif self.prompt == "zephyr":
# https://hf.co/HuggingFaceH4/zephyr-7b-beta
prompt = f"<|user|>\n{inp}</s>\n<|assistant|>\n{prompt_base}"
elif self.prompt == "yi":
# https://hf.co/01-ai/Yi-34B-Chat
prompt = f"<|im_start|>user\n{inp}<|im_end|>\n<|im_start|>assistant\n{prompt_base}"
elif self.prompt == "codellama-70b":
prompt = f"Source: user\n\n {inp.strip()} Source: assistant\nDestination: user \n\n{prompt_base}"
elif self.prompt == "aurora-m":
prompt = f'### Instruction:\n{inp}\n### Response:\n{prompt_base}'
else:
raise ValueError(f"The --prompt argument {self.prompt} wasn't provided or isn't supported")
# Strip off the final \n to make the tokens more natural
# Essentially, we want to make sure that if there was no distinction between
# input & output, the tokens would be the same
# E.g. for SantaCoder:
# tokenize("""def hi()\n return""")
# ['def', 'Ġhi', '()', 'ĊĠĠ', 'Ġreturn']
# So we need to split before the \n so that the input is
# ['def', 'Ġhi', '()'] and the model can generate ['ĊĠĠ', 'Ġreturn']
# If instead we provide def hi()\n the tokens will be
# ['def', 'Ġhi', '()', 'Ċ'] and the model would need to generate ['ĠĠ', 'Ġreturn']
# Which would be harder, as it's not the usual way these tokens are tokenized
# i.e. the model has never seen the token sequence of ['()', 'Ċ', 'ĠĠ'], but only ['()', 'ĊĠĠ']
# The same holds for Java, JS, Go, Rust, C++ tho the start sequences are slightly different
return prompt.strip()
def get_reference(self, doc, get_solution=False):
if get_solution:
return doc["prompt"] + doc["canonical_solution"]
else:
return "\n" + doc["test"] # check(func_name) is already included
class HumanEvalPackGenerative(HumanEvalPack):
"""Parent class for all HumanEvalPack tasks except describing code"""
def check_fn(self, code):
"""
Checks whether the generated code is finished.
Problem: Models rarely split their code into multiple functions, but this stops the model after the 1st function.
Inspiration: https://github.com/THUDM/CodeGeeX/blob/23ee51505a2bcd34d59d2e271b22e5bd91475462/codegeex/benchmark/utils.py#L115
"""
if any([w in code for w in self.stop_words]): return True
# The heuristics below do not hold for diff generation
if (self.prompt.startswith("diff")): return False
if self.DATASET_NAME == "python":
for line in code.split("\n"):
if len(line.strip()) > 0 and line[0] != ' ' and line[0] != '\t':
return True
else:
open_brackets = 2 if self.DATASET_NAME == "java" else 1
if code.count("{") + open_brackets == code.count("}"):
return True
return False
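# Worked example of the bracket heuristic (toy Java completion, not from the dataset): the prompt
# already opened the class and method braces, so open_brackets starts at 2; a completion such as
# "return x;\n    }\n}" contains zero "{" and two "}", 0 + 2 == 2, so check_fn returns True.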
def remove_last_block(self, code):
"""
Adapted from https://github.com/THUDM/CodeGeeX/blob/23ee51505a2bcd34d59d2e271b22e5bd91475462/codegeex/benchmark/utils.py#L151
"""
for w in self.stop_words:
if w in code:
code = code[:code.find(w)]
### Find the first occasion where a chain of { } is closed
if self.DATASET_NAME == "python":
for i, line in enumerate(code.split("\n")):
if len(line.strip()) > 0 and line[0] != ' ' and line[0] != '\t':
return "\n".join(code.split("\n")[:i])
elif self.DATASET_NAME in ["java", "js", "go", "cpp", "rust"]:
open_brackets = 2 if self.DATASET_NAME == "java" else 1
cut = False
for i, c in enumerate(code):
if c == '{':
open_brackets += 1
elif c == '}':
open_brackets -= 1
if open_brackets == 0:
code = code[:i+1]
cut = True
break
if not cut:
if self.DATASET_NAME == "java":
main_pos = code.find("public static void main")
if main_pos != -1:
code = code[:main_pos] + '}'
if '}' in code:
code = code[:code.rfind('}')] + '}'
if code.count('{') - 1 == code.count('}'):
code += "\n}"
elif '}' in code:
code = code[:code.rfind('}')] + '}'
return code
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for Humaneval-Task)
"""
doc = self.get_dataset()[idx]
prompt = self.get_prompt(doc)
gen = self.remove_last_block(generation[len(prompt):].rstrip())
# Strip to maintain same behavior as with get_prompt
return doc["prompt"].rstrip() + gen
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
list of str containing references
"""
code_metric = load("Muennighoff/code_eval_octopack")
timeout = LANGUAGE_TO_TIMEOUT[self.DATASET_NAME]
num_workers = LANGUAGE_TO_NUM_WORKERS[self.DATASET_NAME]
language = self.DATASET_NAME if self.DATASET_NAME != "js" else "javascript"
### CUSTOM MUTATE METHOD CHANGES ###
if self.prompt == "diff":
# Requires:
# !wget https://raw.githubusercontent.com/google/diff-match-patch/master/python3/diff_match_patch.py
from diff_match_patch import diff_match_patch
dmp = diff_match_patch()
ds = self.get_dataset().select(range(len(generations)))
for gen, doc in zip(generations, ds):
prompt_base = self.get_prompt_base(doc)
old_code = prompt_base + doc["buggy_solution"]
for i, diff in enumerate(gen):
try:
# Strip away anything to the left such as \n
patches = dmp.patch_fromText(diff.lstrip())
fixed_code, _ = dmp.patch_apply(patches, old_code)
except Exception as e:
print(f"Failed with {e} when applying patch to buggy code: {diff}")
fixed_code = ""
gen[i] = fixed_code
elif self.prompt == "diff-carper":
from bigcode_eval.tasks.custom_metrics.diff_eval import apply_diff
ds = self.get_dataset().select(range(len(generations)))
for gen, doc in zip(generations, ds):
prompt_base = self.get_prompt_base(doc)
old_code = prompt_base + doc["buggy_solution"]
for i, diff_hunk in enumerate(gen):
if not(diff_hunk):
gen[i] = ""
continue
res: str = apply_diff(old_code, diff_hunk)
gen[i] = res
### CUSTOM PROG LANGUAGE CHANGES ###
# Inspiration: https://github.com/THUDM/CodeGeeX/blob/ebeb850f227a90c79de39f7e26b1302f374f3240/codegeex/benchmark/evaluate_humaneval_x.py
if language == "python":
python_imports = "\n".join(IMPORT_HELPER["python"])
generations = [
[(python_imports + "\n" + g).strip() for g in gen] for gen in generations
]
elif language == "cpp":
cpp_imports = "\n".join(IMPORT_HELPER["cpp"])
# Remove main in case present
generations = [
[(cpp_imports + "\n" + g.split("int main")[0]).strip() for g in gen] for gen in generations
]
elif language == "java":
generations = [
[g.replace("public class Main {\n }", "").strip() for g in gen] for gen in generations
]
elif language == "go":
ds = self.get_dataset().select(range(len(generations)))
for gen, ref, doc in zip(generations, references, ds):
for line in doc["import"].split("\n"):
line = line.replace("import", "").replace("(", "").replace(")", "").replace('"', "").strip()
if line: assert line in IMPORT_HELPER["go"], doc["import"] # Will be added later
test_setup_str = doc["test_setup"] + "\n"
for i, g in enumerate(gen):
for line in test_setup_str.split("\n"):
line = line.replace("import", "").replace("(", "").replace(")", "").strip()
if line.startswith('"') and line in g:
test_setup_str = test_setup_str.replace(line, "")
g = test_setup_str + g + "\n" + ref
other_pkgs = set()
for pkg in IMPORT_HELPER["go"]:
if ('"' + pkg + '"' not in g):
p = pkg.split("/")[-1]
# Check if the package is used
if (p + "." in g):
# The problem is that it could appear in a comment
# E.g. in problem 158, the docstring is:
# // ... a list of strings.
# but the "strings" pkg is never used
# Golang throws an error if the pkg is not used
# Thus search for the package & make sure it's not in a commented line
lines = g.split("\n")
for line in lines:
if (p + "." in line) and not(line.strip().startswith("//")):
other_pkgs.add('"' + p + '"')
break
other_pkgs_str = ""
if other_pkgs:
other_pkgs_str = "import (\n" + "\n".join([" " + p for p in other_pkgs]) + "\n)\n"
if ("package main" in gen[i]) and ("package main" in test_setup_str):
gen[i] = gen[i].replace("package main", "")
gen[i] = test_setup_str + other_pkgs_str + gen[i]
elif language == "rust":
ds = self.get_dataset().select(range(len(generations)))
main = "fn main(){}\n"
for gen, doc in zip(generations, ds):
declaration = doc["declaration"]
for i, g in enumerate(gen):
new_gen = ""
if "fn main()" not in g:
new_gen += main
for line in declaration.split("\n"):
if line.strip() not in g:
# Skip if the function is already present
if line.strip().startswith("fn") and (line.strip().split("(")[0]) in g:
continue
new_gen += line.strip() + "\n"
# If fn main() is present twice, cut off before the second one
g = "fn main()".join(g.split("fn main()")[0:2])
new_gen += g
gen[i] = new_gen
### EVALUATION ###
results, logs = code_metric.compute(
references=references,
predictions=generations,
language=language,
timeout=timeout,
num_workers=num_workers,
)
# Write logs to json
with open("logs.json", "w") as f:
json.dump(logs, f, indent=4, ensure_ascii=False)
"""Debugging help
for i, (gen, ref) in enumerate(zip(generations, references)):
import time
starttime = time.time()
results, log = code_metric.compute(
references=[ref],
predictions=[gen],
language=language,
timeout=timeout,
)
print("Took: ", time.time() - starttime)
with open("errors.txt", "a") as f:
f.write(log[0][0][1]["result"] + "\n")
if ("compilation error" in log[0][0][1]["result"]):
print("Result")
print(results)
print("Log")
print(log)
print("Gen")
print(gen[0])
print("Ref")
print(ref)
"""
return results
class HumanEvalFixBase(HumanEvalPackGenerative):
def get_filename_with_extension(self, input_file):
"""Returns the synthetic filename for different datasets"""
file_name = input_file if input_file is not None else "solution"
return file_name + "." + LANGUAGE_TO_EXTENSION[self.DATASET_NAME]
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
prompt_base = self.get_prompt_base(doc)
instruction = f'Fix bugs in {doc["entry_point"]}.'
context = prompt_base + doc["buggy_solution"]
if self.with_docs is False: # Add tests as source of ground truth
context += "\n" + doc["test"]
if self.prompt == "file":
file_name = self.get_filename_with_extension(input_file=doc["entry_point"])
prompt = f"<file_name>\n{file_name}\n<commit_before>\n{context}\n<commit_msg>\n{instruction}<commit_after>\n{prompt_base}"
elif self.prompt == "starcodercommit":
prompt = f"<commit_before>{context}<commit_msg>{instruction}<commit_after>{prompt_base}"
elif self.prompt == "diff":
prompt = f"<commit_before>{context}<commit_msg>{instruction}<commit_after>"
elif self.prompt == "diff-carper":
prompt = f"<NME> {self.get_filename_with_extension(input_file=doc['entry_point'])}\n"
prompt += f"<BEF> {context}\n<MSG> {instruction}\n<DFF>"
elif self.prompt == "issue":
prompt = f"<issue_start>username_0: {instruction}\n\n```{context}```\nUpvotes: 100<issue_comment>username_1: Sure, here is the fixed code.\n\n```{prompt_base}"
else:
prompt = super().get_prompt(prompt_base, instruction, context)
return prompt.strip()
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for Humaneval-Task)
"""
doc = self.get_dataset()[idx]
prompt = self.get_prompt(doc)
if self.prompt == "diff-carper":
# Only remove final stopwords like <MSG>
generation = self.remove_last_block(generation[len(prompt):].rstrip())
generation = prompt + generation
from bigcode_eval.tasks.custom_metrics.diff_eval import split_diff
# From https://github.com/CarperAI/OpenELM/blob/e6402a0696096011572152334ccbe049f89c332e/src/openelm/benchmarks/benchmark_bugs.py#L93
end_of_diff = re.compile("\n[^ +-@]+")
parsed: dict = split_diff(generation)
if parsed and all(
(s in parsed for s in ["name", "file", "message", "diff"])
):
# truncate diff hunk at the first line not starting with " ", "+", "-", or "@"
diff_hunk: str = end_of_diff.split(parsed["diff"])[0]
# We apply diff patch loosely:
# 1. it ignores the line numbers;
# 2. it ignores invalid lines (not starting with " ",
# "+" or "-" and not being "@@ ... @@").
# https://github.com/CarperAI/OpenELM/blob/e6402a0696096011572152334ccbe049f89c332e/src/openelm/benchmarks/benchmark_bugs.py#L162
nme_idx: int = diff_hunk.find("<NME>")
if nme_idx != -1:
diff_hunk = diff_hunk[:nme_idx]
return diff_hunk
else:
gen = self.remove_last_block(generation[len(prompt):].rstrip())
if self.prompt.startswith("diff"):
return gen
else:
# Strip on the right to maintain same behavior as with get_prompt
prompt_base = self.get_prompt_base(doc)
return prompt_base.rstrip() + gen
class HumanEvalExplainDescribeBase(HumanEvalPack):
def get_prompt_encoder(self, doc):
"""Encoder input for models with Enc-Dec architecture like CodeT5"""
assert self.prompt == "instructcodet5p", "Enc-Dec is only tested for InstructCodeT5+"
prompt_base = self.get_prompt_base(doc)
instruction = f"Provide a concise natural language description of the code using at most {len(doc['docstring'])} characters."
context = prompt_base + doc["canonical_solution"]
return super().get_prompt("", instruction, context) # No prompt base as not generating
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
prompt_base = self.get_prompt_base(doc)
instruction = f"Provide a concise natural language description of the code using at most {len(doc['docstring'])} characters."
context = prompt_base + doc["canonical_solution"]
return super().get_prompt("", instruction, context)
def remove_last_block(self, text):
for w in self.stop_words:
if w in text:
text = text[:text.find(w)]
return text
def remove_code(self, text, canonical_solution):
for line in canonical_solution.split("\n"):
line = line.strip()
if len(line) > 20 and line in text:
text = text.replace(line, "")
return text
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for Humaneval-Task)
"""
doc = self.get_dataset()[idx]
prompt = self.get_prompt(doc)
docstring_len = len(doc["docstring"])
gen = self.remove_last_block(generation[len(prompt):].strip()[:docstring_len]).rstrip()
gen = self.remove_code(gen, doc["canonical_solution"])
return gen
def get_reference(self, doc, get_solution=False):
return None
def process_results(self, generations, references):
raise ValueError("""ExplainDescribe should be run with `--generation_only`.
Once generations are done run ExplainSynthesize with `--load_data_path path/to/generations.json`
It will load the explanations, generate from them and evaluate.""")
class HumanEvalExplainSynthesizeBase(HumanEvalPackGenerative):
def __init__(self, load_data_path=None, **kwargs):
assert load_data_path is not None, "load_data_path must be specified to load the descriptions."
with open(load_data_path) as fp:
self.descriptions = json.load(fp)
print(f"{len(self.descriptions)} descriptions with {len(self.descriptions[0])} description candidates loaded.")
super().__init__(**kwargs)
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
dataset = []
for description, sample in zip(self.descriptions, self.dataset["test"]):
for description_candidate in description:
dataset.append({"description": description_candidate} | sample)
return dataset
def get_prompt_encoder(self, doc):
"""Encoder input for models with Enc-Dec architecture like CodeT5"""
assert self.prompt == "instructcodet5p", "Enc-Dec is only tested for InstructCodeT5+"
prompt_base = "" # No prompt base as not generating
instruction = f"Write functional code in {LANGUAGE_TO_NAME[self.DATASET_NAME]} according to the description."
context = doc["description"]
return super().get_prompt(prompt_base, instruction, context)
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
prompt_base = self.get_prompt_base(doc)
instruction = f"Write functional code in {LANGUAGE_TO_NAME[self.DATASET_NAME]} according to the description."
context = doc["description"]
return super().get_prompt(prompt_base, instruction, context)
class HumanEvalSynthesizeBase(HumanEvalPackGenerative):
def get_prompt_encoder(self, doc):
"""Encoder input for models with Enc-Dec architecture like CodeT5"""
assert self.prompt == "instructcodet5p", "Enc-Dec is only tested for InstructCodeT5+"
prompt_base = "" # No prompt base as not generating
instruction = doc["instruction"].strip()
return super().get_prompt(prompt_base, instruction)
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
prompt_base = self.get_prompt_base(doc)
instruction = doc["instruction"].strip()
return super().get_prompt(prompt_base, instruction)
"""Testing
from datasets import load_dataset
ds = load_dataset("bigcode/humaneval-x-bugs", "python")["test"]
idx = 0
def get_prompt_base(doc, language="python"):
# See
# https://github.com/roG0d/CodeGeeX/blob/f66205b5f615a4eead9c26d7ec297e14738ea18d/codegeex/benchmark/evaluate_humaneval_x.py#L78
# https://github.com/THUDM/CodeGeeX/pull/76#issuecomment-1500653190
if language == "rust":
main = "fn main(){}\n"
prompt_base = main + doc["declaration"] + doc["prompt"]
else:
prompt_base = doc["prompt"]
return prompt_base
prompt_base = get_prompt_base(ds[idx], language="python")
messages = [
{
"role": "user",
"content": ds[idx]["instruction"],
},
{
"role": "assistant",
"content": prompt_base,
},
]
# model: gpt-4-0613
response = openai.ChatCompletion.create(
model="gpt-4-0613",
messages=messages
)
"""
import os
import openai
import jsonlines
import termcolor
from cdifflib import CSequenceMatcher
from camel_converter import to_snake
from datasets import load_dataset
from typing import List
from tqdm import tqdm
_CITATION = """
@article{muennighoff2023octopack,
title={OctoPack: Instruction Tuning Code Large Language Models},
author={Niklas Muennighoff and Qian Liu and Armel Zebaze and Qinkai Zheng and Binyuan Hui and Terry Yue Zhuo and Swayam Singh and Xiangru Tang and Leandro von Werra and Shayne Longpre},
journal={arXiv preprint arXiv:2308.07124},
year={2023}
}
"""
LANGUAGE_TO_NAME = {
"python": "Python",
"cpp": "C++",
"js": "JavaScript",
"java": "Java",
"go": "Go",
"rust": "Rust",
}
def get_prompt_base(doc, language):
# See
# https://github.com/roG0d/CodeGeeX/blob/f66205b5f615a4eead9c26d7ec297e14738ea18d/codegeex/benchmark/evaluate_humaneval_x.py#L78
# https://github.com/THUDM/CodeGeeX/pull/76#issuecomment-1500653190
if language == "rust":
main = "fn main(){}\n"
prompt_base = main + doc["declaration"]
else:
prompt_base = doc["declaration"]
return prompt_base
def get_prompt_synthesize(doc, language="python"):
# addon = f"Start your code with:\n{get_prompt_base(sample, language)}"
# return doc["instruction"] + "\n" + addon # Results in worse performance for GPT4
return doc["instruction"] # Problem: Difficult for problems that have helper functions
def get_base_prompt_fix(doc, language="python", mode="tests"):
if language == "rust":
if mode == "tests":
return "fn main(){}\n" + doc["declaration"]
elif mode == "docs":
return "fn main(){}\n" + doc["declaration"] + doc["prompt"]
else:
raise ValueError
else:
if mode == "tests":
return doc["declaration"]
elif mode == "docs":
return doc["prompt"]
else:
raise ValueError
def get_prompt_fix(doc, language="python", mode="tests"):
prompt_base = get_base_prompt_fix(doc, language, mode)
func = prompt_base + doc["buggy_solution"]
instruction = f'Fix bugs in {doc["entry_point"]}.'
return func + "\n" + instruction
def get_prompt_explain_desc(doc, language="python"):
if language == "rust":
main = "fn main(){}\n"
prompt_base = main + doc["declaration"]
else:
prompt_base = doc["declaration"]
docstring_len = len(doc["docstring"])
instruction = f"Provide a concise natural language description of the code using at most {docstring_len} characters."
func = prompt_base + doc["canonical_solution"]
return instruction + "\n" + func, docstring_len
def get_prompt_explain_syn(sample, desc, language="python"):
instruction = f"Write functional code in {LANGUAGE_TO_NAME[language]} according to the description."
addon = f"Start your code with:\n{get_prompt_base(sample, language)}"
return desc + "\n" + instruction + "\n" + addon
class ParseError(Exception):
pass
class ContentParser:
@staticmethod
def _entry_point_variations(entry_point: str) -> List[str]:
# NOTE: workaround for the dataset's bug with entry point naming
return [
entry_point,
to_snake(entry_point),
entry_point[0].lower() + entry_point[1:],
]
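# Hedged example (hypothetical entry point): for "rollingMax" this returns
# ["rollingMax", "rolling_max", "rollingMax"], covering the snake_case and lowerCamelCase
# spellings the model might use instead of the dataset's name.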
def __call__(self, prompt: str, content: str, entry_point: str):
# NOTE: the model doesn't follow instructions directly:
# it adds a description of the change and sometimes fixes
# typos or other "bugs" in the description.
if "```" in content:
content = content.split("```")[1]
# first parse with assumption that content has description
matcher = CSequenceMatcher(None, prompt, content)
tag, _, _, j1, j2 = matcher.get_opcodes()[-1]
if tag == "insert":
return content[j1:j2]
# second parse content with assumption that model wrote code without description
for entry_point in self._entry_point_variations(entry_point):
if entry_point in content:
content = content.split(entry_point)[-1]
return "".join(content.splitlines(keepends=True)[1:])
raise ParseError(f"Prompt is not in content:\n{content}")
class ChatWrapper:
def __init__(self, model: str):
self._model = model
def __call__(self, prompt: str, n: int) -> str:
messages = [
{
"role": "user",
"content": prompt,
}
]
while True:
try:
response = openai.ChatCompletion.create(
model=self._model,
messages=messages,
temperature=0.2,
top_p=0.95,
n=n
)
content_list = list()
for i in range(n):
message = response["choices"][i]["message"]
assert message["role"] == "assistant"
content_list.append(message["content"])
return content_list
except Exception as e:
print("API EXCEPTION:", e)
if __name__ == '__main__':
TIMES = 1
VERBOSE = True
LANGUAGE = "python"
MODEL = "gpt-4-0613"
TASK = "humanevalsynthesize"
# Load descriptions
if TASK == "humanevalexplainsynthesize":
with jsonlines.open(f"completions_{LANGUAGE}_humanevalexplaindescribe.jsonl", "r") as f:
descriptions = [line["raw_generation"][0] for line in f]
openai.organization = os.getenv("OPENAI_ORGANIZATION")
openai.api_key = os.getenv("OPENAI_API_KEY")
samples = [s for s in load_dataset("bigcode/humanevalpack", LANGUAGE)["test"]]
chat_wrapper = ChatWrapper(MODEL)
parse_errors = 0
parser = ContentParser()
for idx, sample in enumerate(tqdm(samples)):
if TASK == "humanevalfix":
prompt = get_prompt_fix(sample, language=LANGUAGE, mode="tests")
elif TASK == "humanevalsynthesize":
prompt = get_prompt_synthesize(sample, language=LANGUAGE)
elif TASK == "humanevalexplaindescribe":
prompt, docstring_len = get_prompt_explain_desc(sample, language=LANGUAGE)
gen = chat_wrapper(prompt, TIMES)
sample["raw_generation"] = gen
sample["generation"] = [gen_item[:docstring_len] for gen_item in gen]
continue
elif TASK == "humanevalexplainsynthesize":
desc = descriptions[idx]
prompt = get_prompt_explain_syn(sample, desc, language=LANGUAGE)
if VERBOSE:
print(f"Processing {sample['task_id']} ({idx + 1}/{len(samples)}))...")
sample["raw_generation"] = chat_wrapper(prompt, TIMES)
try:
sample["generation"] = [parser(prompt, generation_item, sample["entry_point"]) for generation_item in sample["raw_generation"]]
except ParseError as e:
parse_errors += 1
print("PARSE EXCEPTION:", e)
sample["generation"] = [""]
if VERBOSE:
for i in range(TIMES):
print(termcolor.colored(sample["entry_point"], "yellow", attrs=["bold"]))
print(termcolor.colored(prompt, "yellow"))
print(termcolor.colored(sample["canonical_solution"], "red"))
print(termcolor.colored(sample["generation"][i], "green")+"\n\n")
if VERBOSE:
print("parse error rate:", parse_errors / len(samples))
results_filename = f"completions_{LANGUAGE}_{TASK}.jsonl"
with jsonlines.open(results_filename, "w") as writer:
writer.write_all(samples)
"""Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation
https://openreview.net/forum?id=1qvx610Cu7
The HumanEval+ dataset is created by the EvalPlus framework which extends the original HumanEval dataset
by adding more automatically generated test cases to each problem.
Homepage: https://github.com/evalplus/evalplus
"""
from warnings import warn
from bigcode_eval.tasks.humaneval import GeneralHumanEval
_CITATION = """
@inproceedings{evalplus,
title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
year = {2023},
url = {https://openreview.net/forum?id=1qvx610Cu7},
}
"""
class GeneralHumanEvalPlus(GeneralHumanEval):
"""A task represents an entire benchmark including its dataset, problems,
answers, generation settings and evaluation methods.
"""
DATASET_PATH = "evalplus/humanevalplus"
def __init__(self, strip_prompt, k=[1, 10, 100], num_workers=16, timeout=10.0):
if timeout < 10.0:
warn(
"It is suggested to have a longer timeout as HumanEval+ has lots of tests. "
f"The current timeout is {timeout}s while the suggested timeout is 10s."
)
super().__init__(strip_prompt, k, num_workers, timeout)
def create_task(strip_prompt):
class HumanEvalPlus(GeneralHumanEvalPlus):
def __init__(self, **kwargs):
super().__init__(strip_prompt, **kwargs)
return HumanEvalPlus
def create_all_tasks():
"""Creates a dictionary of tasks from a list of levels
:return: {task_name: task}
e.g. {multiple-py: Task, multiple-java: Task}
"""
return {
"humanevalplus": create_task(True),
"humanevalplus-unstripped": create_task(False),
}
"""Evaluating Large Language Models Trained on Code
https://arxiv.org/abs/2107.03374
The HumanEval dataset released by OpenAI includes 164 programming problems with a function signature,
docstring, body, and several unit tests.
They were handwritten to ensure they are not included in the training set of code generation models.
Homepage: https://github.com/openai/human-eval
"""
from bigcode_eval.base import Task
from bigcode_eval.utils import remove_after_return
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval
_CITATION = ""
def create_all_tasks():
"""Creates a dictionary of tasks corresponding for the 2 settings currently available
- instruction with code completion: we provide function signature/imports.. to the model after the instruction
- instruction to code generation: we only give the instruction without the function signature/imports..
"""
return {
"instruct-humaneval": InstructHumanEvalWithContext,
"instruct-humaneval-nocontext": InstructHumanEvalWithoutContext,
}
class InstructHumanEval(Task):
"""A task represents an entire benchmark including its dataset, problems,
answers, generation settings and evaluation methods.
"""
DATASET_PATH = "codeparrot/instructhumaneval"
DATASET_NAME = None
def __init__(self):
super().__init__(
stop_words=["if __name__", "\nprint", "\nclass"],
requires_execution=True,
)
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
return self.dataset["test"]
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
pass
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
test_func = doc["test"]
entry_point = f"check({doc['entry_point']})"
return "\n" + test_func + "\n" + entry_point
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
list of str containing references
"""
results, _ = compute_code_eval(
references=references,
predictions=generations,
)
return results
class InstructHumanEvalWithContext(InstructHumanEval):
def __init__(self):
super().__init__()
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
return {"instruction": doc["instruction"], "context": doc["context"]}
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for Humaneval-Task)
"""
generation = self._stop_at_stop_token(generation, self.stop_words)
function_name = self.get_dataset()["entry_point"][idx]
func_index = generation.find(f"def {function_name}")
return generation[0:func_index] + remove_after_return(generation[func_index:])
class InstructHumanEvalWithoutContext(InstructHumanEval):
def __init__(self):
super().__init__()
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
return {"instruction": doc["instruction"], "context": ""}
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for Humaneval-Task)
"""
example = self.get_dataset()[idx]
prompt, function_name = example["context"], example["entry_point"]
prefix = prompt[0 : prompt.find(f"def {function_name}")]
sep_index = generation.find("```")
        if sep_index != -1:
            if (
                generation[sep_index + len("```") : sep_index + len("```python")]
                == "python"
            ):
                generation = generation[sep_index + len("```python") :]
            else:
                generation = generation[sep_index + len("```") :]
generation = self._stop_at_stop_token(generation, self.stop_words)
func_index = generation.find(f"def {function_name}")
if func_index == -1:
func_index = 0
return_index = generation[func_index:].rfind(" return ")
if return_index == -1:
return_index = 0
j = func_index + return_index
n = len(generation)
while j < n and generation[j] != "\n":
j += 1
sep_index_2 = generation.find("```")
if sep_index_2 == -1:
return prefix.strip() + "\n" + generation[func_index:j]
else:
return prefix.strip() + "\n" + generation[func_index : min(j, sep_index_2)]
"""Instruction version of HumanEval used for WizardCoder Models evaluation
Evaluating Large Language Models Trained on Code
https://arxiv.org/abs/2107.03374
The HumanEval dataset released by OpenAI includes 164 programming problems with a function signature,
docstring, body, and several unit tests.
They were handwritten to ensure they are not included in the training set of code generation models.
Homepage: https://github.com/openai/human-eval
"""
from bigcode_eval.base import Task
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval
_CITATION = """
@misc{chen2021evaluating,
title={Evaluating Large Language Models Trained on Code},
author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
year={2021},
eprint={2107.03374},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
"""
def generate_prompt(input):
INSTRUCTION = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Create a Python script for this problem:
{input}
### Response:"""
return INSTRUCTION
class HumanEvalWizardCoder(Task):
"""A task represents an entire benchmark including its dataset, problems,
answers, generation settings and evaluation methods.
"""
DATASET_PATH = "openai_humaneval"
def __init__(self):
super().__init__(
stop_words=[],
requires_execution=True,
)
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
return self.dataset["test"]
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
prompt = doc["prompt"].replace(" ", "\t")
prompt = generate_prompt(prompt)
return prompt
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
test_func = doc["test"]
entry_point = f"check({doc['entry_point']})"
return "\n" + test_func + "\n" + entry_point
@staticmethod
def clean_comp(completion):
# adapted from https://github.com/nlpxucan/WizardLM/blob/main/WizardCoder/src/process_humaneval.py
if "```python" in completion:
def_line = completion.index("```python")
completion = completion[def_line:].strip()
completion = completion.replace("```python", "")
try:
next_line = completion.index("```")
completion = completion[:next_line].strip()
            except ValueError:
                # no closing ``` fence was generated; keep the rest of the completion
                pass
if '__name__ == "__main__"' in completion:
next_line = completion.index('if __name__ == "__main__":')
completion = completion[:next_line].strip()
if "# Example usage" in completion:
next_line = completion.index("# Example usage")
completion = completion[:next_line].strip()
if completion.startswith("Here's"):
completion = completion.split("\n")[1:]
completion = "\n".join(completion)
result = completion
return result
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for Humaneval-Task)
"""
generation = generation.split("### Response:")[-1]
generation = generation.replace("\t", " ")
generation = generation.split("</s>")[0]
generation = self.clean_comp(generation)
return generation
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
            list of str containing references
"""
results, _ = compute_code_eval(
references=references,
predictions=generations,
)
return results
"""Program Synthesis with Large Language Models
https://arxiv.org/abs/2108.07732
The benchmark consists of around 1,000 crowd-sourced Python programming problems,
designed to be solvable by entry level programmers, covering programming fundamentals,
standard library functionality, and so on. Each problem consists of a task description,
code solution and 3 automated test cases. As described in the paper, a subset of the data
has been hand-verified by the authors.
Homepage: https://github.com/google-research/google-research/tree/master/mbpp
"""
from bigcode_eval.base import Task
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval
_CITATION = """
@article{austin2021program,
title={Program Synthesis with Large Language Models},
author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},
journal={arXiv preprint arXiv:2108.07732},
year={2021}
}
"""
class MBPP(Task):
"""A task represents an entire benchmark including its dataset, problems,
answers, generation settings and evaluation methods.
"""
DATASET_PATH = "mbpp"
def __init__(self):
super().__init__(
stop_words=["\nclass", "\nassert", '\n"""', "\nprint", "\nif", "\n<|/", "\n```"],
requires_execution=True,
)
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
dataset = self.dataset["test"]
# the wrong split of mbpp can be loaded with old datasets cache
assert (
len(dataset) == 500
), "please ensure you have the latest version of MBPP dataset, try deleting its old cache"
return dataset
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from.
        MBPP prompt is built following the InCoder (Fried et al.) approach
prompt = docstring that includes one test
"""
description = doc["text"]
test_example = doc["test_list"][0]
prompt = f'"""\n{description}\n{test_example}\n"""\n'
return prompt
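    # Illustrative example (hypothetical doc, not taken from MBPP): for
    #   text = "Write a function to add two numbers."
    #   test_list[0] = "assert add(1, 2) == 3"
    # the prompt built above is:
    #   '"""\nWrite a function to add two numbers.\nassert add(1, 2) == 3\n"""\n'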
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
return "\n".join(doc["test_list"])
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
"""
prompt = self.get_prompt(self.dataset["test"][idx])
generation = generation[len(prompt) :]
return prompt + self._stop_at_stop_token(generation, self.stop_words)
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
            list of str containing references
"""
results, _ = compute_code_eval(
references=references,
predictions=generations,
)
return results
"""Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation
https://openreview.net/forum?id=1qvx610Cu7
The MBPP+ dataset is created by the EvalPlus framework which extends the original MBPP dataset
by adding more automatically generated test cases to each problem. Note MBPP+ only includes 399
tasks which are a subset of the original MBPP dataset. The subset is selected from the sanitized
MBPP (a subset of manually examined tasks by the original MBPP authors) and EvalPlus further
removes low-quality and ill-formed tasks for benchmark quality control.
Homepage: https://github.com/evalplus/evalplus
"""
import os
from bigcode_eval.tasks.mbpp import MBPP
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval
_CITATION = """
@inproceedings{evalplus,
title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
year = {2023},
url = {https://openreview.net/forum?id=1qvx610Cu7},
}
"""
class MBPPPlus(MBPP):
"""A task represents an entire benchmark including its dataset, problems,
answers, generation settings and evaluation methods.
"""
DATASET_PATH = "evalplus/mbppplus"
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from.
        MBPP prompt is built following the InCoder (Fried et al.) approach
prompt = docstring that includes one test
"""
description = doc["prompt"] # sanitized testset use "prompt" instead of "text"
test_example = doc["test_list"][0]
prompt = f'"""\n{description}\n{test_example}\n"""\n'
return prompt
# NOTE(@ganler): MBPP+ extends the original MBPP jsonl data with a "test" field which
# includes the testing code ready for execution. Note the "test" field
# is different from HumanEval(+) which further requires a `check` func
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
use_mbpp_tests = os.getenv("MBBPPLUS_USE_MBPP_TESTS", "0")
if use_mbpp_tests == "1":
return "\n".join(doc["test_list"])
return "\n" + doc["test"]
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
dataset = self.dataset["test"]
assert (
len(dataset) == 399
), "MBPP+ only has 399 problems. Please retry by deleting its old cache"
return dataset
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
            list of str containing references
"""
results, _ = compute_code_eval(
references=references,
predictions=generations,
timeout=10.0, # 10s timeout
)
return results
"""MultiPL-E: A Scalable and Extensible Approach to Benchmarking Neural Code Generation
https://arxiv.org/abs/2208.08227
MultiPL-E is a dataset for evaluating large language models for code generation that supports 18 programming languages.
It takes the OpenAI "HumanEval" and the MBPP Python benchmarks and uses little compilers to translate them to other languages.
Homepage: https://nuprl.github.io/MultiPL-E/
"""
import json
import os
import re
import tempfile
from multiprocessing import cpu_count
from pathlib import Path
from time import time
import numpy as np
from datasets import load_dataset
from tqdm import tqdm
from bigcode_eval.base import Task
from bigcode_eval.tasks.custom_metrics.multiple_metrics.evaluation import \
evaluate_problem
from bigcode_eval.tasks.custom_metrics.multiple_metrics.single_experiment_pass_k import \
for_file
_CITATION = """
@article{cassano2022scalable,
title={A Scalable and Extensible Approach to Benchmarking NL2Code for 18 Programming Languages},
author={Cassano, Federico and Gouwar, John and Nguyen, Daniel and Nguyen, Sydney and Phipps-Costin, Luna and Pinckney, Donald and Yee, Ming Ho and Zi, Yangtian and Anderson, Carolyn Jane and Feldman, Molly Q and others},
journal={arXiv preprint arXiv:2208.08227},
year={2022}
}
"""
LANGUAGES = [
"py",
"sh",
"cpp",
"cs",
"d",
"go",
"java",
"js",
"jl",
"lua",
"pl",
"php",
"r",
"rkt",
"rb",
"rs",
"scala",
"swift",
"ts",
]
def create_all_tasks():
"""Creates a dictionary of tasks from a list of levels
:return: {task_name: task}
e.g. {multiple-py: Task, multiple-java: Task}
"""
return {f"multiple-{language}": create_task(language) for language in LANGUAGES}
def create_task(language):
class MultiPLE(GeneralMultiPLE):
def __init__(self):
super().__init__(language)
return MultiPLE
class GeneralMultiPLE(Task):
"""A task represents an entire benchmark including its dataset, problems,
answers, generation settings and evaluation methods.
"""
DATASET_PATH = "nuprl/MultiPL-E"
DATASET_NAME = None
DATASET_REVISION = "d23b094346c5dbda1080a74bb2a24c18adbf7409"
def __init__(self, language):
self.language = language
self.DATASET_NAME = f"humaneval-{language}"
# we need the dataset to get stop words for each language
self.dataset = load_dataset(
GeneralMultiPLE.DATASET_PATH,
self.DATASET_NAME,
revision=self.DATASET_REVISION)
stop_words = self.dataset["test"][0]["stop_tokens"] + ["<file_sep>"]
super().__init__(
stop_words=stop_words,
requires_execution=True,
)
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
return self.dataset["test"]
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
return doc["prompt"].strip()
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
return doc["tests"]
@staticmethod
def remove_last_block(string, stop_words):
# Remove the last block of the code containing stop_words for HumanEval
string_list = re.split("(%s)" % "|".join(stop_words), string)
# last string should be ""
return "".join(string_list[:-2])
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for this task)
"""
prompt = self.get_prompt(self.get_dataset()[idx])
completion = generation[len(prompt) :]
return prompt + self._stop_at_stop_token(completion, self.stop_words)
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
            list of str containing references
"""
# get prompts and problem names
prompts_names = [
{"prompt": doc["prompt"], "name": doc["name"]}
for i, doc in enumerate(self.get_dataset())
if i < len(generations)
]
# a common temp dir for all the problems
temp_dir = tempfile.gettempdir()
list_files = []
for (prompt_name, generation, reference) in zip(
prompts_names, generations, references
):
problem = {
"name": prompt_name["name"],
"language": self.language,
"prompt": prompt_name["prompt"],
"completions": generation,
"tests": reference,
}
            # each problem is saved in a json file
temp_file_name = os.path.join(temp_dir, f"{prompt_name['name']}.json")
list_files.append(temp_file_name)
with open(temp_file_name, "wt") as f:
json.dump(problem, f)
print(
f"Saved {len(list_files)} problems in {temp_dir} for evaluation, each problem has {len(generations[0])} completions"
)
# execute the problems to evaluate them
max_workers = cpu_count() - 1 if cpu_count() > 1 else 1
for file in tqdm(list_files):
evaluate_problem(temp_dir, file, max_workers)
# compute pass@k scores
result_array = np.array(
[for_file(p) for p in Path(temp_dir).glob("*.results.json")]
)
result = result_array.mean(axis=0)
name = (
temp_dir.split("/")[-1]
if temp_dir.split("/")[-1] != ""
else temp_dir.split("/")[-2]
)
results = {
f"pass@{k}": v
for k, v in zip([1, 10, 100], result)
if k <= len(generations[0])
}
return results
"""Parity bug fixing task."""
import itertools
import re
from bigcode_eval.base import Task
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval
import tqdm
def mutate_code(
n_bugs: int = 5, task: str = "parity", prompt="prompt"
):
"""
Modified from https://github.com/CarperAI/OpenELM/blob/e6402a0696096011572152334ccbe049f89c332e/src/openelm/utils/code_eval.py
Mutate code to create n bugs. Output the prompt in diff format.
Args:
n_bugs: number of bugs to introduce (from 1 to 5).
task: (Optional) the task to be performed.
prompt: (Optional) 'diff', 'prompt' or 'edit'.
Returns:
template for code mutation
"""
mutation_templates = {
"diff": [
f"<NME> {task}.py\n<BEF> ",
"", # placeholder for the context, e.g., the buggy code
"\n<MSG> Fixed bugs",
],
"prompt_carper": [
"# A buggy implementation\n#!/usr/bin/python3\n",
"", # placeholder for the context, e.g., the buggy code
"\n# Fixed bugs\ndef",
],
"prompt": [
"#!/usr/bin/python3\n# A buggy implementation\n", # Fixed order
"", # placeholder for the context, e.g., the buggy code
"\n# Fixed bugs\ndef", # Past tense is key
],
"edit": [
"<commit_before>",
"", # placeholder for the context, e.g., the buggy code
"<commit_msg>Fix bugs<commit_after>",
],
}
mutation_template = mutation_templates[prompt]
if task == "parity":
variables = ["b", "b", "b", "b", 2]
for i in range(n_bugs):
variables[i] = "c" if i < 4 else 3
func_str = (
'def parity(b1,b2,b3,b4):\n """Return binary parity of a sequence of input bits.'
' Return 0 for even parity, 1 for odd parity."""\n bit_sum = sum(['
"{}1,{}2,{}3,{}4])\n return bit_sum % {}".format(*variables)
)
mutation_template[1] = func_str
return "".join(mutation_template)
else:
raise ValueError(f"Unknown task: {task}")
# https://github.com/CarperAI/OpenELM/blob/e6402a0696096011572152334ccbe049f89c332e/src/openelm/utils/code_eval.py#L131
def parity_reference(b1, b2, b3, b4):
"""
Return binary parity of a sequence of input bits.
Return 0 for even parity, 1 for odd parity.
"""
bit_sum = sum([b1, b2, b3, b4])
return bit_sum % 2
class Parity(Task):
def __init__(self, prompt="prompt"):
super().__init__(
stop_words=[
"\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif",
# Special cases for edit
"<commit_before>", "<commit_msg>", "<commit_after>", "<|endoftext|>",
],
requires_execution=True,
)
self.prompt = prompt
self.parity_tests = "assert " + " and ".join([
f"({parity_reference(*i)} == parity{i})" for i in itertools.product(range(2), repeat=4)
])
# Allow max 3 times the length of the prompt to
# allow the model to e.g. add some comments
self.max_length_multiplier = 3
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
return [1, 2, 3, 4, 5]
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
return mutate_code(n_bugs=doc, task="parity", prompt=self.prompt)
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
return []
@staticmethod
def first_block(string, stop_words):
"""Split off first block of code by scanning for class, def etc. on newlines."""
stop_words = [re.escape(word) for word in stop_words] # Escape e.g. | in <|endoftext|>
return re.split("|".join(stop_words), string)[0].rstrip()
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
"""
doc = self.get_dataset()[idx]
prompt = self.get_prompt(doc)
output = generation[len(prompt):]
if self.prompt.startswith("prompt"):
output = "def" + output # Add def which is in the prompt back to the output
return self.first_block(output, self.stop_words)
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
            list of str containing references
"""
out = {}
# Compute metrics for each number of bugs
for idx, gens in tqdm.tqdm(enumerate(generations), total=len(generations)):
results, _ = compute_code_eval(
references=[self.parity_tests for _ in gens],
predictions=[[g] for g in gens],
)
out[f"{idx+1} bugs"] = results
return out
"""Python Bugs
https://proceedings.mlr.press/v162/he22a.html
This dataset is taken from the preprocessing done by CarperAI (https://carper.ai/diff-models-a-new-way-to-edit-code).
It is uploaded here: https://huggingface.co/datasets/Muennighoff/python-bugs
Make sure to run with sufficient context length (512 is not enough for e.g. CodeGen).
"""
import re
from evaluate import load
from bigcode_eval.base import Task
import tqdm
_CITATION = """
@inproceedings{he2022distribution,
title={On distribution shift in learning-based bug detectors},
author={He, Jingxuan and Beurer-Kellner, Luca and Vechev, Martin},
booktitle={International Conference on Machine Learning},
pages={8559--8580},
year={2022},
organization={PMLR}
}
"""
MUTATE_TO_TASK_TO_PROMPT = {
"prompt_carper": {
"bin-op": "# Fixed binary operator",
"var-misuse": "# Fixed incorrect variable name",
},
"prompt_present": {
"bin-op": "# Fix binary operator",
"var-misuse": "# Fix incorrect variable name",
},
# Same as prompt_carper, but other parts are still different
"prompt": {
"bin-op": "# Fixed binary operator",
"var-misuse": "# Fixed incorrect variable name",
},
"edit": {
"bin-op": "Fix binary operator",
"var-misuse": "Fix incorrect variable name",
},
}
def mutate_code(input_code, task, prompt="prompt"):
"""
Create template for code mutation.
Args:
input_code: code to be mutated
task: task to be performed
prompt: (Optional) 'edit' or 'prompt'
Returns:
template for code mutation
"""
instruction = MUTATE_TO_TASK_TO_PROMPT[prompt][task]
if prompt == "prompt_carper":
return f"# A buggy implementation\n#!/usr/bin/python3\n{input_code}\n{instruction}\ndef"
if prompt == "prompt":
return f"#!/usr/bin/python3\n# A buggy implementation\n{input_code}\n{instruction}\ndef"
if prompt == "edit":
return f"<commit_before>{input_code}<commit_msg>{instruction}<commit_after>"
else:
raise ValueError(f"Unknown prompt: {prompt}")
class PythonBugs(Task):
DATASET_PATH = "Muennighoff/python-bugs"
def __init__(self, prompt="prompt"):
super().__init__(
# Correct code always starts with `def ...` and is a single function, so stop everything else
# Since a function always has a tab, stop when the first line does not have a tab
stop_words=[
"\nclass", "\n#", "\ndef", "\nassert", '\n"', "\nprint", "\nif",
# Special cases for edit
"<commit_before>", "<commit_msg>", "<commit_after>", "<|endoftext|>",
],
requires_execution=True,
)
self.max_length_multiplier = 2.25 # Allow 2.25 times the length of the prompt
self.prompt = prompt
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
dataset = self.dataset["train"]
return dataset
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
return mutate_code(doc["prompt_code"], doc["task"], self.prompt)
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
return doc["correct_code"]
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
"""
doc = self.get_dataset()[idx]
prompt = self.get_prompt(doc)
correct_code = self.get_reference(doc)
output = generation[len(prompt):]
if self.prompt.startswith("prompt"):
output = "def" + output # Add def which is in the prompt back to the output
return output[:len(correct_code)]
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
            list of str containing references
"""
num_correct = 0
print("Scoring generations...")
for i, ref in tqdm.tqdm(enumerate(references), total=len(references)):
for gen in generations[i]:
num_correct += int(gen == ref)
accuracy = num_correct / len(references) / len(generations[0])
return {"mean exact match": accuracy}
"""QuixBugs"""
import re
from bigcode_eval.base import Task
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval
_CITATION = """
@inproceedings{lin2017quixbugs,
title={QuixBugs: A multi-lingual program repair benchmark set based on the Quixey Challenge},
author={Lin, Derrick and Koppel, James and Chen, Angela and Solar-Lezama, Armando},
booktitle={Proceedings Companion of the 2017 ACM SIGPLAN international conference on systems, programming, languages, and applications: software for humanity},
pages={55--56},
year={2017}
}
"""
class QuixBugs(Task):
DATASET_PATH = "Muennighoff/quixbugs"
def __init__(self, prompt="prompt"):
self.prompt = prompt
if self.prompt == "edit":
self.stop_words = [
"<commit_before>",
"<commit_msg>",
"<commit_after>",
"<|endoftext|>",
]
elif self.prompt.startswith("prompt"):
self.stop_words = [
"\ndef",
"\nclass",
"\n#",
"\n@",
"\nprint",
"\nif",
"###",
"///",
"<|endoftext|>",
]
elif self.prompt.startswith("prompt_codex"):
# https://arxiv.org/pdf/2111.03922.pdf
self.stop_words = [
"\nclass", "###", "///", "<|endoftext|>",
]
else:
raise ValueError(f"Unknown prompt: {self.prompt}")
super().__init__(
stop_words=self.stop_words,
requires_execution=True,
)
self.max_length_multiplier = 3 # Allow 3 times the length of the prompt
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
return self.dataset["train"]
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
if self.prompt == "edit":
prompt = "<commit_before>" + doc["buggy_program"]
prompt += "<commit_msg>" + "Fix bug in " + doc["name"]
prompt += "<commit_after>"
elif self.prompt == "edit-openai":
return doc["buggy_program"], "Fix bug in " + doc["name"]
elif self.prompt == "prompt":
prompt = "# Buggy function"
prompt += "\n" + doc["buggy_program"] + "\n"
prompt += "# Fixed function\ndef"
elif self.prompt == "prompt_codex":
# https://arxiv.org/pdf/2111.03922.pdf, Prenner et al.
prompt = "### fix the bug in the following function"
prompt += "\n" + doc["buggy_program"] + "\n"
prompt += "### fixed function"
else:
raise ValueError(f"Unknown prompt: {prompt}")
return prompt.strip()
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
return (doc["name"], doc["tests"].strip())
@staticmethod
def remove_last_block(string, stop_words):
stop_words = [re.escape(word) for word in stop_words] # Escape e.g. | in <|endoftext|>
# Remove the last block of the code containing stop_words for HumanEval
string_list = re.split("(%s)" % "|".join(stop_words), string)
# last string should be ""
return "".join(string_list[:-2])
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for Humaneval-Task)
"""
doc = self.get_dataset()[idx]
prompt = self.get_prompt(doc)
generation = generation[len(prompt):]
if self.prompt == "prompt":
generation = "def" + generation # Add def which is in the prompt back to the output
return self.remove_last_block(generation, self.stop_words)
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
            list of str containing references
"""
results = {}
for i, (gen, (name, ref)) in enumerate(zip(generations, references)):
sub_results, _ = compute_code_eval(
references=[ref],
predictions=[gen],
timeout=10, # Levenshtein distance is slow
)
results[name] = sub_results
# Provide average of all metrics computed
if results:
results["all"] = {
k: sum(v[k] for v in results.values()) / len(results) for k in results[list(results.keys())[0]]
}
results["num_correct"] = results["all"]["pass@1"] * (len(results) - 1) # -1 for the all metric
return results
"""
ReCode: Robustness Evaluation of Code Generation Models
https://arxiv.org/abs/2212.10264
ReCode is a benchmark evaluating the robustness of code generation models to code and natural language perturbations.
This task allows running the released perturbed HumanEval benchmark and computing the robust-pass-at-k metric.
"""
from collections import defaultdict
from bigcode_eval.base import Task
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval
import numpy as np
_CITATION = """
@article{wang2022recode,
title={ReCode: Robustness Evaluation of Code Generation Models},
author={Wang, Shiqi and Li, Zheng and Qian, Haifeng and Yang, Chenghao and Wang, Zijian and Shang, Mingyue and Kumar, Varun and Tan, Samson and Ray, Baishakhi and Bhatia, Parminder and others},
journal={arXiv preprint arXiv:2212.10264},
year={2022}
}
"""
TRANSFORMATION_CATEGORIES = ["format", "func_name", "natgen", "nlaugmenter"]
def create_all_tasks():
"""Creates a dictionary of tasks from a list of levels
:return: {task_name: task}
e.g. {multiple-py: Task, multiple-java: Task}
"""
return {
f"perturbed-humaneval-{category}-num_seeds_{num_seeds}": create_task(
category, num_seeds
)
for category in TRANSFORMATION_CATEGORIES
for num_seeds in range(1, 11)
}
def create_task(category, num_seeds):
class PerturbedHumanEval(GeneralPerturbedHumanEval):
DATASET_NAME = category
def __init__(self):
super().__init__(category, num_seeds)
return PerturbedHumanEval
class GeneralPerturbedHumanEval(Task):
DATASET_PATH = "RaymondLi/perturbed_humaneval"
def __init__(self, category, num_seeds):
super().__init__(
stop_words=["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif"],
requires_execution=True,
)
# Transformation category
self.category = category
self.num_seeds = num_seeds
self.filtered_dataset = self.dataset["test"].filter(
lambda x: x["seed"] < num_seeds
)
def get_dataset(self):
"""
Returns dataset for the task or an iterable of any object, that get_prompt can handle
Only keep the first NUM_SEEDS seeds
"""
return self.filtered_dataset
def get_prompt(self, doc):
"""
Builds the prompt for the LM to generate from.
:param doc: dict[str: str]
sample from the test dataset
:return: str
"""
return doc["prompt"].strip()
def get_reference(self, doc):
"""
Builds the reference solution for the doc (sample from the test dataset).
Will be passed to the `process_results` function, and potentially saved.
:param doc: dict[str: str]
sample from the test dataset
:return: dict
"""
test_func = doc["test"]
entry_point = f"check({doc['entry_point']})"
test_code = "\n" + test_func + "\n" + entry_point
return {
"task_id": doc["task_id"],
"seed": doc["seed"],
"perturbation_name": doc["perturbation_name"],
"test_code": test_code,
}
def postprocess_generation(self, generation, idx):
"""
Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int (if needed)
index of doc in the dataset to which the generation belongs
:return: str
"""
prompt = self.get_prompt(self.filtered_dataset[idx])
generation = generation[len(prompt) :]
return prompt + self._stop_at_stop_token(generation, self.stop_words)
def process_results(self, generations, references):
"""
Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations as in {"metric_name": result}.
        We encourage directly loading the metric from the `evaluate` library to keep the code concise.
:param generations: list(list(str))
list of lists containing generations
:param references: list(dict)
            list of dict containing references
:return: dict[str: float]
"""
_, detailed_results = compute_code_eval(
references=[ref["test_code"] for ref in references],
predictions=generations,
)
# Compute robust-pass-at-1. For each transformation and each prompt, we have s=5 randomly perturbed prompts.
# With a single sample per prompt, RP@1 on a given transformation is the fraction of examples where completions
# for all the perturbed prompts are correct.
# With n samples per prompt, https://arxiv.org/abs/2212.10264 defines RP@1 as the average of the
# 1/n * sum_{i=1}^n I(all s correct for generation-seed i) over all prompts.
# An alternate could be the average of the
# prod_{j=1}^s 1/n * sum_{i=1}^n I(j-th prompt correct for generation-seed i) over all prompts.
# We compute RP@1 for each transformation
# transformation -> problem -> seed -> [n results]
transformation_problem_results = defaultdict(lambda: defaultdict(dict))
for i, ref in enumerate(references):
result = detailed_results[i]
result = [x[1]["passed"] for x in result]
assert (
ref["seed"]
not in transformation_problem_results[ref["perturbation_name"]][
ref["task_id"]
]
)
transformation_problem_results[ref["perturbation_name"]][ref["task_id"]][
ref["seed"]
] = result
rp1 = {}
for transformation, problem_results in transformation_problem_results.items():
res = {}
res["robust-pass-at-1"] = sum(
# results = {seed -> [n results]}
# 1/n * sum_{i=1}^n I(all s correct for generation-seed i)
float(all(results_)) / len(list(results.values())[0])
for results in problem_results.values()
for results_ in zip(*results.values())
) / len(problem_results)
res["alt-robust-pass-at-1"] = sum(
# results = {seed -> [n results]}
# prod_{j=1}^s 1/n * sum_{i=1}^n I(j-th prompt correct for generation-seed i)
np.prod([np.mean(results[j]) for j in results])
for results in problem_results.values()
) / len(problem_results)
rp1[transformation] = res
# TODO: for overall-performance, a prompt is solved if correct over the s prompts for all transformation categories.
return rp1
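# Worked toy example (hypothetical numbers) for the robust-pass-at-1 formulas above:
# one problem, s=2 perturbed prompts (seeds), n=2 generations per prompt, with
#   seed 0 -> [True, True]   and   seed 1 -> [True, False]
# Pairing generation i across seeds: i=0 has all seeds correct (1), i=1 does not (0),
# so robust-pass-at-1 = (1 + 0) / 2 = 0.5. The alternate metric multiplies per-seed
# pass rates: 1.0 * 0.5 = 0.5 as well in this case.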
from typing import Dict, List
from tqdm import tqdm
from bigcode_eval.base import Task
_CITATION = """
@article{allal2023santacoder,
title={SantaCoder: don't reach for the stars!},
author={Allal, Loubna Ben and Li, Raymond and Kocetkov, Denis and Mou, Chenghao and Akiki, Christopher and Ferrandis, Carlos Munoz and Muennighoff, Niklas and Mishra, Mayank and Gu, Alex and Dey, Manan and others},
journal={arXiv preprint arXiv:2301.03988},
year={2023}
}
"""
LANGUAGES = [
"py",
"js",
"java",
]
def create_all_tasks():
return {
"santacoder_fim": SantaCoderFIM,
"starcoder_fim": StarCoderFIM,
}
def initialize_empty_metrics(languages: List[str]) -> Dict[str, float]:
metrics = {}
for lang in languages:
metrics[f"n_accurate_{lang}"] = 0.0
metrics[f"n_count_{lang}"] = 0.0
return metrics
def aggregate_per_lang_accuracy(
metrics: Dict[str, float], languages: List[str]
) -> Dict[str, float]:
em_metrics = {}
for lang in languages:
# avoid div by 0
acc = (
metrics[f"n_accurate_{lang}"] / metrics[f"n_count_{lang}"]
if metrics[f"n_count_{lang}"]
else 0
)
em_metrics[f"{lang} Exact Match"] = acc
return em_metrics
class SantaCoderFIM(Task):
DATASET_PATH = "bigcode/santacoder-fim-task"
def __init__(
self,
fim_prefix: str = "<fim-prefix>",
fim_middle: str = "<fim-middle>",
fim_suffix: str = "<fim-suffix>",
stop_words: List[str] = ["<|endoftext|>", "<|filename|>"],
requires_execution: bool = False
):
super().__init__(
stop_words=stop_words,
requires_execution=requires_execution,
)
self.fim_prefix = fim_prefix
self.fim_middle = fim_middle
self.fim_suffix = fim_suffix
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
dataset = self.dataset["train"]
return dataset
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
return f"""{self.fim_prefix}{doc["prompt"]}{self.fim_suffix}{doc["suffix"]}{self.fim_middle}"""
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
return doc["canonical_solution"]
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
"""
doc = self.get_dataset()[idx]
prompt = self.get_prompt(doc)
output = generation[len(prompt) :]
return self._stop_at_stop_token(output, self.stop_words)
# return generation
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations as in {"metric_name": result}.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
            list of str containing references
:return: dict[str: float]
"""
metrics = initialize_empty_metrics(LANGUAGES)
for idx, (gen, reference) in tqdm(enumerate(zip(generations, references))):
language = self.get_dataset()[idx]["language"]
for g in gen:
metrics[f"n_accurate_{language}"] += int(g.strip() == reference.strip())
metrics[f"n_count_{language}"] += len(gen)
em_metrics = aggregate_per_lang_accuracy(metrics, LANGUAGES)
return em_metrics
class StarCoderFIM(SantaCoderFIM):
DATASET_PATH = "bigcode/santacoder-fim-task"
def __init__(self):
fim_prefix = "<fim_prefix>"
fim_middle = "<fim_middle>"
fim_suffix = "<fim_suffix>"
stop_words = ["<|endoftext|>", "<|filename|>", "<file_sep>"]
super().__init__(
stop_words=stop_words,
requires_execution=False,
fim_prefix=fim_prefix,
fim_middle=fim_middle,
fim_suffix=fim_suffix,
)