"tools/dataset_converters/nuscenes_converter.py" did not exist on "8f88914da76edf6bea58980d57a64f077bb4fae3"
Commit 3041681f authored by silencealiang's avatar silencealiang
Browse files

init

parent 291fc518
Pipeline #2557 canceled with stages
"""
DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation
https://arxiv.org/pdf/2211.11501.pdf
DS-1000 is a code generation benchmark with a thousand data science questions spanning seven Python libraries that (1) reflects diverse, realistic, and practical use cases, (2) has a reliable metric, and (3) defends against memorization by perturbing questions.
Homepage: https://ds1000-code-gen.github.io/
"""
import fcntl
import functools
import io
import itertools
import pathlib
import warnings
import zipfile
import requests
import tqdm
from bigcode_eval.base import Task
_CITATION = """
@article{Lai2022DS1000,
title={DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation},
author={Yuhang Lai and Chengxi Li and Yiming Wang and Tianyi Zhang and Ruiqi Zhong and Luke Zettlemoyer and Scott Wen-tau Yih and Daniel Fried and Sida Wang and Tao Yu},
journal={ArXiv},
year={2022},
volume={abs/2211.11501}
}
"""
def create_all_tasks():
def create_task(key, mode):
class DS1000(GeneralDS1000):
def __init__(self, **kwargs):
super().__init__(key, mode, **kwargs)
return DS1000
return {
f"ds1000-{key.lower()}-{mode.lower()}": create_task(key, mode)
for key in [
"All",
"Numpy",
"Pandas",
"Scipy",
"Matplotlib",
"Sklearn",
"Tensorflow",
"Pytorch",
]
for mode in ["Completion", "Insertion"]
}
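# Illustrative note (not part of the original module): the registry above produces lowercase
# task names such as "ds1000-numpy-completion", "ds1000-pandas-insertion" or "ds1000-all-completion".
# Assuming the harness resolves tasks by name, instantiation would look roughly like:
#     task_cls = create_all_tasks()["ds1000-pandas-completion"]
#     task = task_cls()  # downloads the DS-1000 source and data on first use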
class GeneralDS1000(Task):
DATASET_PATH = None
DATASET_NAME = None
def __init__(self, key, mode):
super().__init__(
stop_words=["</code>", "# SOLUTION END"], requires_execution=True
)
self._key = key
self._mode = mode
if self._key == "Matplotlib" and self._mode == "Insertion":
warnings.warn("Insertion not supported for Matplotlib. Running Completion.")
self._mode = "Completion"
self._dir = pathlib.Path(__file__).parent / "ds"
self._dir.mkdir(parents=True, exist_ok=True)
self._src = self._dir / "ds1000.py"
self._data = self._dir / "ds1000_data"
self._download_source()
self._download_dataset()
def _download_source(self):
url = "https://github.com/HKUNLP/DS-1000/blob/49c1c543ada8b58138181333cdc62e613204efcf/ds1000.py?raw=true"
lock = self._src.with_suffix(".lock")
with open(lock, "w") as f_lock:
fcntl.flock(f_lock, fcntl.LOCK_EX)
if not self._src.exists():
warnings.warn(f"DS-1000 source is being saved to {self._src}.")
print("Downloading source code...")
r = requests.get(url, stream=True)
with open(self._src, "wb") as f_src:
f_src.write(r.content)
open(self._src.parent / "__init__.py", "w").close()
print("Done.")
fcntl.flock(f_lock, fcntl.LOCK_UN)
def _download_dataset(self):
url = "https://github.com/HKUNLP/DS-1000/blob/49c1c543ada8b58138181333cdc62e613204efcf/ds1000_data.zip?raw=true"
lock = self._data.with_suffix(".lock")
with open(lock, "w") as f_lock:
fcntl.flock(f_lock, fcntl.LOCK_EX)
if not self._data.exists():
warnings.warn(f"DS-1000 data is being saved to {self._data}.")
print("Downloading dataset...")
r = requests.get(url, stream=True)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall(self._dir)
print("Done.")
fcntl.flock(f_lock, fcntl.LOCK_UN)
@functools.lru_cache()
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
from .ds.ds1000 import DS1000Dataset
data = DS1000Dataset(self._data, mode=self._mode).data
if self._key == "All":
if self._mode == "Insertion":
warnings.warn(
"Insertion not supported for Matplotlib. Only running others."
)
data = {k: v for k, v in data.items() if k != "Matplotlib"}
dataset = list(itertools.chain(*data.values()))
else:
dataset = data[self._key]
return dataset
def get_prompt(self, doc):
"""
Builds the prompt for the LM to generate from.
:param doc: dict[str: str]
sample from the test dataset
:return: str | dict[str: str]
"""
if self._mode == "Completion":
return doc["prompt"]
elif self._mode == "Insertion":
prefix, suffix = doc["prompt"].split("[insert]")
prefix = f"{prefix.strip()}\n"
suffix = f"\n{suffix.strip()}\n"
return {"prefix": prefix, "suffix": suffix}
else:
raise ValueError(f"Invalid mode: {self._mode}")
def get_reference(self, doc):
"""
Builds the reference solution for the doc (sample from the test dataset).
:param doc: dict[str: str]
sample from the test dataset
:return: str
"""
return doc["reference_code"]
def postprocess_generation(self, generation, idx):
"""
Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int (if needed)
index of doc in the dataset to which the generation belongs
:return: str
"""
if self._mode == "Completion":
for start in ["BEGIN SOLUTION\n<code>", "# SOLUTION START"]:
try:
generation = generation.split(start, 1)[-1]
except IndexError:
pass
for stop in self.stop_words:
generation = generation.split(stop)[0]
return generation.strip()
def process_results(self, generations, references):
"""
Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations as in {"metric_name": result}.
We encourage directly loading the metric from the `evaluate` library to keep the code concise.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
list of str containing references
:return: dict[str: float]
"""
dataset = self.get_dataset()
num_correct = 0
print("Scoring generations...")
for i, ref in tqdm.tqdm(enumerate(references), total=len(references)):
test = [doc for doc in dataset if doc["reference_code"] == ref][0]
for gen in generations[i]:
is_correct = test.test(gen)
if is_correct:
num_correct += 1
accuracy = num_correct / len(references) / len(generations[0])
return {f"mean pass@1 accuracy ({len(generations[0])} samples)": accuracy}
{"danish":{"source1":"2 . Udfyld felterne i hvert trin i vejledningen . ","source2":"* Vise rapporter med finansposter og saldi . ","target1":"2 . Fill in the fields in each step of the guide . ","target2":"* View reports that show general ledger entries and balances . "},"chinese":{"source1":"返回 与 筛选器 初始化 由 平台 的 MCDRemoteSystemPlatformFilter 对象 。 ","source2":"用于 将 本地 的 ( 调用 ) 应用 程序 可 见性 首选 项 设置 发现 远程 系统 时 的 类 。 ","target1":"Returns an MCDRemoteSystemPlatformFilter object initialized with a filter by platform . ","target2":"A class used to set the local ( calling ) application visibility preference when discovering remote systems ."},"norwegian":{"source1":"Kosttypesaldo = Kostsentersaldo + Kostobjektsaldo ","source2":"* Vise en liste over bokføringsgrupper som du posterer til kontoen . ","target1":"Cost Type Balance = Cost Center Balance + Cost Object Balance ","target2":"* See a list of posting groups that post to that account . "},"latvian":{"source1":"# # &lt; a name = &quot; 6-change-the-status-of-the-conversion-record-to-ready &quot; &gt; &lt; / a &gt; 6 . Mainiet pārveidošanas ieraksta statusu uz Gatavs ","source2":"title : Preču saņemšanas reģistrēšana pirkšanas pasūtījumā ","target1":"# # 6 . Change the status of the conversion record to Ready ","target2":"title : Record the receipt of goods on the purchase order "}}
{"instruction1": "convert a list of integers into a single integer", "instruction2": "how to convert a datetime string back to datetime object?", "solution1": "r = int(''.join(map(str, x)))", "solution2": "datetime.datetime.strptime(str, '%m/%d/%Y')"}
{"instruction1": "get the distance of map coordinates to the center ", "instruction2": "check if details are parsed", "solution1": "float function ( int arg0 , int arg1 ) { int loc0 = arg0 - cx ; int loc1 = arg1 - cy ; return getSquaredDistance ( loc0 , loc1 ) ; }", "solution2": "boolean function ( ) { return isParsed ; }"}
{
"questions": ["Olivia has $23. She bought five bagels for $3 each. How much money does she have left?",
"Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?",
"There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?",
"Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?",
"Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?",
"Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?",
"If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?",
"There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?"],
"solutions": [" money_initial = 23\n bagels = 5\n bagel_cost = 3\n money_spent = bagels * bagel_cost\n money_left = money_initial - money_spent\n result = money_left\n return result",
" golf_balls_initial = 58\n golf_balls_lost_tuesday = 23\n golf_balls_lost_wednesday = 2\n golf_balls_left = golf_balls_initial - golf_balls_lost_tuesday - golf_balls_lost_wednesday\n result = golf_balls_left\n return result",
" computers_initial = 9\n computers_per_day = 5\n num_days = 4 # 4 days between monday and thursday\n computers_added = computers_per_day * num_days\n computers_total = computers_initial + computers_added\n result = computers_total\n return result",
" toys_initial = 5\n mom_toys = 2\n dad_toys = 2\n total_received = mom_toys + dad_toys\n total_toys = toys_initial + total_received\n result = total_toys\n return result",
" jason_lollipops_initial = 20\n jason_lollipops_after = 12\n denny_lollipops = jason_lollipops_initial - jason_lollipops_after\n result = denny_lollipops\n return result",
" leah_chocolates = 32\n sister_chocolates = 42\n total_chocolates = leah_chocolates + sister_chocolates\n chocolates_eaten = 35\n chocolates_left = total_chocolates - chocolates_eaten\n result = chocolates_left\n return result",
" cars_initial = 3\n cars_arrived = 2\n total_cars = cars_initial + cars_arrived\n result = total_cars\n return result",
" trees_initial = 15\n trees_after = 21\n trees_added = trees_after - trees_initial\n result = trees_added\n return result"]
}
"""PAL: Program-aided Language Models
https://arxiv.org/abs/2211.10435
GSM-8k: Training Verifiers to Solve Math Word Problems
https://arxiv.org/abs/2110.14168
In PAL, a Large Language Model solves reasoning problems that involve complex arithmetic and procedural tasks by generating
reasoning chains of text and code. This offloads the execution of the code to a program runtime, in our case a Python interpreter.
This task implements PAL methodology to evaluate GSM-8k and GSM-Hard benchmarks.
"""
import json
import os
import re
from enum import Enum
from typing import Union
from evaluate import load
from bigcode_eval.base import Task
from bigcode_eval.tasks.custom_metrics.pal_metric.pal_code_exec import compute
_CITATION = """
@article{gao2022pal,
title={PAL: Program-aided Language Models},
author={Gao, Luyu and Madaan, Aman and Zhou, Shuyan and Alon, Uri and Liu, Pengfei and Yang, Yiming and Callan, Jamie and Neubig, Graham},
journal={arXiv preprint arXiv:2211.10435},
year={2022}
}
@article{cobbe2021gsm8k,
title={Training Verifiers to Solve Math Word Problems},
author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
journal={arXiv preprint arXiv:2110.14168},
year={2021}
}
"""
# Number of few shot examples to consider
NUM_SHOTS = 8
class EvaluationType(str, Enum):
"""Possible values for evaluation type argument"""
GREEDY = "greedy"
MAJORITY_VOTING = "majority_voting"
def create_all_tasks():
"""Creates a dictionary of tasks for all evalution type
:return: {task_name: task}
e.g. {pal-gsm8k-greedy: Task, pal-gsm8k-majority_voting: Task}
"""
tasks = [Gsm8k, GsmHard]
eval_types = [et.value for et in EvaluationType]
return {
f"pal-{task.__name__.lower()}-{eval_type}": create_task(task, eval_type)
for eval_type in eval_types
for task in tasks
}
def create_task(cls, evaluation_type):
class Gsm(cls):
def __init__(self, **kwargs):
super().__init__(evaluation_type, **kwargs)
return Gsm
class Gsm8k(Task):
DATASET_PATH = "gsm8k"
DATASET_NAME = "main"
POST_SCRIPT = "print(solution())"
SPLIT = "test"
def __init__(
self, evaluation_type: Union[str, EvaluationType] = EvaluationType.GREEDY
):
"""
:param evaluation_type: Union[str,EvaluationType]
Type of evaluation to perform. The authors of PAL originally evaluated generations using greedy decoding and majority voting.
Values can be `greedy` or `majority_voting`
greedy: one generation is sampled using greedy decoding and evaluated against the references
majority_voting: the predicted answer is selected from multiple generations by majority voting and then evaluated.
"""
stop_words = ["\n\n\n"]
requires_execution = True
if evaluation_type == EvaluationType.MAJORITY_VOTING:
self.majority_voting = True
else:
self.majority_voting = False
super().__init__(stop_words, requires_execution)
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
if self.SPLIT:
return self.dataset[self.SPLIT]
return self.dataset
def fewshot_examples(self):
"""Loads and returns the few-shot examples for the task if they exist."""
with open(
"bigcode_eval/tasks/few_shot_examples/gsm8k_few_shot_prompts.json",
"r",
) as file:
examples = json.load(file)
return examples
@staticmethod
def few_shot_prompt(entry, text, examples):
"""Two shot prompt format as source & target language documentation"""
prompt = ""
for question, solution in zip(
examples["questions"][:NUM_SHOTS], examples["solutions"][:NUM_SHOTS]
):
prompt += f'''Q: {question}\n\n# solution in Python:\n\n\ndef solution():\n """{question}"""\n{solution}\n\n\n\n\n\n'''
prompt += f"""Q: {text}\n\n# solution in Python:\n\n\n"""
return entry + prompt
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
text = doc["question"]
entry = f""
examples = self.fewshot_examples()
prompt = self.few_shot_prompt(entry, text, examples)
return prompt
@staticmethod
def parse_target(txt):
def _is_num(txt):
try:
txt = txt.replace(",", "")
float(txt)
except ValueError:
return False
return True
txt = txt.strip()
if _is_num(txt):
txt = txt.replace(",", "")
try:
num = int(txt)
except ValueError:
num = float(txt)
return num
return txt
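# Hedged examples of parse_target's behaviour (inputs are illustrative):
#     parse_target("1,234")  -> 1234     (comma stripped, parsed as int)
#     parse_target("3.50")   -> 3.5      (falls back to float)
#     parse_target("eight")  -> "eight"  (non-numeric text is returned unchanged)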
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
_answer_delim = "#### "
target = doc["answer"].split(_answer_delim)[-1]
return self.parse_target(target)
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for this task)
"""
output = generation.split("# solution in Python:", NUM_SHOTS + 1)[-1].strip()
if "Q:" in output:
output = output.split("Q:")[0]
output += "\n" + self.POST_SCRIPT
return output
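# For illustration (assuming the generation echoes the few-shot format): everything after the
# final "# solution in Python:" marker is kept, anything from a newly started "Q:" onward is
# dropped, and "print(solution())" is appended so the program prints its answer when executed.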
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(float)
list of references
"""
results = compute(
references=references,
predictions=generations,
majority_voting=self.majority_voting,
)
return results
class GsmHard(Gsm8k):
DATASET_PATH = "reasoning-machines/gsm-hard"
DATASET_NAME = None
# the default split of GSMHARD - actually taken from test split of GSM dataset
SPLIT = "train"
def __init__(self, evaluation_type: str = EvaluationType.GREEDY):
"""
:param evaluation_type: str
Type of evaluation to perform. The authors of PAL originally evaluated generations using greedy decoding and majority voting.
Values can be `greedy` or `majority_voting`
greedy: one generation is sampled using greedy decoding and evaluated against the references
majority_voting: the predicted answer is selected from multiple generations by majority voting and then evaluated.
"""
super().__init__(evaluation_type)
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
text = doc["input"]
entry = ""
examples = self.fewshot_examples()
prompt = self.few_shot_prompt(entry, text, examples)
return prompt
def get_reference(self, doc):
return doc["target"]
"""Evaluating Large Language Models Trained on Code
https://arxiv.org/abs/2107.03374
The HumanEval dataset released by OpenAI includes 164 programming problems with a function signature,
docstring, body, and several unit tests.
They were handwritten to ensure they are not included in the training set of code generation models.
Homepage: https://github.com/openai/human-eval
"""
from bigcode_eval.base import Task
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval
_CITATION = """
@misc{chen2021evaluating,
title={Evaluating Large Language Models Trained on Code},
author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
year={2021},
eprint={2107.03374},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
"""
def create_all_tasks():
"""Creates a dictionary of tasks from a list of levels
:return: {task_name: task}
e.g. {multiple-py: Task, multiple-java: Task}
"""
return {"humaneval": create_task(True), "humaneval-unstripped": create_task(False)}
def create_task(strip_prompt):
class HumanEval(GeneralHumanEval):
def __init__(self, **kwargs):
super().__init__(strip_prompt, **kwargs)
return HumanEval
class GeneralHumanEval(Task):
"""A task represents an entire benchmark including its dataset, problems,
answers, generation settings and evaluation methods.
"""
DATASET_PATH = "openai_humaneval"
def __init__(self, strip_prompt, k=[1, 10, 100], num_workers=16, timeout=3.0):
super().__init__(
stop_words=["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif", "\n```", "<file_sep>"],
requires_execution=True,
)
self.strip_prompt = strip_prompt
self.k = k
self.num_workers = num_workers
self.timeout = timeout
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
return self.dataset["test"]
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
if self.strip_prompt:
return doc["prompt"].strip()
else:
return doc["prompt"]
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
test_func = doc["test"]
entry_point = f"check({doc['entry_point']})"
return "\n" + test_func + "\n" + entry_point
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for Humaneval-Task)
"""
prompt = self.get_prompt(self.dataset["test"][idx])
generation = generation[len(prompt) :]
return prompt + self._stop_at_stop_token(generation, self.stop_words)
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
list of str containing references
"""
results, _ = compute_code_eval(
references=references,
predictions=generations,
k=self.k,
num_workers=self.num_workers,
timeout=self.timeout,
)
return results
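# Example of the returned metrics (illustrative values; the exact pass@k keys depend on `k` and
# on how many generations per problem are available): {"pass@1": 0.37, "pass@10": 0.61}.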
import json
import re
from evaluate import load
from bigcode_eval.base import Task
_CITATION = """
@article{muennighoff2023octopack,
title={OctoPack: Instruction Tuning Code Large Language Models},
author={Niklas Muennighoff and Qian Liu and Armel Zebaze and Qinkai Zheng and Binyuan Hui and Terry Yue Zhuo and Swayam Singh and Xiangru Tang and Leandro von Werra and Shayne Longpre},
journal={arXiv preprint arXiv:2308.07124},
year={2023}
}
"""
LANGUAGES = ["python", "cpp", "js", "java", "go", "rust"]
LANGUAGE_TO_NAME = {
"python": "Python",
"cpp": "C++",
"js": "JavaScript",
"java": "Java",
"go": "Go",
"rust": "Rust",
}
LANGUAGE_TO_EXTENSION = {
"python": "py",
"cpp": "cpp",
"js": "js",
"java": "java",
"go": "go",
"rust": "rs",
}
# Taken from https://huggingface.co/datasets/nuprl/MultiPL-E/ & https://github.com/THUDM/CodeGeeX
LANGUAGE_TO_STOP_WORDS = {
# https://github.com/THUDM/CodeGeeX/blob/23ee51505a2bcd34d59d2e271b22e5bd91475462/codegeex/benchmark/utils.py#L164
"python": ["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif", "\nassert"],
# https://github.com/THUDM/CodeGeeX/blob/23ee51505a2bcd34d59d2e271b22e5bd91475462/codegeex/benchmark/utils.py#L185
"cpp": [],
# https://github.com/THUDM/CodeGeeX/blob/23ee51505a2bcd34d59d2e271b22e5bd91475462/codegeex/benchmark/utils.py#L188
"js": [],
# https://github.com/THUDM/CodeGeeX/blob/23ee51505a2bcd34d59d2e271b22e5bd91475462/codegeex/benchmark/utils.py#L177
"go": ["\n//", "\nfunc main(", "struct", "\nfunc"],
# https://github.com/THUDM/CodeGeeX/blob/23ee51505a2bcd34d59d2e271b22e5bd91475462/codegeex/benchmark/utils.py#L169
"java": [],
"rust": [],
}
LANGUAGE_TO_TIMEOUT = {
"python": 10,
"cpp": 60,
"js": 10,
"java": 10,
"go": 20,
"rust": 300, # Necessary for first-time compilation of cargo
}
# Java sometimes fails with more workers; For JS it's twice as fast with 4 workers
LANGUAGE_TO_NUM_WORKERS = {
"python": 4,
"cpp": 4,
"js": 4,
"java": 1,
"go": 4,
"rust": 1,
}
# https://github.com/THUDM/CodeGeeX/blob/23ee51505a2bcd34d59d2e271b22e5bd91475462/codegeex/benchmark/utils.py#L6
IMPORT_HELPER = {
"python": [
"import math",
"import re",
"import sys",
"import copy",
"import datetime",
"import itertools",
"import collections",
"import heapq",
"import statistics",
"import functools",
"import hashlib",
"import numpy",
"import numpy as np",
"import string",
"from typing import *",
"from collections import *",
],
"go": [
"math",
"strings",
"fmt",
"strconv",
"time",
"bytes",
"regexp",
"sort",
"math/rand",
"crypto/md5",
],
"cpp": [
"using namespace std;",
"#include<stdlib.h>",
"#include<algorithm>",
"#include<cmath>",
"#include<math.h>",
"#include<numeric>",
"#include<stdio.h>",
"#include<vector>",
"#include<set>",
"#include<map>",
"#include<queue>",
"#include<stack>",
"#include<list>",
"#include<deque>",
"#include<boost/any.hpp>",
"#include<string>",
"#include<climits>",
"#include<cstring>",
"#include<iostream>",
"#include<sstream>",
"#include<fstream>",
],
}
def create_all_tasks():
fix = {f"humanevalfix{mode}-{language}": create_task(language, "fix" + mode) for language in LANGUAGES for mode in ["tests", "docs"]}
explain = {f"humanevalexplain{mode}-{language}": create_task(language, "explain" + mode) for language in LANGUAGES for mode in ["describe", "synthesize"]}
synthesize = {f"humanevalsynthesize-{language}": create_task(language, "synthesize") for language in LANGUAGES}
return {**fix, **explain, **synthesize}
def create_task(language, name):
class HumanEvalFixTests(HumanEvalFixBase):
def __init__(self, language=language, prompt="instruct"):
super().__init__(language=language, prompt=prompt, with_docs=False)
class HumanEvalFixDocs(HumanEvalFixBase):
def __init__(self, language=language, prompt="instruct"):
super().__init__(language=language, prompt=prompt, with_docs=True)
class HumanEvalExplainDescribe(HumanEvalExplainDescribeBase):
def __init__(self, language=language, prompt="instruct"):
super().__init__(language=language, prompt=prompt, with_docs=False)
class HumanEvalExplainSynthesize(HumanEvalExplainSynthesizeBase):
def __init__(self, language=language, prompt="instruct", load_data_path=None):
super().__init__(language=language, prompt=prompt, with_docs=False, load_data_path=load_data_path)
class HumanEvalSynthesize(HumanEvalSynthesizeBase):
def __init__(self, language=language, prompt="instruct"):
super().__init__(language=language, prompt=prompt, with_docs=True)
if name == "fixtests": return HumanEvalFixTests
elif name == "fixdocs": return HumanEvalFixDocs
elif name == "explaindescribe": return HumanEvalExplainDescribe
elif name == "explainsynthesize": return HumanEvalExplainSynthesize
elif name == "synthesize": return HumanEvalSynthesize
class HumanEvalPack(Task):
"""Parent class for all HumanEvalPack tasks"""
DATASET_PATH = "bigcode/humanevalpack"
DATASET_NAME = None
def __init__(self, prompt="instruct", language="python", with_docs=True):
self.DATASET_NAME = language
self.prompt = prompt
stop_words = LANGUAGE_TO_STOP_WORDS[language]
if self.prompt.startswith("edit"):
stop_words.extend([
"<commit_before>",
"<commit_msg>",
"<commit_after>",
])
elif self.prompt == "starchat":
stop_words.append("<|end|>")
elif self.prompt == "diff":
stop_words = ["<commit_before>", "<commit_msg>", "<commit_after>"]
elif self.prompt == "diff-carper":
stop_words = ["<BEF>", "<MSG>", "<DFF>", "\ No newline at end of file"]
elif self.prompt == "issue":
stop_words.append("```")
stop_words.append("<|endoftext|>")
self.with_docs = with_docs
super().__init__(stop_words=stop_words, requires_execution=True)
def get_dataset(self):
return self.dataset["test"]
def get_prompt_base(self, doc):
if self.with_docs: return doc["prompt"] # Already includes fn main for rust
else:
if self.DATASET_NAME == "rust":
# See
# https://github.com/roG0d/CodeGeeX/blob/f66205b5f615a4eead9c26d7ec297e14738ea18d/codegeex/benchmark/evaluate_humaneval_x.py#L78
# https://github.com/THUDM/CodeGeeX/pull/76#issuecomment-1500653190
return "fn main(){}\n" + doc["declaration"]
else: return doc["declaration"]
def get_prompt(self, prompt_base, instruction, context=None):
if context is None:
inp = instruction
# `Context first then instruction` methods
elif self.prompt in ["continue", "instruct"]:
inp = context + "\n" + instruction
else:
inp = instruction + "\n" + context
if self.prompt == "continue":
assert context is None, "The `continue` prompt should only be used for HumanEvalSynthesize. Use `instruct` for HumanEvalFix and HumanEvalExplain."
prompt = prompt_base
elif self.prompt == "instruct":
prompt = inp + "\n\n" + prompt_base
elif self.prompt == "octocoder":
prompt = f'Question: {inp}\n\nAnswer:\n{prompt_base}'
elif self.prompt == "octogeex":
prompt = f'Question: {inp.strip()}\n\nAnswer:\n{prompt_base}'
elif self.prompt == "starchat":
# https://hf.co/HuggingFaceH4/starchat-beta
prompt = f'<|system|>\n<|end|>\n<|user|>\n{inp}<|end|>\n<|assistant|>\n{prompt_base}'
elif self.prompt == "starcodercommit":
prompt = f'<commit_before><commit_msg>{inp}<commit_after>{prompt_base}'
elif self.prompt == "instructcodet5p":
# https://github.com/salesforce/CodeT5/blob/main/CodeT5%2B/humaneval/generate_codet5p.py#L89
prompt = f'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{inp}\n\n### Response:{prompt_base}'
elif self.prompt == "wizardcoder":
# https://github.com/nlpxucan/WizardLM/blob/main/WizardCoder/src/humaneval_gen.py#L37
prompt = f'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{inp}\n\n### Response:\n{prompt_base}'
elif self.prompt == "codellama":
# https://hf.co/codellama
prompt = f"[INST] {inp.strip()} [/INST] {prompt_base}"
elif self.prompt in ["tulu", "gritlm"]:
# https://hf.co/GritLM/GritLM-7B
prompt = f"<|user|>\n{inp}\n<|assistant|>\n{prompt_base}"
elif self.prompt == "zephyr":
# https://hf.co/HuggingFaceH4/zephyr-7b-beta
prompt = f"<|user|>\n{inp}</s>\n<|assistant|>\n{prompt_base}"
elif self.prompt == "yi":
# https://hf.co/01-ai/Yi-34B-Chat
prompt = f"<|im_start|>user\n{inp}<|im_end|>\n<|im_start|>assistant\n{prompt_base}"
elif self.prompt == "codellama-70b":
prompt = f"Source: user\n\n {inp.strip()} Source: assistant\nDestination: user \n\n{prompt_base}"
elif self.prompt == "aurora-m":
prompt = f'### Instruction:\n{inp}\n### Response:\n{prompt_base}'
else:
raise ValueError(f"The --prompt argument {self.prompt} wasn't provided or isn't supported")
# Strip off the final \n to make the tokens more natural
# Essentially, we want to make sure that if there was no distinction between
# input & output, the tokens would be the same
# E.g. for SantaCoder:
# tokenize("""def hi()\n return""")
# ['def', 'Ġhi', '()', 'ĊĠĠ', 'Ġreturn']
# So we need to split before the \n so that the input is
# ['def', 'Ġhi', '()'] and the model can generate ['ĊĠĠ', 'Ġreturn']
# If instead we provide def hi()\n the tokens will be
# ['def', 'Ġhi', '()', 'Ċ'] and the model would need to generate ['ĠĠ', 'Ġreturn']
# Which would be harder, as it's not the usual way these tokens are tokenized
# i.e. the model has never seen the token sequence of ['()', 'Ċ', 'ĠĠ'], but only ['()', 'ĊĠĠ']
# The same holds for Java, JS, Go, Rust, C++ tho the start sequences are slightly different
return prompt.strip()
def get_reference(self, doc, get_solution=False):
if get_solution:
return doc["prompt"] + doc["canonical_solution"]
else:
return "\n" + doc["test"] # check(func_name) is already included
class HumanEvalPackGenerative(HumanEvalPack):
"""Parent class for all HumanEvalPack tasks except describing code"""
def check_fn(self, code):
"""
Checks whether the generated code is finished.
Problem: Models rarely split their code into multiple functions, but this stops the model after the 1st function.
Inspiration: https://github.com/THUDM/CodeGeeX/blob/23ee51505a2bcd34d59d2e271b22e5bd91475462/codegeex/benchmark/utils.py#L115
"""
if any([w in code for w in self.stop_words]): return True
# The heuristics below do not hold for diff generation
if (self.prompt.startswith("diff")): return False
if self.DATASET_NAME == "python":
for line in code.split("\n"):
if len(line.strip()) > 0 and line[0] != ' ' and line[0] != '\t':
return True
else:
open_brackets = 2 if self.DATASET_NAME == "java" else 1
if code.count("{") + open_brackets == code.count("}"):
return True
return False
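# Worked example of the bracket heuristic (toy Java completion, not from the dataset): the prompt
# already opened the class and method braces, so open_brackets starts at 2; a completion such as
# "return x;\n    }\n}" contains zero "{" and two "}", 0 + 2 == 2, so check_fn returns True.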
def remove_last_block(self, code):
"""
Adapted from https://github.com/THUDM/CodeGeeX/blob/23ee51505a2bcd34d59d2e271b22e5bd91475462/codegeex/benchmark/utils.py#L151
"""
for w in self.stop_words:
if w in code:
code = code[:code.find(w)]
### Find the first occasion where a chain of { } is closed
if self.DATASET_NAME == "python":
for i, line in enumerate(code.split("\n")):
if len(line.strip()) > 0 and line[0] != ' ' and line[0] != '\t':
return "\n".join(code.split("\n")[:i])
elif self.DATASET_NAME in ["java", "js", "go", "cpp", "rust"]:
open_brackets = 2 if self.DATASET_NAME == "java" else 1
cut = False
for i, c in enumerate(code):
if c == '{':
open_brackets += 1
elif c == '}':
open_brackets -= 1
if open_brackets == 0:
code = code[:i+1]
cut = True
break
if not cut:
if self.DATASET_NAME == "java":
main_pos = code.find("public static void main")
if main_pos != -1:
code = code[:main_pos] + '}'
if '}' in code:
code = code[:code.rfind('}')] + '}'
if code.count('{') - 1 == code.count('}'):
code += "\n}"
elif '}' in code:
code = code[:code.rfind('}')] + '}'
return code
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for Humaneval-Task)
"""
doc = self.get_dataset()[idx]
prompt = self.get_prompt(doc)
gen = self.remove_last_block(generation[len(prompt):].rstrip())
# Strip to maintain same behavior as with get_prompt
return doc["prompt"].rstrip() + gen
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
list of str containing references
"""
code_metric = load("Muennighoff/code_eval_octopack")
timeout = LANGUAGE_TO_TIMEOUT[self.DATASET_NAME]
num_workers = LANGUAGE_TO_NUM_WORKERS[self.DATASET_NAME]
language = self.DATASET_NAME if self.DATASET_NAME != "js" else "javascript"
### CUSTOM MUTATE METHOD CHANGES ###
if self.prompt == "diff":
# Requires:
# !wget https://raw.githubusercontent.com/google/diff-match-patch/master/python3/diff_match_patch.py
from diff_match_patch import diff_match_patch
dmp = diff_match_patch()
ds = self.get_dataset().select(range(len(generations)))
for gen, doc in zip(generations, ds):
prompt_base = self.get_prompt_base(doc)
old_code = prompt_base + doc["buggy_solution"]
for i, diff in enumerate(gen):
try:
# Strip away anything to the left such as \n
patches = dmp.patch_fromText(diff.lstrip())
fixed_code, _ = dmp.patch_apply(patches, old_code)
except Exception as e:
print(f"Failed with {e} when applying patch to buggy code: {diff}")
fixed_code = ""
gen[i] = fixed_code
elif self.prompt == "diff-carper":
from bigcode_eval.tasks.custom_metrics.diff_eval import apply_diff
ds = self.get_dataset().select(range(len(generations)))
for gen, doc in zip(generations, ds):
prompt_base = self.get_prompt_base(doc)
old_code = prompt_base + doc["buggy_solution"]
for i, diff_hunk in enumerate(gen):
if not(diff_hunk):
gen[i] = ""
continue
res: str = apply_diff(old_code, diff_hunk)
gen[i] = res
### CUSTOM PROG LANGUAGE CHANGES ###
# Inspiration: https://github.com/THUDM/CodeGeeX/blob/ebeb850f227a90c79de39f7e26b1302f374f3240/codegeex/benchmark/evaluate_humaneval_x.py
if language == "python":
python_imports = "\n".join(IMPORT_HELPER["python"])
generations = [
[(python_imports + "\n" + g).strip() for g in gen] for gen in generations
]
elif language == "cpp":
cpp_imports = "\n".join(IMPORT_HELPER["cpp"])
# Remove main in case present
generations = [
[(cpp_imports + "\n" + g.split("int main")[0]).strip() for g in gen] for gen in generations
]
elif language == "java":
generations = [
[g.replace("public class Main {\n }", "").strip() for g in gen] for gen in generations
]
elif language == "go":
ds = self.get_dataset().select(range(len(generations)))
for gen, ref, doc in zip(generations, references, ds):
for line in doc["import"].split("\n"):
line = line.replace("import", "").replace("(", "").replace(")", "").replace('"', "").strip()
if line: assert line in IMPORT_HELPER["go"], doc["import"] # Will be added later
test_setup_str = doc["test_setup"] + "\n"
for i, g in enumerate(gen):
for line in test_setup_str.split("\n"):
line = line.replace("import", "").replace("(", "").replace(")", "").strip()
if line.startswith('"') and line in g:
test_setup_str = test_setup_str.replace(line, "")
g = test_setup_str + g + "\n" + ref
other_pkgs = set()
for pkg in IMPORT_HELPER["go"]:
if ('"' + pkg + '"' not in g):
p = pkg.split("/")[-1]
# Check if the package is used
if (p + "." in g):
# The problem is that it could appear in a comment
# E.g. in problem 158, the docstring is:
# // ... a list of strings.
# but the "strings" pkg is never used
# Golang throws an error if the pkg is not used
# Thus search for the package & make sure it's not in a commented line
lines = g.split("\n")
for line in lines:
if (p + "." in line) and not(line.strip().startswith("//")):
other_pkgs.add('"' + p + '"')
break
other_pkgs_str = ""
if other_pkgs:
other_pkgs_str = "import (\n" + "\n".join([" " + p for p in other_pkgs]) + "\n)\n"
if ("package main" in gen[i]) and ("package main" in test_setup_str):
gen[i] = gen[i].replace("package main", "")
gen[i] = test_setup_str + other_pkgs_str + gen[i]
elif language == "rust":
ds = self.get_dataset().select(range(len(generations)))
main = "fn main(){}\n"
for gen, doc in zip(generations, ds):
declaration = doc["declaration"]
for i, g in enumerate(gen):
new_gen = ""
if "fn main()" not in g:
new_gen += main
for line in declaration.split("\n"):
if line.strip() not in g:
# Skip if the function is already present
if line.strip().startswith("fn") and (line.strip().split("(")[0]) in g:
continue
new_gen += line.strip() + "\n"
# If fn main() is present twice, cut off before the second one
g = "fn main()".join(g.split("fn main()")[0:2])
new_gen += g
gen[i] = new_gen
### EVALUATION ###
results, logs = code_metric.compute(
references=references,
predictions=generations,
language=language,
timeout=timeout,
num_workers=num_workers,
)
# Write logs to json
with open("logs.json", "w") as f:
json.dump(logs, f, indent=4, ensure_ascii=False)
"""Debugging help
for i, (gen, ref) in enumerate(zip(generations, references)):
import time
starttime = time.time()
results, log = code_metric.compute(
references=[ref],
predictions=[gen],
language=language,
timeout=timeout,
)
print("Took: ", time.time() - starttime)
with open("errors.txt", "a") as f:
f.write(log[0][0][1]["result"] + "\n")
if ("compilation error" in log[0][0][1]["result"]):
print("Result")
print(results)
print("Log")
print(log)
print("Gen")
print(gen[0])
print("Ref")
print(ref)
"""
return results
class HumanEvalFixBase(HumanEvalPackGenerative):
def get_filename_with_extension(self, input_file):
"""Returns the synthetic filename for different datasets"""
file_name = input_file if input_file is not None else "solution"
return file_name + "." + LANGUAGE_TO_EXTENSION[self.DATASET_NAME]
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
prompt_base = self.get_prompt_base(doc)
instruction = f'Fix bugs in {doc["entry_point"]}.'
context = prompt_base + doc["buggy_solution"]
if self.with_docs is False: # Add tests as source of ground truth
context += "\n" + doc["test"]
if self.prompt == "file":
file_name = self.get_filename_with_extension(input_file=doc["entry_point"])
prompt = f"<file_name>\n{file_name}\n<commit_before>\n{context}\n<commit_msg>\n{instruction}<commit_after>\n{prompt_base}"
elif self.prompt == "starcodercommit":
prompt = f"<commit_before>{context}<commit_msg>{instruction}<commit_after>{prompt_base}"
elif self.prompt == "diff":
prompt = f"<commit_before>{context}<commit_msg>{instruction}<commit_after>"
elif self.prompt == "diff-carper":
prompt = f"<NME> {self.get_filename_with_extension(input_file=doc['entry_point'])}\n"
prompt += f"<BEF> {context}\n<MSG> {instruction}\n<DFF>"
elif self.prompt == "issue":
prompt = f"<issue_start>username_0: {instruction}\n\n```{context}```\nUpvotes: 100<issue_comment>username_1: Sure, here is the fixed code.\n\n```{prompt_base}"
else:
prompt = super().get_prompt(prompt_base, instruction, context)
return prompt.strip()
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for Humaneval-Task)
"""
doc = self.get_dataset()[idx]
prompt = self.get_prompt(doc)
if self.prompt == "diff-carper":
# Only remove final stopwords like <MSG>
generation = self.remove_last_block(generation[len(prompt):].rstrip())
generation = prompt + generation
from bigcode_eval.tasks.custom_metrics.diff_eval import split_diff
# From https://github.com/CarperAI/OpenELM/blob/e6402a0696096011572152334ccbe049f89c332e/src/openelm/benchmarks/benchmark_bugs.py#L93
end_of_diff = re.compile("\n[^ +-@]+")
parsed: dict = split_diff(generation)
if parsed and all(
(s in parsed for s in ["name", "file", "message", "diff"])
):
# truncate diff hunk at the first line not starting with " ", "+", "-", or "@"
diff_hunk: str = end_of_diff.split(parsed["diff"])[0]
# We apply diff patch loosely:
# 1. it ignores the line numbers;
# 2. it ignores invalid lines (not starting with " ",
# "+" or "-" and not being "@@ ... @@").
# https://github.com/CarperAI/OpenELM/blob/e6402a0696096011572152334ccbe049f89c332e/src/openelm/benchmarks/benchmark_bugs.py#L162
nme_idx: int = diff_hunk.find("<NME>")
if nme_idx != -1:
diff_hunk = diff_hunk[:nme_idx]
return diff_hunk
else:
gen = self.remove_last_block(generation[len(prompt):].rstrip())
if self.prompt.startswith("diff"):
return gen
else:
# Strip on the right to maintain same behavior as with get_prompt
prompt_base = self.get_prompt_base(doc)
return prompt_base.rstrip() + gen
class HumanEvalExplainDescribeBase(HumanEvalPack):
def get_prompt_encoder(self, doc):
"""Encoder input for models with Enc-Dec architecture like CodeT5"""
assert self.prompt == "instructcodet5p", "Enc-Dec is only tested for InstructCodeT5+"
prompt_base = self.get_prompt_base(doc)
instruction = f"Provide a concise natural language description of the code using at most {len(doc['docstring'])} characters."
context = prompt_base + doc["canonical_solution"]
return super().get_prompt("", instruction, context) # No prompt base as not generating
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
prompt_base = self.get_prompt_base(doc)
instruction = f"Provide a concise natural language description of the code using at most {len(doc['docstring'])} characters."
context = prompt_base + doc["canonical_solution"]
return super().get_prompt("", instruction, context)
def remove_last_block(self, text):
for w in self.stop_words:
if w in text:
text = text[:text.find(w)]
return text
def remove_code(self, text, canonical_solution):
for line in canonical_solution.split("\n"):
line = line.strip()
if len(line) > 20 and line in text:
text = text.replace(line, "")
return text
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for Humaneval-Task)
"""
doc = self.get_dataset()[idx]
prompt = self.get_prompt(doc)
docstring_len = len(doc["docstring"])
gen = self.remove_last_block(generation[len(prompt):].strip()[:docstring_len]).rstrip()
gen = self.remove_code(gen, doc["canonical_solution"])
return gen
def get_reference(self, doc, get_solution=False):
return None
def process_results(self, generations, references):
raise ValueError("""ExplainDescribe should be run with `--generation_only`.
Once generations are done run ExplainSynthesize with `--load_data_path path/to/generations.json`
It will load the explanations, generate from them and evaluate.""")
class HumanEvalExplainSynthesizeBase(HumanEvalPackGenerative):
def __init__(self, load_data_path=None, **kwargs):
assert load_data_path is not None, "load_data_path must be specified to load the descriptions."
with open(load_data_path) as fp:
self.descriptions = json.load(fp)
print(f"{len(self.descriptions)} descriptions with {len(self.descriptions[0])} description candidates loaded.")
super().__init__(**kwargs)
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
dataset = []
for description, sample in zip(self.descriptions, self.dataset["test"]):
for description_candidate in description:
dataset.append({"description": description_candidate} | sample)
return dataset
def get_prompt_encoder(self, doc):
"""Encoder input for models with Enc-Dec architecture like CodeT5"""
assert self.prompt == "instructcodet5p", "Enc-Dec is only tested for InstructCodeT5+"
prompt_base = "" # No prompt base as not generating
instruction = f"Write functional code in {LANGUAGE_TO_NAME[self.DATASET_NAME]} according to the description."
context = doc["description"]
return super().get_prompt(prompt_base, instruction, context)
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
prompt_base = self.get_prompt_base(doc)
instruction = f"Write functional code in {LANGUAGE_TO_NAME[self.DATASET_NAME]} according to the description."
context = doc["description"]
return super().get_prompt(prompt_base, instruction, context)
class HumanEvalSynthesizeBase(HumanEvalPackGenerative):
def get_prompt_encoder(self, doc):
"""Encoder input for models with Enc-Dec architecture like CodeT5"""
assert self.prompt == "instructcodet5p", "Enc-Dec is only tested for InstructCodeT5+"
prompt_base = "" # No prompt base as not generating
instruction = doc["instruction"].strip()
return super().get_prompt(prompt_base, instruction)
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
prompt_base = self.get_prompt_base(doc)
instruction = doc["instruction"].strip()
return super().get_prompt(prompt_base, instruction)
"""Testing
from datasets import load_dataset
ds = load_dataset("bigcode/humaneval-x-bugs", "python")["test"]
idx = 0
def get_prompt_base(doc, language="python"):
# See
# https://github.com/roG0d/CodeGeeX/blob/f66205b5f615a4eead9c26d7ec297e14738ea18d/codegeex/benchmark/evaluate_humaneval_x.py#L78
# https://github.com/THUDM/CodeGeeX/pull/76#issuecomment-1500653190
if language == "rust":
main = "fn main(){}\n"
prompt_base = main + doc["declaration"] + doc["prompt"]
else:
prompt_base = doc["prompt"]
return prompt_base
prompt_base = get_prompt_base(ds[idx], language="python")
messages = [
{
"role": "user",
"content": ds[idx]["instruction"],
},
{
"role": "assistant",
"content": prompt_base,
},
]
# model: gpt-4-0613
response = openai.ChatCompletion.create(
model="gpt-4-0613",
messages=messages
)
"""
import os
import openai
import jsonlines
import termcolor
from cdifflib import CSequenceMatcher
from camel_converter import to_snake
from datasets import load_dataset
from typing import List
from tqdm import tqdm
_CITATION = """
@article{muennighoff2023octopack,
title={OctoPack: Instruction Tuning Code Large Language Models},
author={Niklas Muennighoff and Qian Liu and Armel Zebaze and Qinkai Zheng and Binyuan Hui and Terry Yue Zhuo and Swayam Singh and Xiangru Tang and Leandro von Werra and Shayne Longpre},
journal={arXiv preprint arXiv:2308.07124},
year={2023}
}
"""
LANGUAGE_TO_NAME = {
"python": "Python",
"cpp": "C++",
"js": "JavaScript",
"java": "Java",
"go": "Go",
"rust": "Rust",
}
def get_prompt_base(doc, language):
# See
# https://github.com/roG0d/CodeGeeX/blob/f66205b5f615a4eead9c26d7ec297e14738ea18d/codegeex/benchmark/evaluate_humaneval_x.py#L78
# https://github.com/THUDM/CodeGeeX/pull/76#issuecomment-1500653190
if language == "rust":
main = "fn main(){}\n"
prompt_base = main + doc["declaration"]
else:
prompt_base = doc["declaration"]
return prompt_base
def get_prompt_synthesize(doc, language="python"):
# addon = f"Start your code with:\n{get_prompt_base(sample, language)}"
# return doc["instruction"] + "\n" + addon # Results in worse performance for GPT4
return doc["instruction"] # Problem: Difficult for problems that have helper functions
def get_base_prompt_fix(doc, language="python", mode="tests"):
if language == "rust":
if mode == "tests":
return "fn main(){}\n" + doc["declaration"]
elif mode == "docs":
return "fn main(){}\n" + doc["declaration"] + doc["prompt"]
else:
raise ValueError
else:
if mode == "tests":
return doc["declaration"]
elif mode == "docs":
return doc["prompt"]
else:
raise ValueError
def get_prompt_fix(doc, language="python", mode="tests"):
prompt_base = get_base_prompt_fix(doc, language, mode)
func = prompt_base + doc["buggy_solution"]
instruction = f'Fix bugs in {doc["entry_point"]}.'
return func + "\n" + instruction
def get_prompt_explain_desc(doc, language="python"):
if language == "rust":
main = "fn main(){}\n"
prompt_base = main + doc["declaration"]
else:
prompt_base = doc["declaration"]
docstring_len = len(doc["docstring"])
instruction = f"Provide a concise natural language description of the code using at most {docstring_len} characters."
func = prompt_base + doc["canonical_solution"]
return instruction + "\n" + func, docstring_len
def get_prompt_explain_syn(sample, desc, language="python"):
instruction = f"Write functional code in {LANGUAGE_TO_NAME[language]} according to the description."
addon = f"Start your code with:\n{get_prompt_base(sample, language)}"
return desc + "\n" + instruction + "\n" + addon
class ParseError(Exception):
pass
class ContentParser:
@staticmethod
def _entry_point_variations(entry_point: str) -> List[str]:
# NOTE: workaround for the dataset's bug with entry point naming
return [
entry_point,
to_snake(entry_point),
entry_point[0].lower() + entry_point[1:],
]
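# Hedged example (hypothetical entry point): for "rollingMax" this returns
# ["rollingMax", "rolling_max", "rollingMax"], covering the snake_case and lowerCamelCase
# spellings the model might use instead of the dataset's name.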
def __call__(self, prompt: str, content: str, entry_point: str):
# NOTE: the model doesn't follow instructions directly:
# it adds a description of the change and sometimes fixes
# typos or other "bugs" in the description.
if "```" in content:
content = content.split("```")[1]
# first parse with assumption that content has description
matcher = CSequenceMatcher(None, prompt, content)
tag, _, _, j1, j2 = matcher.get_opcodes()[-1]
if tag == "insert":
return content[j1:j2]
# second parse content with assumption that model wrote code without description
for entry_point in self._entry_point_variations(entry_point):
if entry_point in content:
content = content.split(entry_point)[-1]
return "".join(content.splitlines(keepends=True)[1:])
raise ParseError(f"Prompt is not in content:\n{content}")
class ChatWrapper:
def __init__(self, model: str):
self._model = model
def __call__(self, prompt: str, n: int) -> str:
messages = [
{
"role": "user",
"content": prompt,
}
]
while True:
try:
response = openai.ChatCompletion.create(
model=self._model,
messages=messages,
temperature=0.2,
top_p=0.95,
n=n
)
content_list = list()
for i in range(n):
message = response["choices"][i]["message"]
assert message["role"] == "assistant"
content_list.append(message["content"])
return content_list
except Exception as e:
print("API EXCEPTION:", e)
if __name__ == '__main__':
TIMES = 1
VERBOSE = True
LANGUAGE = "python"
MODEL = "gpt-4-0613"
TASK = "humanevalsynthesize"
# Load descriptions
if TASK == "humanevalexplainsynthesize":
with jsonlines.open(f"completions_{LANGUAGE}_humanevalexplaindescribe.jsonl", "r") as f:
descriptions = [line["raw_generation"][0] for line in f]
openai.organization = os.getenv("OPENAI_ORGANIZATION")
openai.api_key = os.getenv("OPENAI_API_KEY")
samples = [s for s in load_dataset("bigcode/humanevalpack", LANGUAGE)["test"]]
chat_wrapper = ChatWrapper(MODEL)
parse_errors = 0
parser = ContentParser()
for idx, sample in enumerate(tqdm(samples)):
if TASK == "humanevalfix":
prompt = get_prompt_fix(sample, language=LANGUAGE, mode="tests")
elif TASK == "humanevalsynthesize":
prompt = get_prompt_synthesize(sample, language=LANGUAGE)
elif TASK == "humanevalexplaindescribe":
prompt, docstring_len = get_prompt_explain_desc(sample, language=LANGUAGE)
gen = chat_wrapper(prompt, TIMES)
sample["raw_generation"] = gen
sample["generation"] = [gen_item[:docstring_len] for gen_item in gen]
continue
elif TASK == "humanevalexplainsynthesize":
desc = descriptions[idx]
prompt = get_prompt_explain_syn(sample, desc, language=LANGUAGE)
if VERBOSE:
print(f"Processing {sample['task_id']} ({idx + 1}/{len(samples)}))...")
sample["raw_generation"] = chat_wrapper(prompt, TIMES)
try:
sample["generation"] = [parser(prompt, generation_item, sample["entry_point"]) for generation_item in sample["raw_generation"]]
except ParseError as e:
parse_errors += 1
print("PARSE EXCEPTION:", e)
sample["generation"] = [""]
if VERBOSE:
for i in range(TIMES):
print(termcolor.colored(sample["entry_point"], "yellow", attrs=["bold"]))
print(termcolor.colored(prompt, "yellow"))
print(termcolor.colored(sample["canonical_solution"], "red"))
print(termcolor.colored(sample["generation"][i], "green")+"\n\n")
if VERBOSE:
print("parse error rate:", parse_errors / len(samples))
results_filename = f"completions_{LANGUAGE}_{TASK}.jsonl"
with jsonlines.open(results_filename, "w") as writer:
writer.write_all(samples)
"""Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation
https://openreview.net/forum?id=1qvx610Cu7
The HumanEval+ dataset is created by the EvalPlus framework which extends the original HumanEval dataset
by adding more automatically generated test cases to each problem.
Homepage: https://github.com/evalplus/evalplus
"""
from warnings import warn
from bigcode_eval.tasks.humaneval import GeneralHumanEval
_CITATION = """
@inproceedings{evalplus,
title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
year = {2023},
url = {https://openreview.net/forum?id=1qvx610Cu7},
}
"""
class GeneralHumanEvalPlus(GeneralHumanEval):
"""A task represents an entire benchmark including its dataset, problems,
answers, generation settings and evaluation methods.
"""
DATASET_PATH = "evalplus/humanevalplus"
def __init__(self, strip_prompt, k=[1, 10, 100], num_workers=16, timeout=10.0):
if timeout < 10.0:
warn(
"It is suggested to have a longer timeout as HumanEval+ has lots of tests. "
f"The current timeout is {timeout}s while the suggested timeout is 10s."
)
super().__init__(strip_prompt, k, num_workers, timeout)
def create_task(strip_prompt):
class HumanEvalPlus(GeneralHumanEvalPlus):
def __init__(self, **kwargs):
super().__init__(strip_prompt, **kwargs)
return HumanEvalPlus
def create_all_tasks():
"""Creates a dictionary of tasks from a list of levels
:return: {task_name: task}
e.g. {multiple-py: Task, multiple-java: Task}
"""
return {
"humanevalplus": create_task(True),
"humanevalplus-unstripped": create_task(False),
}
"""Evaluating Large Language Models Trained on Code
https://arxiv.org/abs/2107.03374
The HumanEval dataset released by OpenAI includes 164 programming problems with a function signature,
docstring, body, and several unit tests.
They were handwritten to ensure they are not included in the training set of code generation models.
Homepage: https://github.com/openai/human-eval
"""
from bigcode_eval.base import Task
from bigcode_eval.utils import remove_after_return
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval
_CITATION = ""
def create_all_tasks():
"""Creates a dictionary of tasks corresponding for the 2 settings currently available
- instruction with code completion: we provide function signature/imports.. to the model after the instruction
- instruction to code generation: we only give the instruction without the function signature/imports..
"""
return {
"instruct-humaneval": InstructHumanEvalWithContext,
"instruct-humaneval-nocontext": InstructHumanEvalWithoutContext,
}
class InstructHumanEval(Task):
"""A task represents an entire benchmark including its dataset, problems,
answers, generation settings and evaluation methods.
"""
DATASET_PATH = "codeparrot/instructhumaneval"
DATASET_NAME = None
def __init__(self):
super().__init__(
stop_words=["if __name__", "\nprint", "\nclass"],
requires_execution=True,
)
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
return self.dataset["test"]
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
pass
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
test_func = doc["test"]
entry_point = f"check({doc['entry_point']})"
return "\n" + test_func + "\n" + entry_point
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
list of str containing references
"""
results, _ = compute_code_eval(
references=references,
predictions=generations,
)
return results
class InstructHumanEvalWithContext(InstructHumanEval):
def __init__(self):
super().__init__()
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
return {"instruction": doc["instruction"], "context": doc["context"]}
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for Humaneval-Task)
"""
generation = self._stop_at_stop_token(generation, self.stop_words)
function_name = self.get_dataset()["entry_point"][idx]
func_index = generation.find(f"def {function_name}")
return generation[0:func_index] + remove_after_return(generation[func_index:])
class InstructHumanEvalWithoutContext(InstructHumanEval):
def __init__(self):
super().__init__()
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
return {"instruction": doc["instruction"], "context": ""}
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for Humaneval-Task)
"""
example = self.get_dataset()[idx]
prompt, function_name = example["context"], example["entry_point"]
prefix = prompt[0 : prompt.find(f"def {function_name}")]
sep_index = generation.find("```")
        if sep_index != -1:
            if (
                generation[sep_index + len("```") : sep_index + len("```python")]
                == "python"
            ):
                generation = generation[sep_index + len("```python") :]
            else:
                generation = generation[sep_index + len("```") :]
generation = self._stop_at_stop_token(generation, self.stop_words)
func_index = generation.find(f"def {function_name}")
if func_index == -1:
func_index = 0
return_index = generation[func_index:].rfind(" return ")
if return_index == -1:
return_index = 0
j = func_index + return_index
n = len(generation)
while j < n and generation[j] != "\n":
j += 1
sep_index_2 = generation.find("```")
if sep_index_2 == -1:
return prefix.strip() + "\n" + generation[func_index:j]
else:
return prefix.strip() + "\n" + generation[func_index : min(j, sep_index_2)]
"""Instruction version of HumanEval used for WizardCoder Models evaluation
Evaluating Large Language Models Trained on Code
https://arxiv.org/abs/2107.03374
The HumanEval dataset released by OpenAI includes 164 programming problems with a function signature,
docstring, body, and several unit tests.
They were handwritten to ensure they are not included in the training set of code generation models.
Homepage: https://github.com/openai/human-eval
"""
from bigcode_eval.base import Task
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval
_CITATION = """
@misc{chen2021evaluating,
title={Evaluating Large Language Models Trained on Code},
author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
year={2021},
eprint={2107.03374},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
"""
def generate_prompt(input):
INSTRUCTION = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Create a Python script for this problem:
{input}
### Response:"""
return INSTRUCTION
class HumanEvalWizardCoder(Task):
"""A task represents an entire benchmark including its dataset, problems,
answers, generation settings and evaluation methods.
"""
DATASET_PATH = "openai_humaneval"
def __init__(self):
super().__init__(
stop_words=[],
requires_execution=True,
)
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
return self.dataset["test"]
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
prompt = doc["prompt"].replace(" ", "\t")
prompt = generate_prompt(prompt)
return prompt
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
test_func = doc["test"]
entry_point = f"check({doc['entry_point']})"
return "\n" + test_func + "\n" + entry_point
@staticmethod
def clean_comp(completion):
# adapted from https://github.com/nlpxucan/WizardLM/blob/main/WizardCoder/src/process_humaneval.py
if "```python" in completion:
def_line = completion.index("```python")
completion = completion[def_line:].strip()
completion = completion.replace("```python", "")
try:
next_line = completion.index("```")
completion = completion[:next_line].strip()
            except ValueError:
                # no closing ``` fence was generated; keep the rest of the completion
                pass
if '__name__ == "__main__"' in completion:
next_line = completion.index('if __name__ == "__main__":')
completion = completion[:next_line].strip()
if "# Example usage" in completion:
next_line = completion.index("# Example usage")
completion = completion[:next_line].strip()
if completion.startswith("Here's"):
completion = completion.split("\n")[1:]
completion = "\n".join(completion)
result = completion
return result
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for Humaneval-Task)
"""
generation = generation.split("### Response:")[-1]
generation = generation.replace("\t", " ")
generation = generation.split("</s>")[0]
generation = self.clean_comp(generation)
return generation
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
            list of str containing references
"""
results, _ = compute_code_eval(
references=references,
predictions=generations,
)
return results
"""Program Synthesis with Large Language Models
https://arxiv.org/abs/2108.07732
The benchmark consists of around 1,000 crowd-sourced Python programming problems,
designed to be solvable by entry level programmers, covering programming fundamentals,
standard library functionality, and so on. Each problem consists of a task description,
code solution and 3 automated test cases. As described in the paper, a subset of the data
has been hand-verified by the authors.
Homepage: https://github.com/google-research/google-research/tree/master/mbpp
"""
from bigcode_eval.base import Task
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval
_CITATION = """
@article{austin2021program,
title={Program Synthesis with Large Language Models},
author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},
journal={arXiv preprint arXiv:2108.07732},
year={2021}
}
"""
class MBPP(Task):
"""A task represents an entire benchmark including its dataset, problems,
answers, generation settings and evaluation methods.
"""
DATASET_PATH = "mbpp"
def __init__(self):
super().__init__(
stop_words=["\nclass", "\nassert", '\n"""', "\nprint", "\nif", "\n<|/", "\n```"],
requires_execution=True,
)
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
dataset = self.dataset["test"]
# the wrong split of mbpp can be loaded with old datasets cache
assert (
len(dataset) == 500
), "please ensure you have the latest version of MBPP dataset, try deleting its old cache"
return dataset
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from.
        MBPP prompt is built following the InCoder (Fried et al.) approach
prompt = docstring that includes one test
"""
description = doc["text"]
test_example = doc["test_list"][0]
prompt = f'"""\n{description}\n{test_example}\n"""\n'
return prompt
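    # Illustrative example (hypothetical doc, not taken from MBPP): for
    #   text = "Write a function to add two numbers."
    #   test_list[0] = "assert add(1, 2) == 3"
    # the prompt built above is:
    #   '"""\nWrite a function to add two numbers.\nassert add(1, 2) == 3\n"""\n'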
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
return "\n".join(doc["test_list"])
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
"""
prompt = self.get_prompt(self.dataset["test"][idx])
generation = generation[len(prompt) :]
return prompt + self._stop_at_stop_token(generation, self.stop_words)
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
            list of str containing references
"""
results, _ = compute_code_eval(
references=references,
predictions=generations,
)
return results
"""Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation
https://openreview.net/forum?id=1qvx610Cu7
The MBPP+ dataset is created by the EvalPlus framework which extends the original MBPP dataset
by adding more automatically generated test cases to each problem. Note MBPP+ only includes 399
tasks which are a subset of the original MBPP dataset. The subset is selected from the sanitized
MBPP (a subset of manually examined tasks by the original MBPP authors) and EvalPlus further
removes low-quality and ill-formed tasks for benchmark quality control.
Homepage: https://github.com/evalplus/evalplus
"""
import os
from bigcode_eval.tasks.mbpp import MBPP
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval
_CITATION = """
@inproceedings{evalplus,
title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
year = {2023},
url = {https://openreview.net/forum?id=1qvx610Cu7},
}
"""
class MBPPPlus(MBPP):
"""A task represents an entire benchmark including its dataset, problems,
answers, generation settings and evaluation methods.
"""
DATASET_PATH = "evalplus/mbppplus"
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from.
        MBPP prompt is built following the InCoder (Fried et al.) approach
prompt = docstring that includes one test
"""
description = doc["prompt"] # sanitized testset use "prompt" instead of "text"
test_example = doc["test_list"][0]
prompt = f'"""\n{description}\n{test_example}\n"""\n'
return prompt
# NOTE(@ganler): MBPP+ extends the original MBPP jsonl data with a "test" field which
# includes the testing code ready for execution. Note the "test" field
# is different from HumanEval(+) which further requires a `check` func
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
use_mbpp_tests = os.getenv("MBBPPLUS_USE_MBPP_TESTS", "0")
if use_mbpp_tests == "1":
return "\n".join(doc["test_list"])
return "\n" + doc["test"]
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
dataset = self.dataset["test"]
assert (
len(dataset) == 399
), "MBPP+ only has 399 problems. Please retry by deleting its old cache"
return dataset
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
            list of str containing references
"""
results, _ = compute_code_eval(
references=references,
predictions=generations,
timeout=10.0, # 10s timeout
)
return results
"""MultiPL-E: A Scalable and Extensible Approach to Benchmarking Neural Code Generation
https://arxiv.org/abs/2208.08227
MultiPL-E is a dataset for evaluating large language models for code generation that supports 18 programming languages.
It takes the OpenAI "HumanEval" and the MBPP Python benchmarks and uses little compilers to translate them to other languages.
Homepage: https://nuprl.github.io/MultiPL-E/
"""
import json
import os
import re
import tempfile
from multiprocessing import cpu_count
from pathlib import Path
from time import time
import numpy as np
from datasets import load_dataset
from tqdm import tqdm
from bigcode_eval.base import Task
from bigcode_eval.tasks.custom_metrics.multiple_metrics.evaluation import \
evaluate_problem
from bigcode_eval.tasks.custom_metrics.multiple_metrics.single_experiment_pass_k import \
for_file
_CITATION = """
@article{cassano2022scalable,
title={A Scalable and Extensible Approach to Benchmarking NL2Code for 18 Programming Languages},
author={Cassano, Federico and Gouwar, John and Nguyen, Daniel and Nguyen, Sydney and Phipps-Costin, Luna and Pinckney, Donald and Yee, Ming Ho and Zi, Yangtian and Anderson, Carolyn Jane and Feldman, Molly Q and others},
journal={arXiv preprint arXiv:2208.08227},
year={2022}
}
"""
LANGUAGES = [
"py",
"sh",
"cpp",
"cs",
"d",
"go",
"java",
"js",
"jl",
"lua",
"pl",
"php",
"r",
"rkt",
"rb",
"rs",
"scala",
"swift",
"ts",
]
def create_all_tasks():
"""Creates a dictionary of tasks from a list of levels
:return: {task_name: task}
e.g. {multiple-py: Task, multiple-java: Task}
"""
return {f"multiple-{language}": create_task(language) for language in LANGUAGES}
def create_task(language):
class MultiPLE(GeneralMultiPLE):
def __init__(self):
super().__init__(language)
return MultiPLE
class GeneralMultiPLE(Task):
"""A task represents an entire benchmark including its dataset, problems,
answers, generation settings and evaluation methods.
"""
DATASET_PATH = "nuprl/MultiPL-E"
DATASET_NAME = None
DATASET_REVISION = "d23b094346c5dbda1080a74bb2a24c18adbf7409"
def __init__(self, language):
self.language = language
self.DATASET_NAME = f"humaneval-{language}"
# we need the dataset to get stop words for each language
self.dataset = load_dataset(
GeneralMultiPLE.DATASET_PATH,
self.DATASET_NAME,
revision=self.DATASET_REVISION)
stop_words = self.dataset["test"][0]["stop_tokens"] + ["<file_sep>"]
super().__init__(
stop_words=stop_words,
requires_execution=True,
)
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
return self.dataset["test"]
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
return doc["prompt"].strip()
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
return doc["tests"]
@staticmethod
def remove_last_block(string, stop_words):
# Remove the last block of the code containing stop_words for HumanEval
string_list = re.split("(%s)" % "|".join(stop_words), string)
# last string should be ""
return "".join(string_list[:-2])
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for this task)
"""
prompt = self.get_prompt(self.get_dataset()[idx])
completion = generation[len(prompt) :]
return prompt + self._stop_at_stop_token(completion, self.stop_words)
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
            list of str containing references
"""
# get prompts and problem names
prompts_names = [
{"prompt": doc["prompt"], "name": doc["name"]}
for i, doc in enumerate(self.get_dataset())
if i < len(generations)
]
# a common temp dir for all the problems
temp_dir = tempfile.gettempdir()
list_files = []
for (prompt_name, generation, reference) in zip(
prompts_names, generations, references
):
problem = {
"name": prompt_name["name"],
"language": self.language,
"prompt": prompt_name["prompt"],
"completions": generation,
"tests": reference,
}
            # each problem is saved in a json file
temp_file_name = os.path.join(temp_dir, f"{prompt_name['name']}.json")
list_files.append(temp_file_name)
with open(temp_file_name, "wt") as f:
json.dump(problem, f)
print(
f"Saved {len(list_files)} problems in {temp_dir} for evaluation, each problem has {len(generations[0])} completions"
)
# execute the problems to evaluate them
max_workers = cpu_count() - 1 if cpu_count() > 1 else 1
for file in tqdm(list_files):
evaluate_problem(temp_dir, file, max_workers)
# compute pass@k scores
result_array = np.array(
[for_file(p) for p in Path(temp_dir).glob("*.results.json")]
)
result = result_array.mean(axis=0)
name = (
temp_dir.split("/")[-1]
if temp_dir.split("/")[-1] != ""
else temp_dir.split("/")[-2]
)
results = {
f"pass@{k}": v
for k, v in zip([1, 10, 100], result)
if k <= len(generations[0])
}
return results
"""Parity bug fixing task."""
import itertools
import re
from bigcode_eval.base import Task
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval
import tqdm
def mutate_code(
n_bugs: int = 5, task: str = "parity", prompt="prompt"
):
"""
Modified from https://github.com/CarperAI/OpenELM/blob/e6402a0696096011572152334ccbe049f89c332e/src/openelm/utils/code_eval.py
Mutate code to create n bugs. Output the prompt in diff format.
Args:
n_bugs: number of bugs to introduce (from 1 to 5).
task: (Optional) the task to be performed.
prompt: (Optional) 'diff', 'prompt' or 'edit'.
Returns:
template for code mutation
"""
mutation_templates = {
"diff": [
f"<NME> {task}.py\n<BEF> ",
"", # placeholder for the context, e.g., the buggy code
"\n<MSG> Fixed bugs",
],
"prompt_carper": [
"# A buggy implementation\n#!/usr/bin/python3\n",
"", # placeholder for the context, e.g., the buggy code
"\n# Fixed bugs\ndef",
],
"prompt": [
"#!/usr/bin/python3\n# A buggy implementation\n", # Fixed order
"", # placeholder for the context, e.g., the buggy code
"\n# Fixed bugs\ndef", # Past tense is key
],
"edit": [
"<commit_before>",
"", # placeholder for the context, e.g., the buggy code
"<commit_msg>Fix bugs<commit_after>",
],
}
mutation_template = mutation_templates[prompt]
if task == "parity":
variables = ["b", "b", "b", "b", 2]
for i in range(n_bugs):
variables[i] = "c" if i < 4 else 3
func_str = (
'def parity(b1,b2,b3,b4):\n """Return binary parity of a sequence of input bits.'
' Return 0 for even parity, 1 for odd parity."""\n bit_sum = sum(['
"{}1,{}2,{}3,{}4])\n return bit_sum % {}".format(*variables)
)
mutation_template[1] = func_str
return "".join(mutation_template)
else:
raise ValueError(f"Unknown task: {task}")
# https://github.com/CarperAI/OpenELM/blob/e6402a0696096011572152334ccbe049f89c332e/src/openelm/utils/code_eval.py#L131
def parity_reference(b1, b2, b3, b4):
"""
Return binary parity of a sequence of input bits.
Return 0 for even parity, 1 for odd parity.
"""
bit_sum = sum([b1, b2, b3, b4])
return bit_sum % 2
class Parity(Task):
def __init__(self, prompt="prompt"):
super().__init__(
stop_words=[
"\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif",
# Special cases for edit
"<commit_before>", "<commit_msg>", "<commit_after>", "<|endoftext|>",
],
requires_execution=True,
)
self.prompt = prompt
self.parity_tests = "assert " + " and ".join([
f"({parity_reference(*i)} == parity{i})" for i in itertools.product(range(2), repeat=4)
])
# Allow max 3 times the length of the prompt to
# allow the model to e.g. add some comments
self.max_length_multiplier = 3
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
return [1, 2, 3, 4, 5]
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
return mutate_code(n_bugs=doc, task="parity", prompt=self.prompt)
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
return []
@staticmethod
def first_block(string, stop_words):
"""Split off first block of code by scanning for class, def etc. on newlines."""
stop_words = [re.escape(word) for word in stop_words] # Escape e.g. | in <|endoftext|>
return re.split("|".join(stop_words), string)[0].rstrip()
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
"""
doc = self.get_dataset()[idx]
prompt = self.get_prompt(doc)
output = generation[len(prompt):]
if self.prompt.startswith("prompt"):
output = "def" + output # Add def which is in the prompt back to the output
return self.first_block(output, self.stop_words)
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
            list of str containing references
"""
out = {}
# Compute metrics for each number of bugs
for idx, gens in tqdm.tqdm(enumerate(generations), total=len(generations)):
results, _ = compute_code_eval(
references=[self.parity_tests for _ in gens],
predictions=[[g] for g in gens],
)
out[f"{idx+1} bugs"] = results
return out
"""Python Bugs
https://proceedings.mlr.press/v162/he22a.html
This dataset is taken from the preprocessing done by CarperAI (https://carper.ai/diff-models-a-new-way-to-edit-code).
It is uploaded here: https://huggingface.co/datasets/Muennighoff/python-bugs
Make sure to run with sufficient context length (512 is not enough for e.g. CodeGen).
"""
import re
from evaluate import load
from bigcode_eval.base import Task
import tqdm
_CITATION = """
@inproceedings{he2022distribution,
title={On distribution shift in learning-based bug detectors},
author={He, Jingxuan and Beurer-Kellner, Luca and Vechev, Martin},
booktitle={International Conference on Machine Learning},
pages={8559--8580},
year={2022},
organization={PMLR}
}
"""
MUTATE_TO_TASK_TO_PROMPT = {
"prompt_carper": {
"bin-op": "# Fixed binary operator",
"var-misuse": "# Fixed incorrect variable name",
},
"prompt_present": {
"bin-op": "# Fix binary operator",
"var-misuse": "# Fix incorrect variable name",
},
# Same as prompt_carper, but other parts are still different
"prompt": {
"bin-op": "# Fixed binary operator",
"var-misuse": "# Fixed incorrect variable name",
},
"edit": {
"bin-op": "Fix binary operator",
"var-misuse": "Fix incorrect variable name",
},
}
def mutate_code(input_code, task, prompt="prompt"):
"""
Create template for code mutation.
Args:
input_code: code to be mutated
task: task to be performed
prompt: (Optional) 'edit' or 'prompt'
Returns:
template for code mutation
"""
instruction = MUTATE_TO_TASK_TO_PROMPT[prompt][task]
if prompt == "prompt_carper":
return f"# A buggy implementation\n#!/usr/bin/python3\n{input_code}\n{instruction}\ndef"
if prompt == "prompt":
return f"#!/usr/bin/python3\n# A buggy implementation\n{input_code}\n{instruction}\ndef"
if prompt == "edit":
return f"<commit_before>{input_code}<commit_msg>{instruction}<commit_after>"
else:
raise ValueError(f"Unknown prompt: {prompt}")
class PythonBugs(Task):
DATASET_PATH = "Muennighoff/python-bugs"
def __init__(self, prompt="prompt"):
super().__init__(
# Correct code always starts with `def ...` and is a single function, so stop everything else
# Since a function always has a tab, stop when the first line does not have a tab
stop_words=[
"\nclass", "\n#", "\ndef", "\nassert", '\n"', "\nprint", "\nif",
# Special cases for edit
"<commit_before>", "<commit_msg>", "<commit_after>", "<|endoftext|>",
],
requires_execution=True,
)
self.max_length_multiplier = 2.25 # Allow 2.25 times the length of the prompt
self.prompt = prompt
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
dataset = self.dataset["train"]
return dataset
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
return mutate_code(doc["prompt_code"], doc["task"], self.prompt)
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
return doc["correct_code"]
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
"""
doc = self.get_dataset()[idx]
prompt = self.get_prompt(doc)
correct_code = self.get_reference(doc)
output = generation[len(prompt):]
if self.prompt.startswith("prompt"):
output = "def" + output # Add def which is in the prompt back to the output
return output[:len(correct_code)]
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
            list of str containing references
"""
num_correct = 0
print("Scoring generations...")
for i, ref in tqdm.tqdm(enumerate(references), total=len(references)):
for gen in generations[i]:
num_correct += int(gen == ref)
accuracy = num_correct / len(references) / len(generations[0])
return {"mean exact match": accuracy}
"""QuixBugs"""
import re
from bigcode_eval.base import Task
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval
_CITATION = """
@inproceedings{lin2017quixbugs,
title={QuixBugs: A multi-lingual program repair benchmark set based on the Quixey Challenge},
author={Lin, Derrick and Koppel, James and Chen, Angela and Solar-Lezama, Armando},
booktitle={Proceedings Companion of the 2017 ACM SIGPLAN international conference on systems, programming, languages, and applications: software for humanity},
pages={55--56},
year={2017}
}
"""
class QuixBugs(Task):
DATASET_PATH = "Muennighoff/quixbugs"
def __init__(self, prompt="prompt"):
self.prompt = prompt
if self.prompt == "edit":
self.stop_words = [
"<commit_before>",
"<commit_msg>",
"<commit_after>",
"<|endoftext|>",
]
elif self.prompt.startswith("prompt"):
self.stop_words = [
"\ndef",
"\nclass",
"\n#",
"\n@",
"\nprint",
"\nif",
"###",
"///",
"<|endoftext|>",
]
elif self.prompt.startswith("prompt_codex"):
# https://arxiv.org/pdf/2111.03922.pdf
self.stop_words = [
"\nclass", "###", "///", "<|endoftext|>",
]
else:
raise ValueError(f"Unknown prompt: {self.prompt}")
super().__init__(
stop_words=self.stop_words,
requires_execution=True,
)
self.max_length_multiplier = 3 # Allow 3 times the length of the prompt
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
return self.dataset["train"]
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
if self.prompt == "edit":
prompt = "<commit_before>" + doc["buggy_program"]
prompt += "<commit_msg>" + "Fix bug in " + doc["name"]
prompt += "<commit_after>"
elif self.prompt == "edit-openai":
return doc["buggy_program"], "Fix bug in " + doc["name"]
elif self.prompt == "prompt":
prompt = "# Buggy function"
prompt += "\n" + doc["buggy_program"] + "\n"
prompt += "# Fixed function\ndef"
elif self.prompt == "prompt_codex":
# https://arxiv.org/pdf/2111.03922.pdf, Prenner et al.
prompt = "### fix the bug in the following function"
prompt += "\n" + doc["buggy_program"] + "\n"
prompt += "### fixed function"
else:
raise ValueError(f"Unknown prompt: {prompt}")
return prompt.strip()
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
return (doc["name"], doc["tests"].strip())
@staticmethod
def remove_last_block(string, stop_words):
stop_words = [re.escape(word) for word in stop_words] # Escape e.g. | in <|endoftext|>
# Remove the last block of the code containing stop_words for HumanEval
string_list = re.split("(%s)" % "|".join(stop_words), string)
# last string should be ""
return "".join(string_list[:-2])
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
(not used for Humaneval-Task)
"""
doc = self.get_dataset()[idx]
prompt = self.get_prompt(doc)
generation = generation[len(prompt):]
if self.prompt == "prompt":
generation = "def" + generation # Add def which is in the prompt back to the output
return self.remove_last_block(generation, self.stop_words)
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
            list of str containing references
"""
results = {}
for i, (gen, (name, ref)) in enumerate(zip(generations, references)):
sub_results, _ = compute_code_eval(
references=[ref],
predictions=[gen],
timeout=10, # Levenshtein distance is slow
)
results[name] = sub_results
# Provide average of all metrics computed
if results:
results["all"] = {
k: sum(v[k] for v in results.values()) / len(results) for k in results[list(results.keys())[0]]
}
results["num_correct"] = results["all"]["pass@1"] * (len(results) - 1) # -1 for the all metric
return results
"""
ReCode: Robustness Evaluation of Code Generation Models
https://arxiv.org/abs/2212.10264
ReCode is a benchmark evaluating the robustness of code generation models to code and natural language perturbations.
This task allows running the released perturbed HumanEval benchmark and computing the robust-pass-at-k metric.
"""
from collections import defaultdict
from bigcode_eval.base import Task
from bigcode_eval.tasks.custom_metrics.code_eval import compute_code_eval
import numpy as np
_CITATION = """
@article{wang2022recode,
title={ReCode: Robustness Evaluation of Code Generation Models},
author={Wang, Shiqi and Li, Zheng and Qian, Haifeng and Yang, Chenghao and Wang, Zijian and Shang, Mingyue and Kumar, Varun and Tan, Samson and Ray, Baishakhi and Bhatia, Parminder and others},
journal={arXiv preprint arXiv:2212.10264},
year={2022}
}
"""
TRANSFORMATION_CATEGORIES = ["format", "func_name", "natgen", "nlaugmenter"]
def create_all_tasks():
"""Creates a dictionary of tasks from a list of levels
:return: {task_name: task}
e.g. {multiple-py: Task, multiple-java: Task}
"""
return {
f"perturbed-humaneval-{category}-num_seeds_{num_seeds}": create_task(
category, num_seeds
)
for category in TRANSFORMATION_CATEGORIES
for num_seeds in range(1, 11)
}
def create_task(category, num_seeds):
class PerturbedHumanEval(GeneralPerturbedHumanEval):
DATASET_NAME = category
def __init__(self):
super().__init__(category, num_seeds)
return PerturbedHumanEval
class GeneralPerturbedHumanEval(Task):
DATASET_PATH = "RaymondLi/perturbed_humaneval"
def __init__(self, category, num_seeds):
super().__init__(
stop_words=["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif"],
requires_execution=True,
)
# Transformation category
self.category = category
self.num_seeds = num_seeds
self.filtered_dataset = self.dataset["test"].filter(
lambda x: x["seed"] < num_seeds
)
def get_dataset(self):
"""
Returns dataset for the task or an iterable of any object, that get_prompt can handle
Only keep the first NUM_SEEDS seeds
"""
return self.filtered_dataset
def get_prompt(self, doc):
"""
Builds the prompt for the LM to generate from.
:param doc: dict[str: str]
sample from the test dataset
:return: str
"""
return doc["prompt"].strip()
def get_reference(self, doc):
"""
Builds the reference solution for the doc (sample from the test dataset).
Will be passed to the `process_results` function, and potentially saved.
:param doc: dict[str: str]
sample from the test dataset
:return: dict
"""
test_func = doc["test"]
entry_point = f"check({doc['entry_point']})"
test_code = "\n" + test_func + "\n" + entry_point
return {
"task_id": doc["task_id"],
"seed": doc["seed"],
"perturbation_name": doc["perturbation_name"],
"test_code": test_code,
}
def postprocess_generation(self, generation, idx):
"""
Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int (if needed)
index of doc in the dataset to which the generation belongs
:return: str
"""
prompt = self.get_prompt(self.filtered_dataset[idx])
generation = generation[len(prompt) :]
return prompt + self._stop_at_stop_token(generation, self.stop_words)
def process_results(self, generations, references):
"""
Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations as in {"metric_name": result}.
        We encourage directly loading the metric from the `evaluate` library to keep the code concise.
:param generations: list(list(str))
list of lists containing generations
:param references: list(dict)
            list of dict containing references
:return: dict[str: float]
"""
_, detailed_results = compute_code_eval(
references=[ref["test_code"] for ref in references],
predictions=generations,
)
# Compute robust-pass-at-1. For each transformation and each prompt, we have s=5 randomly perturbed prompts.
# With a single sample per prompt, RP@1 on a given transformation is the fraction of examples where completions
# for all the perturbed prompts are correct.
# With n samples per prompt, https://arxiv.org/abs/2212.10264 defines RP@1 as the average of the
# 1/n * sum_{i=1}^n I(all s correct for generation-seed i) over all prompts.
# An alternate could be the average of the
# prod_{j=1}^s 1/n * sum_{i=1}^n I(j-th prompt correct for generation-seed i) over all prompts.
# We compute RP@1 for each transformation
# transformation -> problem -> seed -> [n results]
transformation_problem_results = defaultdict(lambda: defaultdict(dict))
for i, ref in enumerate(references):
result = detailed_results[i]
result = [x[1]["passed"] for x in result]
assert (
ref["seed"]
not in transformation_problem_results[ref["perturbation_name"]][
ref["task_id"]
]
)
transformation_problem_results[ref["perturbation_name"]][ref["task_id"]][
ref["seed"]
] = result
rp1 = {}
for transformation, problem_results in transformation_problem_results.items():
res = {}
res["robust-pass-at-1"] = sum(
# results = {seed -> [n results]}
# 1/n * sum_{i=1}^n I(all s correct for generation-seed i)
float(all(results_)) / len(list(results.values())[0])
for results in problem_results.values()
for results_ in zip(*results.values())
) / len(problem_results)
res["alt-robust-pass-at-1"] = sum(
# results = {seed -> [n results]}
# prod_{j=1}^s 1/n * sum_{i=1}^n I(j-th prompt correct for generation-seed i)
np.prod([np.mean(results[j]) for j in results])
for results in problem_results.values()
) / len(problem_results)
rp1[transformation] = res
# TODO: for overall-performance, a prompt is solved if correct over the s prompts for all transformation categories.
return rp1
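# Worked toy example (hypothetical numbers) for the robust-pass-at-1 formulas above:
# one problem, s=2 perturbed prompts (seeds), n=2 generations per prompt, with
#   seed 0 -> [True, True]   and   seed 1 -> [True, False]
# Pairing generation i across seeds: i=0 has all seeds correct (1), i=1 does not (0),
# so robust-pass-at-1 = (1 + 0) / 2 = 0.5. The alternate metric multiplies per-seed
# pass rates: 1.0 * 0.5 = 0.5 as well in this case.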
from typing import Dict, List
from tqdm import tqdm
from bigcode_eval.base import Task
_CITATION = """
@article{allal2023santacoder,
title={SantaCoder: don't reach for the stars!},
author={Allal, Loubna Ben and Li, Raymond and Kocetkov, Denis and Mou, Chenghao and Akiki, Christopher and Ferrandis, Carlos Munoz and Muennighoff, Niklas and Mishra, Mayank and Gu, Alex and Dey, Manan and others},
journal={arXiv preprint arXiv:2301.03988},
year={2023}
}
"""
LANGUAGES = [
"py",
"js",
"java",
]
def create_all_tasks():
return {
"santacoder_fim": SantaCoderFIM,
"starcoder_fim": StarCoderFIM,
}
def initialize_empty_metrics(languages: List[str]) -> Dict[str, float]:
metrics = {}
for lang in languages:
metrics[f"n_accurate_{lang}"] = 0.0
metrics[f"n_count_{lang}"] = 0.0
return metrics
def aggregate_per_lang_accuracy(
metrics: Dict[str, float], languages: List[str]
) -> Dict[str, float]:
em_metrics = {}
for lang in languages:
# avoid div by 0
acc = (
metrics[f"n_accurate_{lang}"] / metrics[f"n_count_{lang}"]
if metrics[f"n_count_{lang}"]
else 0
)
em_metrics[f"{lang} Exact Match"] = acc
return em_metrics
class SantaCoderFIM(Task):
DATASET_PATH = "bigcode/santacoder-fim-task"
def __init__(
self,
fim_prefix: str = "<fim-prefix>",
fim_middle: str = "<fim-middle>",
fim_suffix: str = "<fim-suffix>",
stop_words: List[str] = ["<|endoftext|>", "<|filename|>"],
requires_execution: bool = False
):
super().__init__(
stop_words=stop_words,
requires_execution=requires_execution,
)
self.fim_prefix = fim_prefix
self.fim_middle = fim_middle
self.fim_suffix = fim_suffix
def get_dataset(self):
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
dataset = self.dataset["train"]
return dataset
def get_prompt(self, doc):
"""Builds the prompt for the LM to generate from."""
return f"""{self.fim_prefix}{doc["prompt"]}{self.fim_suffix}{doc["suffix"]}{self.fim_middle}"""
def get_reference(self, doc):
"""Builds the reference solution for the doc (sample from the test dataset)."""
return doc["canonical_solution"]
def postprocess_generation(self, generation, idx):
"""Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int
index of doc in the dataset to which the generation belongs
"""
doc = self.get_dataset()[idx]
prompt = self.get_prompt(doc)
output = generation[len(prompt) :]
return self._stop_at_stop_token(output, self.stop_words)
# return generation
def process_results(self, generations, references):
"""Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations as in {"metric_name": result}.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
            list of str containing references
:return: dict[str: float]
"""
metrics = initialize_empty_metrics(LANGUAGES)
for idx, (gen, reference) in tqdm(enumerate(zip(generations, references))):
language = self.get_dataset()[idx]["language"]
for g in gen:
metrics[f"n_accurate_{language}"] += int(g.strip() == reference.strip())
metrics[f"n_count_{language}"] += len(gen)
em_metrics = aggregate_per_lang_accuracy(metrics, LANGUAGES)
return em_metrics
class StarCoderFIM(SantaCoderFIM):
DATASET_PATH = "bigcode/santacoder-fim-task"
def __init__(self):
fim_prefix = "<fim_prefix>"
fim_middle = "<fim_middle>"
fim_suffix = "<fim_suffix>"
stop_words = ["<|endoftext|>", "<|filename|>", "<file_sep>"]
super().__init__(
stop_words=stop_words,
requires_execution=False,
fim_prefix=fim_prefix,
fim_middle=fim_middle,
fim_suffix=fim_suffix,
)