"git@developer.sourcefind.cn:modelzoo/solov2-pytorch.git" did not exist on "bac11303e2776b57b730d4ffb5d41bc5ce34b079"
Commit 76a95e9e authored by Jingming, committed by GitHub

[Feature] Support the use of humaneval_plus. (#720)



* [Feature] Support the use of humaneval_plus.

* [Feature] Add humaneval_plus_gen.py

* minor check

* [Fix] Fix bug

---------
Co-authored-by: yingfhu <yingfhu@gmail.com>
parent 47e745d7
from mmengine.config import read_base

with read_base():
    from .humaneval_plus_gen_8e312c import humaneval_plus_datasets  # noqa: F401, F403
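
A dataset shortcut like the file above is normally pulled into a top-level evaluation config together with a model config. The sketch below is only an illustration and is not part of this commit; it assumes the new files live under configs/datasets/humaneval_plus/ and uses a hypothetical model config name.

from mmengine.config import read_base

with read_base():
    # dataset list defined by this commit (path assumed)
    from .datasets.humaneval_plus.humaneval_plus_gen import \
        humaneval_plus_datasets
    # any existing model config; this name is only a placeholder
    from .models.hf_llama.hf_llama2_7b import models

datasets = humaneval_plus_datasets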
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvaluator, k=1, metric='EvalPlus'),
    pred_role='BOT',
    k=[1, 10, 100],  # this parameter is only used by the HumanEval metric
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus',
        type=HumanevalDataset,
        path='./data/humaneval/human-eval-v2-20210705.jsonl',
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg)
]
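
To make the wiring of this config concrete, here is a toy sketch of the inputs the evaluator ends up receiving: references come from output_column='task_id', the prompts travel along in the test set, and predictions are the post-processed model completions. The values are made-up placeholders, so a real run (which also requires evalplus to be installed) would use genuine HumanEval task ids and prompts.

from opencompass.datasets import HumanEvaluator

evaluator = HumanEvaluator(k=1, metric='EvalPlus')  # as in humaneval_plus_eval_cfg
predictions = ['    return a + b\n']                # post-processed completion from the model
references = ['HumanEval/0']                        # from output_column='task_id'
test_set = [{'prompt': 'def add(a, b):\n'}]         # from input_columns=['prompt']
# score() concatenates prompt + completion into a full 'solution'
# and hands the file to evalplus for functional-correctness checking
result = evaluator.score(predictions, references, test_set)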
@@ -41,42 +41,90 @@ class HumanevalDataset(BaseDataset):
 class HumanEvaluator(BaseEvaluator):
-    """Evaluator for human eval."""
+    """Evaluator for HumanEval or EvalPlus."""
 
-    def __init__(self, k: List[int] = [1, 10, 100]) -> None:
-        try:
-            from human_eval.data import HUMAN_EVAL, write_jsonl
-            from human_eval.evaluation import evaluate_functional_correctness
-            self.write_jsonl = write_jsonl
-            self.HUMAN_EVAL = HUMAN_EVAL
-            self.eval = evaluate_functional_correctness
-        except ImportError:
-            raise ImportError('Please install human_eval following'
-                              'https://github.com/openai/human-eval/tree/'
-                              'master#installation first.')
+    def __init__(self,
+                 k: List[int] = [1, 10, 100],
+                 metric: str = 'HumanEval') -> None:
+        self.metric = metric
+        assert self.metric in ['HumanEval', 'EvalPlus']
+        if self.metric == 'HumanEval':
+            try:
+                from human_eval.data import HUMAN_EVAL, write_jsonl
+                from human_eval.evaluation import \
+                    evaluate_functional_correctness
+                self.write_jsonl = write_jsonl
+                self.HUMAN_EVAL = HUMAN_EVAL
+                self.eval = evaluate_functional_correctness
+            except ImportError:
+                raise ImportError(
+                    'Please install human_eval using the following steps:\n'
+                    'git clone git@github.com:open-compass/human-eval.git\n'
+                    'cd human-eval && pip install -e .')
+        else:
+            try:
+                from evalplus.data import write_jsonl
+                from evalplus.evaluate import evaluate
+                self.write_jsonl = write_jsonl
+                self.eval = evaluate
+            except ImportError:
+                raise ImportError(
+                    'Please install evalplus using the following steps:\n'
+                    'git clone --recurse-submodules git@github.com:open-compass/human-eval.git\n'  # noqa
+                    'cd human-eval\n'
+                    'pip install -e .\n'
+                    'pip install -e evalplus\n')
         self.k = k
         super().__init__()
 
-    def score(self, predictions, references):
+    def score(self, predictions, references, test_set):
+        prompts = [item['prompt'] for item in test_set]
         humaneval_preds = []
-        # create json file in human_eval format
-        for preds, refer in zip(predictions, references):
-            # suits for two case
-            # 1. use repeated dataset
-            # 2. use `num_return_sequences` to generate multiple responses
-            if not isinstance(preds, list):
-                preds = [preds]
-            for pred in preds:
-                humaneval_preds.append({'task_id': refer, 'completion': pred})
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            out_dir = osp.join(tmp_dir, 'human_eval.json')
-            self.write_jsonl(out_dir, humaneval_preds)
-            score = self.eval(out_dir,
-                              self.k,
-                              n_workers=4,
-                              timeout=3.0,
-                              problem_file=self.HUMAN_EVAL)
-            return {f'humaneval_{k}': score[k] * 100 for k in score}
+        if self.metric == 'HumanEval':
+            # create json file in human_eval format
+            for preds, refer in zip(predictions, references):
+                # handles two cases:
+                # 1. use repeated dataset
+                # 2. use `num_return_sequences` to generate multiple responses
+                if not isinstance(preds, list):
+                    preds = [preds]
+                for pred in preds:
+                    humaneval_preds.append({
+                        'task_id': refer,
+                        'completion': pred
+                    })
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                out_dir = osp.join(tmp_dir, 'human_eval.json')
+                self.write_jsonl(out_dir, humaneval_preds)
+                score = self.eval(out_dir,
+                                  self.k,
+                                  n_workers=4,
+                                  timeout=3.0,
+                                  problem_file=self.HUMAN_EVAL)
+                return {f'humaneval_{k}': score[k] * 100 for k in score}
+        else:
+            for preds, refer, prompt in zip(predictions, references, prompts):
+                if not isinstance(preds, list):
+                    preds = [preds]
+                for pred in preds:
+                    humaneval_preds.append({
+                        'task_id': refer,
+                        'solution': prompt + pred
+                    })
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                out_dir = osp.join(tmp_dir, 'human_eval.jsonl')
+                self.write_jsonl(out_dir, humaneval_preds)
+                flags = dict(dataset='humaneval',
+                             samples=out_dir,
+                             base_only=None,
+                             parallel=None,
+                             i_just_wanna_run=None,
+                             test_details=0.2,
+                             min_time_limit=0.2,
+                             gt_time_limit_factor=4.0,
+                             mini=None)
+                score = self.eval(flags)
+                return {f'humaneval_plus_{k}': score[k] * 100 for k in score}
 
 
 def humaneval_postprocess(text: str) -> str:
......
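
For reference, a minimal sketch of the intermediate samples file the new EvalPlus branch writes before scoring. It only assumes evalplus is installed; the prompt and completion below are placeholders, and a real record would hold an actual HumanEval prompt plus the model's completion.

from evalplus.data import write_jsonl

prompt = 'def f():\n'          # placeholder dataset prompt
completion = '    return 0\n'  # placeholder model completion

# Unlike the HumanEval path, which stores only the bare 'completion',
# the EvalPlus path stores the full program under 'solution'.
write_jsonl('human_eval.jsonl', [{
    'task_id': 'HumanEval/0',
    'solution': prompt + completion,
}])
# The evaluator then builds a flags dict (dataset, samples, parallel, ...)
# that appears to mirror evalplus' command-line options and passes it to
# evalplus.evaluate.evaluate to obtain pass@k scores.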