Commit 5b159bf9 authored by Baber

test humaneval

parent 173b2bc3
@@ -1503,9 +1503,9 @@ class ConfigurableTask(Task):
             # we expect multiple_targets to be a list.
             elif self.multiple_target:
                 gold = list(gold)
-            elif (
-                type(gold) is not type(result)
-                and "bypass" not in self._metric_fn_list.keys()
-            ):
+            # TODO: handle this better
+            elif type(gold) is not type(result) and not (
+                "bypass" in self._metric_fn_list.keys() or isinstance(result, list)
+            ):
                 # cast gold to the same type as result
                 gold = type(result)(gold)
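The rewritten guard leaves gold untouched whenever the result is a list (as it is for generative code tasks that return several samples per doc), rather than casting it whenever a "bypass" metric is absent. A minimal standalone sketch of the new condition, using hypothetical values instead of the harness's real objects:

import builtins  # stand-in only; no harness imports needed for this sketch

gold = "assert add(2, 3) == 5"            # reference string from doc_to_target
result = ["def add(a, b): return a + b"]  # list of generated samples for pass@k
metric_fns = {"pass_at_1": None}          # hypothetical metric registry

# Old condition: types differ and no "bypass" metric -> gold = type(result)(gold),
# which would turn the reference string into a list of characters here.
# New condition additionally skips the cast whenever result is a list:
if type(gold) is not type(result) and not (
    "bypass" in metric_fns.keys() or isinstance(result, list)
):
    gold = type(result)(gold)

print(gold)  # unchanged: "assert add(2, 3) == 5"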
@@ -3,11 +3,12 @@ dataset_path: openai/openai_humaneval
 output_type: generate_until
 test_split: test
 doc_to_text: "{{prompt}}"
-doc_to_target: !function utils.build_references
+doc_to_target: "{{test}}\ncheck({{entry_point}})"
 metric_list:
   - metric: !function utils.pass_at_1
     aggregation: mean
     higher_is_better: true
+    k: 64
 generation_kwargs:
   until:
     - "\nclass"
@@ -18,7 +19,7 @@ generation_kwargs:
   do_sample: true
   temperature: 0.2
   top_p: 0.95
-repeats: 64
+repeats: 2
 num_fewshot: 0
 filter_list:
   - name: "n=64" # number of samples to estimate pass@k
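The metric now receives k: 64 as a kwarg, while repeats is dropped to 2 for this test commit; the filter name "n=64" refers to the number of sampled completions per problem used to estimate pass@k. The code_eval metric estimates pass@k with the standard unbiased formula 1 - C(n-c, k) / C(n, k); a small standalone sketch (values here are illustrative):

import numpy as np

def estimate_pass_at_k(n: int, c: int, k: int) -> float:
    # n: samples generated per problem, c: samples that passed, k: attempt budget.
    # Unbiased estimator 1 - C(n - c, k) / C(n, k), computed stably as a product.
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# e.g. 64 samples with 8 passing: probability at least one of k=1 draws passes
print(estimate_pass_at_k(64, 8, 1))  # 0.125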
 import evaluate as hf_evaluate
 
-pass_at_k = hf_evaluate.load("code_eval")
-
-# run simple test to check code execution is enabled before model generation
-test_cases = ["assert add(2, 3)==5"]
-candidates = [["def add(a,b): return a*b"]]
-results = pass_at_k.compute(references=test_cases, predictions=candidates, k=[1])
+# pass_at_k = hf_evaluate.load("code_eval")
+#
+# # run simple test to check code execution is enabled before model generation
+# test_cases = ["assert add(2, 3)==5"]
+# candidates = [["def add(a,b): return a*b"]]
+# results = pass_at_k.compute(references=test_cases, predictions=candidates, k=[1])
 
 
-def pass_at_1(references, predictions):
-    return pass_at_k.compute(
+def pass_at_1(references: list[str], predictions: list[list[str]], k: list[int] = None):
+    pass_at_k = hf_evaluate.load("code_eval")
+    assert k is not None
+    if isinstance(k, int):
+        k = [k]
+    res = pass_at_k.compute(
         references=references,
         predictions=predictions,
-        k=[1],
-    )[0]["pass@1"]
+        k=k,
+    )[0]
+    return {
+        key: val for key, val in res.items() if key in map(lambda x: f"pass@{x}", k)
+    }
 
 
 def build_references(doc):
     return doc["test"] + "\n" + f"check({doc['entry_point']})"
 
 
-def build_predictions(resps, docs):
-    preds = []
-    for resp, doc in zip(resps, docs):
-        pred = [doc["prompt"] + r for r in resp]
-        preds.append(pred)
-    return preds
+def build_predictions(resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
+    return [[doc["prompt"] + r for r in resp] for resp, doc in zip(resps, docs)]
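A usage sketch of the new pass_at_1 signature, assuming this utils.py is importable from the working directory; the references and candidates below are illustrative. Because code_eval executes generated code, it refuses to run unless HF_ALLOW_CODE_EVAL=1 is set:

import os
os.environ["HF_ALLOW_CODE_EVAL"] = "1"  # opt in to executing untrusted generated code

from utils import pass_at_1  # assumption: utils.py is on sys.path

references = ["assert add(2, 3) == 5"]
predictions = [[
    "def add(a, b): return a + b",  # passes the test
    "def add(a, b): return a * b",  # fails the test
]]

print(pass_at_1(references, predictions, k=[1, 2]))
# expected: {'pass@1': 0.5, 'pass@2': 1.0}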