Commit 52192906 authored by Baber
Browse files

update mbpp

parent 13aa5096
...@@ -4,20 +4,31 @@ dataset_name: full ...@@ -4,20 +4,31 @@ dataset_name: full
unsafe_code: true unsafe_code: true
output_type: generate_until output_type: generate_until
test_split: test test_split: test
doc_to_text: "You are an expert Python programmer, and here is your task: {{text}} Your code should pass these tests:\n\n{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}\n[BEGIN]\n" repeats: 20
doc_to_target: "{% if is_fewshot is defined %}{{code}}\n[DONE]{% else %}{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}{% endif %}" doc_to_text: "{{text|trim}}\n{{code}}.split(':')[0]:\n"
doc_to_target: "{% if is_fewshot is defined %}{{code}}\n{% else %}{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}{% endif %}"
target_delimiter: "" target_delimiter: ""
gen_prefix: "Here is the completed function:\n\n```python\n"
metric_list: metric_list:
- metric: !function utils.pass_at_1 - metric: !function utils.pass_at_k
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
k: [ 10 ]
generation_kwargs: generation_kwargs:
until: until: [
- "[DONE]" "\nclass",
"\nassert",
'\n"""',
"\nprint",
"\nif",
"\n```",
"\n#",
"\n<|/",
"<|eot_id|>",
]
do_sample: false do_sample: false
num_fewshot: 3
fewshot_config: fewshot_config:
sampler: first_n sampler: first_n
samples: !function utils.list_fewshot_samples samples: !function utils.list_fewshot_samples
metadata: metadata:
version: 1.0 version: 2.0
...@@ -17,9 +17,10 @@ doc_to_target: "{% if is_fewshot is defined %}{{code}}\n[DONE]{% else %}{{test_l ...@@ -17,9 +17,10 @@ doc_to_target: "{% if is_fewshot is defined %}{{code}}\n[DONE]{% else %}{{test_l
target_delimiter: "" target_delimiter: ""
gen_prefix: "Here is the completed function:\n\n```python\n" gen_prefix: "Here is the completed function:\n\n```python\n"
metric_list: metric_list:
- metric: !function utils.pass_at_10 - metric: !function utils.pass_at_k
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
k: [ 10 ]
filter_list: filter_list:
- name: "create_test" - name: "create_test"
filter: filter:
...@@ -27,16 +28,16 @@ filter_list: ...@@ -27,16 +28,16 @@ filter_list:
filter_fn: !function utils.build_predictions filter_fn: !function utils.build_predictions
generation_kwargs: generation_kwargs:
until: [ until: [
"\nclass", "\nclass",
"\nassert", "\nassert",
'\n"""', '\n"""',
"\nprint", "\nprint",
"\nif", "\nif",
"\n```", "\n```",
"\n#", "\n#",
"\n<|/", "\n<|/",
"<|eot_id|>", "<|eot_id|>",
] ]
do_sample: true do_sample: true
temperature: 0.8 temperature: 0.8
top_p: 0.95 top_p: 0.95
......
...@@ -9,9 +9,10 @@ doc_to_target: "{% if is_fewshot is defined %}{{code}}\n```{% else %}{{test_list ...@@ -9,9 +9,10 @@ doc_to_target: "{% if is_fewshot is defined %}{{code}}\n```{% else %}{{test_list
gen_prefix: "\n```python\n" gen_prefix: "\n```python\n"
target_delimiter: "" target_delimiter: ""
metric_list: metric_list:
- metric: !function utils.pass_at_1 - metric: !function utils.pass_at_k
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
k: [ 1 ]
filter_list: filter_list:
- name: "extract_code" - name: "extract_code"
filter: filter:
...@@ -19,7 +20,7 @@ filter_list: ...@@ -19,7 +20,7 @@ filter_list:
filter_fn: !function utils.build_predictions filter_fn: !function utils.build_predictions
generation_kwargs: generation_kwargs:
max_gen_toks: 256 max_gen_toks: 256
until: [] until: [ ]
do_sample: false do_sample: false
num_fewshot: 3 num_fewshot: 3
fewshot_config: fewshot_config:
......
...@@ -5,44 +5,36 @@ import evaluate as hf_evaluate ...@@ -5,44 +5,36 @@ import evaluate as hf_evaluate
try: try:
pass_at_k = hf_evaluate.load("code_eval") compute_ = hf_evaluate.load("code_eval")
# run simple test to check code execution is enabled before model generation
test_cases = ["assert add(2, 3)==5"] test_cases = ["assert add(2, 3)==5"]
candidates = [["def add(a,b): return a*b"]] candidates = [["def add(a,b): return a*b"]]
results = pass_at_k.compute(references=test_cases, predictions=candidates, k=[1]) results = compute_.compute(references=test_cases, predictions=candidates, k=[1])
except Exception as e: except Exception as e:
raise e raise e
def pass_at_1( def pass_at_k(references: list[str], predictions: list[list[str]], k: list[int] = None):
references: Union[str, list[str]], predictions: Union[str, list[list[str]]] global compute_
) -> float: assert k is not None
if isinstance(references, str): if isinstance(k, int):
references = [references] k = [k]
if isinstance(predictions[0], str): res = compute_.compute(
predictions = [[p] for p in predictions]
return pass_at_k.compute(
references=references, references=references,
predictions=predictions, predictions=predictions,
k=[1], k=k,
)[0]["pass@1"]
def pass_at_10(
references: Union[str, list[str]], predictions: Union[str, list[list[str]]]
) -> float:
global pass_at_k
if isinstance(references, str):
references = [references]
if isinstance(predictions[0], str):
predictions = [[p] for p in predictions]
res = pass_at_k.compute(
references=references, predictions=predictions, k=[10], num_workers=20
) )
return res[0] return res[0]
def extract_python_block(text: str) -> str:
    """Return the body of the first python-tagged fenced block in *text*.

    Text that does not begin with a code fence (ignoring leading
    whitespace) is treated as bare code and wrapped in a python fence
    before matching, so a bare completion is returned up to the first
    closing fence it contains.

    Args:
        text: Raw text to extract code from.

    Returns:
        The captured code prefixed with
        ``"from __future__ import annotations\\n"``, or ``""`` when no
        python-tagged fence is found (for example when the text opens
        with an untagged or differently tagged fence).
    """
    # Check the stripped text: with a plain ``text.startswith`` a completion
    # such as "\n```python\n...```" was wrapped a second time, and the lazy
    # pattern below then captured an empty body instead of the real code.
    if not text.lstrip().startswith("```"):
        text = "```python\n" + text + "\n```"
    # capture only fences whose language tag is 'python'
    pattern = re.compile(r"```python\n([\s\S]*?)\n?```", re.IGNORECASE)
    match = pattern.search(text)
    if match is None:
        return ""
    # NOTE(review): the future import presumably keeps annotations in the
    # extracted code from being evaluated at definition time; confirm.
    return "from __future__ import annotations\n" + match.group(1)
def extract_code_blocks(text: str) -> str: def extract_code_blocks(text: str) -> str:
# Pattern to match ```...``` blocks # Pattern to match ```...``` blocks
ignore_annotations = "from __future__ import annotations\n" ignore_annotations = "from __future__ import annotations\n"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment