update mbpp

52192906 · Baber · 13aa5096 · 52192906 · 52192906 · 52192906
Commit 52192906 authored Sep 16, 2025 by Baber
4 changed files
--- a/lm_eval/tasks/mbpp/mbpp.yaml
+++ b/lm_eval/tasks/mbpp/mbpp.yaml
@@ -4,20 +4,31 @@ dataset_name: full
 unsafe_code: true
 output_type: generate_until
 test_split: test
-doc_to_text: "You are an expert Python programmer, and here is your task: {{text}} Your code should pass these tests:\n\n{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}\n[BEGIN]\n"
+repeats: 20
-doc_to_target: "{% if is_fewshot is defined %}{{code}}\n[DONE]{% else %}{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}{% endif %}"
+doc_to_text: "{{text|trim}}\n{{code.split(':')[0]}}:\n"
+doc_to_target: "{% if is_fewshot is defined %}{{code}}\n{% else %}{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}{% endif %}"
 target_delimiter: ""
+gen_prefix: "Here is the completed function:\n\n```python\n"
 metric_list:
-  - metric: !function utils.pass_at_1
+  - metric: !function utils.pass_at_k
    aggregation: mean
    higher_is_better: true
+    k: [ 10 ]
 generation_kwargs:
-  until:
+  until: [
-    - "[DONE]"
+    "\nclass",
+    "\nassert",
+    '\n"""',
+    "\nprint",
+    "\nif",
+    "\n```",
+    "\n#",
+    "\n<|/",
+    "<|eot_id|>",
+  ]
  do_sample: false
-num_fewshot: 3
 fewshot_config:
  sampler: first_n
  samples: !function utils.list_fewshot_samples
 metadata:
-  version: 1.0
+  version: 2.0
--- a/lm_eval/tasks/mbpp/mbpp_evalplus.yaml
+++ b/lm_eval/tasks/mbpp/mbpp_evalplus.yaml
@@ -17,9 +17,10 @@ doc_to_target: "{% if is_fewshot is defined %}{{code}}\n[DONE]{% else %}{{test_l
 target_delimiter: ""
 gen_prefix: "Here is the completed function:\n\n```python\n"
 metric_list:
-  - metric: !function utils.pass_at_10
+  - metric: !function utils.pass_at_k
    aggregation: mean
    higher_is_better: true
+    k: [ 10 ]
 filter_list:
  - name: "create_test"
    filter:
@@ -27,16 +28,16 @@ filter_list:
        filter_fn: !function utils.build_predictions
 generation_kwargs:
  until: [
-                  "\nclass",
+    "\nclass",
-                  "\nassert",
+    "\nassert",
-                  '\n"""',
+    '\n"""',
-                  "\nprint",
+    "\nprint",
-                  "\nif",
+    "\nif",
-                  "\n```",
+    "\n```",
-                  "\n#",
+    "\n#",
-                  "\n<|/",
+    "\n<|/",
-                  "<|eot_id|>",
+    "<|eot_id|>",
-              ]
+  ]
  do_sample: true
  temperature: 0.8
  top_p: 0.95

--- a/lm_eval/tasks/mbpp/mbpp_instruct.yaml
+++ b/lm_eval/tasks/mbpp/mbpp_instruct.yaml
@@ -9,9 +9,10 @@ doc_to_target: "{% if is_fewshot is defined %}{{code}}\n```{% else %}{{test_list
 gen_prefix: "\n```python\n"
 target_delimiter: ""
 metric_list:
-  - metric: !function utils.pass_at_1
+  - metric: !function utils.pass_at_k
    aggregation: mean
    higher_is_better: true
+    k: [ 1 ]
 filter_list:
  - name: "extract_code"
    filter:
@@ -19,7 +20,7 @@ filter_list:
        filter_fn: !function utils.build_predictions
 generation_kwargs:
  max_gen_toks: 256
-  until: []
+  until: [ ]
  do_sample: false
 num_fewshot: 3
 fewshot_config:

--- a/lm_eval/tasks/mbpp/utils.py
+++ b/lm_eval/tasks/mbpp/utils.py
@@ -5,44 +5,36 @@ import evaluate as hf_evaluate
 try:
-    pass_at_k = hf_evaluate.load("code_eval")
+    compute_ = hf_evaluate.load("code_eval")
-    # run simple test to check code execution is enabled before model generation
    test_cases = ["assert add(2, 3)==5"]
    candidates = [["def add(a,b): return a*b"]]
-    results = pass_at_k.compute(references=test_cases, predictions=candidates, k=[1])
+    results = compute_.compute(references=test_cases, predictions=candidates, k=[1])
 except Exception as e:
    raise e
-def pass_at_1(
+def pass_at_k(references: list[str], predictions: list[list[str]], k: list[int] = None):
-    references: Union[str, list[str]], predictions: Union[str, list[list[str]]]
+    global compute_
-) -> float:
+    assert k is not None
-    if isinstance(references, str):
+    if isinstance(k, int):
-        references = [references]
+        k = [k]
-    if isinstance(predictions[0], str):
+    res = compute_.compute(
-        predictions = [[p] for p in predictions]
-    return pass_at_k.compute(
        references=references,
        predictions=predictions,
-        k=[1],
+        k=k,
-    )[0]["pass@1"]
-def pass_at_10(
-    references: Union[str, list[str]], predictions: Union[str, list[list[str]]]
-) -> float:
-    global pass_at_k
-    if isinstance(references, str):
-        references = [references]
-    if isinstance(predictions[0], str):
-        predictions = [[p] for p in predictions]
-    res = pass_at_k.compute(
-        references=references, predictions=predictions, k=[10], num_workers=20
    )
    return res[0]
+def extract_python_block(text: str) -> str:
+    """Return the body of the first ```python fenced block found in *text*.
+
+    Text that does not begin with a code fence is treated as bare code and is
+    wrapped in a ```python fence before matching. The returned string is the
+    captured block prefixed with ``from __future__ import annotations``; an
+    empty string is returned when no python-tagged fence is found.
+    """
+    # Ignore leading whitespace only for the fence check: a response such as
+    # "\n```python\n..." is already fenced and must not be wrapped a second
+    # time (double wrapping makes the lazy regex capture an empty block). The
+    # text itself is left untouched so indentation of bare code is preserved.
+    if not text.lstrip().startswith("```"):
+        text = "```python\n" + text + "\n```"
+    # capture only fences whose language tag is 'python'
+    pattern = re.compile(r"```python\n([\s\S]*?)\n?```", re.IGNORECASE)
+    m = pattern.search(text)
+    return "from __future__ import annotations\n" + m.group(1) if m else ""
 def extract_code_blocks(text: str) -> str:
    # Pattern to match ```...``` blocks
    ignore_annotations = "from __future__ import annotations\n"