gaoqiong / lm-evaluation-harness · Commits

Commit 5b159bf9, authored Jan 10, 2025 by Baber
Parent: 173b2bc3

    test humaneval

Showing 3 changed files with 28 additions and 24 deletions.
lm_eval/api/task.py                       +3   -3
lm_eval/tasks/humaneval/humaneval.yaml    +3   -2
lm_eval/tasks/humaneval/utils.py          +22  -19
lm_eval/api/task.py  (view file @ 5b159bf9)

@@ -1503,9 +1503,9 @@ class ConfigurableTask(Task):
             # we expect multiple_targets to be a list.
             elif self.multiple_target:
                 gold = list(gold)
-            elif (
-                type(gold) is not type(result)
-                and "bypass" not in self._metric_fn_list.keys()
+            # TODO: handle this better
+            elif type(gold) is not type(result) and not (
+                "bypass" in self._metric_fn_list.keys() or isinstance(result, list)
             ):
                 # cast gold to the same type as result
                 gold = type(result)(gold)
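In plain terms, the old condition only skipped the type cast when a "bypass" metric was registered; the new condition also skips it when the model result is a list. A minimal, illustrative sketch of the Python behaviour the extra isinstance check guards against (the example values are made up; only gold = type(result)(gold) comes from the diff):

    gold = "42"                      # a string target, as doc_to_target often returns
    result = ["def add(a, b): ..."]  # a list of sampled generations (illustrative)

    # Without the isinstance(result, list) guard, the branch would reach
    # gold = type(result)(gold), i.e. list("42"), mangling the target:
    print(type(result)(gold))        # ['4', '2']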
lm_eval/tasks/humaneval/humaneval.yaml  (view file @ 5b159bf9)

@@ -3,11 +3,12 @@ dataset_path: openai/openai_humaneval
 output_type: generate_until
 test_split: test
 doc_to_text: "{{prompt}}"
-doc_to_target: !function utils.build_references
+doc_to_target: "{{test}}\ncheck({{entry_point}})"
 metric_list:
   - metric: !function utils.pass_at_1
     aggregation: mean
     higher_is_better: true
+    k: 64
 generation_kwargs:
   until:
     - "\nclass"
@@ -18,7 +19,7 @@ generation_kwargs:
   do_sample: true
   temperature: 0.2
   top_p: 0.95
-repeats: 64
+repeats: 2
 num_fewshot: 0
 filter_list:
   - name: "n=64" # number of samples to estimate pass@k
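Two things to note here: the new doc_to_target string inlines exactly what utils.build_references used to build (the test body followed by a check({{entry_point}}) call), and k: 64 is forwarded to the pass_at_1 metric while repeats is dropped to 2 for this test run. The pass@k numbers themselves come from the code_eval metric, which uses the standard unbiased estimator from the HumanEval paper; a small sketch of that estimator for intuition (a restatement, not lm-eval's own code):

    import numpy as np

    def pass_at_k_estimate(n: int, c: int, k: int) -> float:
        # Unbiased estimator 1 - C(n-c, k) / C(n, k): the probability that at
        # least one of k draws from n samples is correct, given c of the n pass.
        if n - c < k:
            return 1.0
        return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

    # With 64 samples per problem (the "n=64" filter above) and, say, 16 passing:
    print(pass_at_k_estimate(n=64, c=16, k=1))  # ~0.25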
lm_eval/tasks/humaneval/utils.py  (view file @ 5b159bf9)

 import evaluate as hf_evaluate

-pass_at_k = hf_evaluate.load("code_eval")
-
-# run simple test to check code execution is enabled before model generation
-test_cases = ["assert add(2, 3)==5"]
-candidates = [["def add(a,b): return a*b"]]
-results = pass_at_k.compute(references=test_cases, predictions=candidates, k=[1])
-
-
-def pass_at_1(references, predictions):
-    return pass_at_k.compute(
+# pass_at_k = hf_evaluate.load("code_eval")
+#
+# # run simple test to check code execution is enabled before model generation
+# test_cases = ["assert add(2, 3)==5"]
+# candidates = [["def add(a,b): return a*b"]]
+# results = pass_at_k.compute(references=test_cases, predictions=candidates, k=[1])
+
+
+def pass_at_1(references: list[str], predictions: list[list[str]], k: list[int] = None):
+    pass_at_k = hf_evaluate.load("code_eval")
+    assert k is not None
+    if isinstance(k, int):
+        k = [k]
+    res = pass_at_k.compute(
         references=references,
         predictions=predictions,
-        k=[1],
-    )[0]["pass@1"]
-
-
-def build_references(doc):
-    return doc["test"] + "\n" + f"check({doc['entry_point']})"
-
-
-def build_predictions(resps, docs):
-    preds = []
-    for resp, doc in zip(resps, docs):
-        pred = [doc["prompt"] + r for r in resp]
-        preds.append(pred)
-    return preds
+        k=k,
+    )[0]
+    return {key: val for key, val in res.items() if key in map(lambda x: f"pass@{x}", k)}
+
+
+def build_predictions(resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
+    return [[doc["prompt"] + r for r in resp] for resp, doc in zip(resps, docs)]
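For reference, a quick way to sanity-check the rewritten helpers end to end. This is an illustrative sketch, not part of the commit: the toy doc mimics HumanEval's prompt/test/entry_point fields, the import path assumes a local lm-evaluation-harness checkout, and HF_ALLOW_CODE_EVAL must be set because code_eval executes model-generated code.

    import os

    os.environ["HF_ALLOW_CODE_EVAL"] = "1"  # opt in to executing generated code

    from lm_eval.tasks.humaneval.utils import build_predictions, pass_at_1

    # One toy problem shaped like a HumanEval doc (illustrative values).
    docs = [{
        "prompt": "def add(a, b):\n",
        "test": "def check(candidate):\n    assert candidate(2, 3) == 5\n",
        "entry_point": "add",
    }]
    resps = [["    return a + b"]]  # one sampled completion per doc

    # Same string the YAML's doc_to_target now renders: "{{test}}\ncheck({{entry_point}})"
    references = [doc["test"] + "\n" + f"check({doc['entry_point']})" for doc in docs]
    predictions = build_predictions(resps, docs)

    print(pass_at_1(references, predictions, k=1))  # expected: {'pass@1': 1.0}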