test out gsm8k

478d96cd · Baber · 2275b1c4 · 478d96cd · 478d96cd
Commit 478d96cd authored Aug 26, 2024 by Baber
Show whitespace changes
Inline Side-by-side

Showing with 13 additions and 11 deletions

lm_eval/api/judge_task.py lm_eval/api/judge_task.py +4 -1

lm_eval/tasks/judge/gsm8k_judge.yaml lm_eval/tasks/judge/gsm8k_judge.yaml +9 -10

No files found.
--- a/lm_eval/api/judge_task.py
+++ b/lm_eval/api/judge_task.py
@@ -44,7 +44,10 @@ class JudgeTask(ConfigurableTask):
        self.dataset["test"] = self.dataset["test"].add_column(
            "resp", [resp["resp"] for resp in resps]
        )
-        print("done")
+        self.dataset["train"] = self.dataset["train"].add_column(
+            "resp", self.dataset["train"]["answer"]
+        )
+        print("resp columns added")
    # def process_docs(self, dataset: datasets.Dataset):
    #     resps = []

--- a/lm_eval/tasks/judge/gsm8k_judge.yaml
+++ b/lm_eval/tasks/judge/gsm8k_judge.yaml
@@ -6,8 +6,10 @@ output_type: generate_until
 output_path:
-doc_to_text: "Question: {{question}}\nAnswer:"
+#doc_to_text: "Question: {{question}}\nAnswer:"
-doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
+doc_to_text: "Given the following question and reference answer, verify if the attempted answer is correct. If it is, return \"The answer is Correct\". If it is incorrect, return \"The answer is Incorrect\".\nQuestion: {{question}}\nReference Answer: {{answer}}\nAnswer Attempt: {{resp}}" # judge prompt; double-quoted so that \n is parsed as a newline (single-quoted YAML keeps it as a literal backslash-n)
+target_delimiter: "\n"
+doc_to_target: "The answer is Correct" #" {{answer.split('### ')[-1].rstrip()}}"
 metric_list:
  - metric: exact_match
    aggregation: mean
@@ -21,6 +23,7 @@ metric_list:
      - "\\.$"
 generation_kwargs:
  until:
+    - '<|start_header_id|>user<|end_header_id|>'
    - "Question:"
    - "</s>"
    - "<|im_end|>"
@@ -29,16 +32,12 @@ generation_kwargs:
 repeats: 1
 num_fewshot: 5
 filter_list:
-  - name: "strict-match"
+  - name: "test"
    filter:
      - function: "regex"
-        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
+        regex_pattern: "The answer is (Correct|Incorrect)"
-      - function: "take_first"
+        ignore_punctuation: true
-  - name: "flexible-extract"
+        ignore_case: true
-    filter:
-      - function: "regex"
-        group_select: -1
-        regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
      - function: "take_first"
 metadata:
  version: 3.0