add todos

3453b128 · Baber · 15e3f5c5 · 3453b128 · 3453b128
Commit 3453b128 authored Sep 25, 2024 by Baber
Hide whitespace changes
Inline Side-by-side

Showing with 34 additions and 1 deletion

lm_eval/api/judge_task.py lm_eval/api/judge_task.py +5 -1

lm_eval/tasks/judge/arena_hard.yaml lm_eval/tasks/judge/arena_hard.yaml +29 -0

No files found.
--- a/lm_eval/api/judge_task.py
+++ b/lm_eval/api/judge_task.py
@@ -6,6 +6,10 @@ import datasets
 from lm_eval.api.task import ConfigurableTask
+# TODO: multifilter tasks
+# TODO: should we have a standard structure (modelA, modelB etc)
+# TODO: what about parsing the doc (rather than just resps from json)
+# TODO: best way for chat-template doc_to_text?
 class JudgeTask(ConfigurableTask):
    def __init__(
        self,
@@ -34,7 +38,7 @@ class JudgeTask(ConfigurableTask):
        resps = []
        # load json
        if self.output_path is not None:
-            with open(self.output_path, "r", encoding='utf-8') as f:
+            with open(self.output_path, "r", encoding="utf-8") as f:
                for line in f:
                    resp = json.loads(line)
                    resps.append({"resp": resp["resps"][0][0], "doc": resp["doc_id"]})

--- a/lm_eval/tasks/judge/arena_hard.yaml
+++ b/lm_eval/tasks/judge/arena_hard.yaml
+tag:
+  - math_word_problems  # NOTE(review): tag looks carried over from a gsm8k-style config; confirm it is intended for arena_hard
+task: arena_hard
+#dataset_path: gsm8k
+#dataset_name: main
+output_type: generate_until
+#training_split: train
+#fewshot_split: train
+doc_to_text: ["<|User Prompt|>\n{question_1}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>"]  # NOTE(review): single-brace placeholders; confirm how JudgeTask fills them
+# TODO: need a different metric
+#doc_to_target: A, B #" {{answer.split('### ')[-1].rstrip()}}"
+metric_list:
+  - metric: bypass
+    aggregation: mean
+generation_kwargs:
+  until:
+    - "Question:"
+    - "</s>"
+    - "<|im_end|>"
+  do_sample: false
+  temperature: 0.0
+filter_list:
+  - name: "test"
+    filter:
+      - function: "regex"
+        regex_pattern: '\[\[([AB<>=]+)\]\]'  # literal [[...]] around the verdict, e.g. [[A>>B]]; single quotes keep the backslashes literal in YAML
+      - function: "take_first"
+repeats: 1
+num_fewshot: 5  # NOTE(review): no fewshot_split is configured above; confirm few-shot examples are actually wanted