Commit 3453b128 authored by Baber
Browse files

add todos

parent 15e3f5c5
......@@ -6,6 +6,10 @@ import datasets
from lm_eval.api.task import ConfigurableTask
# TODO: multifilter tasks
# TODO: should we have a standard structure (modelA, modelB etc)
# TODO: what about parsing the doc (rather than just resps from json)
# TODO: best way for chat-template doc_to_text?
class JudgeTask(ConfigurableTask):
def __init__(
self,
......@@ -34,7 +38,7 @@ class JudgeTask(ConfigurableTask):
resps = []
# load json
if self.output_path is not None:
with open(self.output_path, "r", encoding='utf-8') as f:
with open(self.output_path, "r", encoding="utf-8") as f:
for line in f:
resp = json.loads(line)
resps.append({"resp": resp["resps"][0][0], "doc": resp["doc_id"]})
......
# Task config for `arena_hard`: the prompt presents a user question and two
# assistant answers (A and B); the filter pipeline pulls the bracketed verdict
# (e.g. `[[A>B]]`) out of the generated judgement.
# NOTE(review): the tag and the "Question:" stop sequence look carried over
# from the gsm8k template referenced in the commented-out keys — confirm they
# are intended for this task.
tag:
  - math_word_problems
task: arena_hard
#dataset_path: gsm8k
#dataset_name: main
output_type: generate_until
#training_split: train
#fewshot_split: train
# NOTE(review): placeholders use single braces ({question_1}); confirm the
# consuming task class substitutes them this way rather than via `{{ ... }}`.
doc_to_text: ["<|User Prompt|>\n{question_1}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>"]
# TODO: need a different metric
#doc_to_target: A, B #" {{answer.split('### ')[-1].rstrip()}}"
metric_list:
  - metric: bypass
    aggregation: mean
generation_kwargs:
  until:
    - "Question:"
    - "</s>"
    - "<|im_end|>"
  do_sample: false
  temperature: 0.0
filter_list:
  - name: "test"
    filter:
      - function: "regex"
        # The brackets must be escaped to match a literal `[[...]]` verdict.
        # Unescaped, `[[([AB<>=]` is parsed as a single character class and the
        # trailing `)` is unbalanced, so the pattern is not a valid Python regex.
        # Single quotes keep the backslashes literal in YAML.
        regex_pattern: '\[\[([AB<>=]+)\]\]'
      - function: "take_first"
repeats: 1
# NOTE(review): no fewshot split is configured above (it is commented out);
# confirm that 5-shot is really wanted for this task.
num_fewshot: 5
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment