gsm8k_judge.yaml 1.23 KB
Newer Older
Baber's avatar
Baber committed
1
2
3
4
5
6
# Task identity: this config registers the task name `gsm8k_judge_2`
# under the `judge` tag.
task: gsm8k_judge_2
tag: [judge]
# Settings are pulled in from gsm8k_judge_1.yaml.
# NOTE(review): presumably keys set in this file override the included
# ones — confirm against the loader.
include: gsm8k_judge_1.yaml
# Free-form generation, terminated by the stop strings listed under
# `generation_kwargs.until`.
output_type: generate_until

haileyschoelkopf's avatar
haileyschoelkopf committed
7
# JSONL samples file referenced by this task.
# NOTE(review): this is an absolute path into one user's home directory;
# it will not resolve on other machines — confirm whether it should be
# relative or supplied at run time.
output_path: "/home/mchorse/lm-evaluation-harness/gsm8k_resps/meta-llama__Meta-Llama-3-8B-Instruct/samples_gsm8k_cot_llama_2024-08-25T21-59-12.123082.jsonl"
Baber's avatar
Baber committed
8

Baber's avatar
Baber committed
9
10
# Previous prompt, kept for reference:
# doc_to_text: "Question: {{question}}\nAnswer:"
#
# Judge prompt template. Written as a literal block scalar so the line
# breaks are real newlines: inside a single-quoted YAML scalar `\n` is
# NOT an escape and would reach the model as a backslash followed by `n`.
# `|-` keeps the inner newlines and strips the trailing one.
# The `{{question}}`, `{{answer}}` and `{{resp}}` placeholders are left
# for the template engine; presumably they are filled per document —
# confirm the field names against the samples file.
doc_to_text: |-
  Given the following question and reference answer, verify if the attempted answer is correct. If it is, return "The answer is Correct". If it is incorrect, return "The answer is Incorrect".
  Question: {{question}}
  Reference Answer: {{answer}}
  Answer Attempt: {{resp}}
haileyschoelkopf's avatar
haileyschoelkopf committed
11
12
# Disabled option, kept for reference:
# target_delimiter: "\n"
#
# The target is the constant string `Correct` for every document.
# Earlier target expression, kept for reference:
#   " {{answer.split('### ')[-1].rstrip()}}"
doc_to_target: 'Correct'
Baber's avatar
Baber committed
13
14
15
16
17
# Scoring: a single exact-match metric, averaged over documents, where a
# higher score is better. Case and punctuation are ignored.
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
    # Patterns listed for the metric to ignore.
    # NOTE(review): presumably they are stripped from both prediction and
    # reference before comparison — confirm against the metric code.
    regexes_to_ignore:
      - ','             # commas
      - '\$'            # literal dollar signs
      - '(?s).*#### '   # everything up to and including a "#### " marker
      - '\.$'           # one trailing period
# Decoding settings: sampling is disabled and temperature is 0.0.
generation_kwargs:
  # Stop strings for generation.
  until:
    - '<|start_header_id|>user<|end_header_id|>'
    - "Question:"
    - "</s>"
    - "<|im_end|>"
  do_sample: false
  temperature: 0.0
# Five few-shot examples are requested, with a single repeat.
# NOTE(review): how the few-shot examples are drawn for a judge task is
# not visible here — confirm.
num_fewshot: 5
repeats: 1
# Post-processing pipeline named "test": first apply a regex that
# captures the verdict word, then keep the first result.
filter_list:
  - name: "test"
    filter:
      - function: "regex"
        # Case-sensitive alternation: matches `Correct` or `Incorrect`.
        regex_pattern: "(Correct|Incorrect)"
      - function: "take_first"
# Config version marker (parsed as the float 3.0).
metadata: {version: 3.0}