# gsm8k_judge.yaml
# Task identity for the GSM8K "judge" configuration.
tag:
  - judge
# Settings are also read from this sibling file.
# NOTE(review): presumably keys defined below take precedence over the
# included ones — confirm against the loader.
include: gsm8k_judge_1.yaml
task: gsm8k_judge_2
output_type: generate_until

# NOTE(review): no value was supplied here, so this key parses as null; it is
# written explicitly to make that visible. Confirm whether a path is expected.
output_path: null

# Original GSM8K prompt, kept for reference:
# doc_to_text: "Question: {{question}}\nAnswer:"
#
# Judge prompt. A literal block scalar is used so that the line breaks are real
# newlines. YAML only processes backslash escapes inside double-quoted scalars,
# so a single-quoted string containing \n would put a literal backslash and "n"
# into the prompt instead of a line break.
# NOTE(review): {{resp}} is assumed to be supplied by the caller/dataset as the
# answer attempt being judged — confirm where it is populated.
doc_to_text: |-
  Given the following question and reference answer, verify if the attempted answer is correct. If it is, return "The answer is Correct". If it is incorrect, return "The answer is Incorrect".
  Question: {{question}}
  Reference Answer: {{answer}}
  Answer Attempt: {{resp}}
target_delimiter: "\n"
# The target is the same constant string for every document.
# The original GSM8K target was: " {{answer.split('### ')[-1].rstrip()}}"
doc_to_target: "The answer is Correct"
# Scoring: case-insensitive exact match, averaged over documents.
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: false
    # Regular expressions listed for the metric to ignore. Single-quoted so the
    # backslashes are literal and need no doubling.
    regexes_to_ignore:
      # a comma
      - ','
      # a literal dollar sign
      - '\$'
      # everything up to and including the last "#### " (dot matches newlines)
      - '(?s).*#### '
      # a period at the very end
      - '\.$'
# Decoding settings for each generation request.
generation_kwargs:
  # Strings at which generation is cut off (chat-template turn markers, the
  # start of a new "Question:", and end-of-sequence tokens).
  until:
    - '<|start_header_id|>user<|end_header_id|>'
    - "Question:"
    - "</s>"
    - "<|im_end|>"
  # Sampling disabled with temperature 0.
  do_sample: false
  temperature: 0.0
# One repeat per document; five few-shot examples are requested.
# NOTE(review): confirm that few-shot examples are meaningful for the judge
# prompt, which already embeds a reference answer and an answer attempt.
repeats: 1
num_fewshot: 5
# Filters applied to the model output, in the order listed.
filter_list:
  - name: "test"
    filter:
      # Looks for the verdict phrase in the response.
      # NOTE(review): the pattern's only capture group is (Correct|Incorrect).
      # If this filter returns the captured group rather than the whole match,
      # the result can never equal the doc_to_target string
      # "The answer is Correct" — confirm against the filter implementation.
      # NOTE(review): confirm that the regex filter accepts ignore_punctuation
      # and ignore_case; they are also set on the exact_match metric.
      - function: "regex"
        regex_pattern: "The answer is (Correct|Incorrect)"
        ignore_punctuation: true
        ignore_case: true
      # Presumably keeps only the first filtered response — confirm.
      - function: "take_first"
# Version of this task configuration; the unquoted value parses as a float.
metadata:
  version: 3.0