"src/targets/miopen/vscode:/vscode.git/clone" did not exist on "8e0fff81ab2707932903aca276eec2723a88c0cd"
Commit 2106fbeb authored by Baber's avatar Baber
Browse files

Merge branch 'main' into mathvista

# Conflicts:
#	lm_eval/models/openai_completions.py
parents 4354fe46 703fbffd
group: metabench
task:
  - metabench_arc
  - metabench_gsm8k
  - metabench_hellaswag
  - metabench_mmlu
  - metabench_truthfulqa
  - metabench_winogrande
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: false
metadata:
  version: 0.0
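This group does nothing beyond aggregating the six per-benchmark subset tasks below with an unweighted mean accuracy. A minimal sketch of invoking it, assuming the standard lm-evaluation-harness Python entry point (the checkpoint name is a placeholder, not part of this config):

```python
# Minimal sketch, assuming lm_eval.simple_evaluate from the harness.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",  # placeholder model
    tasks=["metabench"],
)
print(results["results"])
```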
---
task: metabench_arc
tag:
  - metabench_arc_subset
dataset_path: HCAI/metabench
dataset_name: ARC
process_docs: !function process_docs.process_arc
output_type: multiple_choice
training_split: null
validation_split: null
test_split: primary
num_fewshot: 0
doc_to_text: "{{twentyfive_shot_preprompt}}Question: {{question}}\nAnswer:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
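The `process_docs.process_arc` hook is resolved by the harness's `!function` tag: it receives the raw `datasets.Dataset` and must return documents exposing the fields the Jinja templates read (`question`, `choices.label`, `choices.text`, `answerKey`, and the `twentyfive_shot_preprompt` string). A hypothetical sketch of that contract only, not the actual implementation in this commit:

```python
# Hypothetical sketch of the process_docs contract; the real
# process_docs.process_arc may do more (e.g. filtering or reformatting).
import datasets

def process_arc(dataset: datasets.Dataset) -> datasets.Dataset:
    def _process(doc):
        # Pass through exactly the fields the templates above consume.
        return {
            "question": doc["question"],
            "choices": doc["choices"],  # holds parallel .label/.text lists
            "answerKey": doc["answerKey"],
            "twentyfive_shot_preprompt": doc["twentyfive_shot_preprompt"],
        }
    return dataset.map(_process)
```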
---
include: metabench_arc.yaml
task: metabench_arc_permute
process_docs: !function process_docs_permute.process_arc
metadata:
  version: 0.0
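The `_permute` variants swap in `process_docs_permute`, which, going by the naming, reorders the answer options so that memorized choice positions stop helping. A hypothetical sketch of such a permutation for ARC-style documents; the function actually shipped here may differ:

```python
# Hypothetical sketch: shuffle the option texts and remap answerKey so the
# doc_to_target expression still resolves to the correct index. The fixed
# seed is an assumption made for reproducibility.
import random
import datasets

def process_arc(dataset: datasets.Dataset) -> datasets.Dataset:
    rng = random.Random(1234)

    def _permute(doc):
        labels = doc["choices"]["label"]
        texts = doc["choices"]["text"]
        order = list(range(len(texts)))
        rng.shuffle(order)
        gold_pos = labels.index(doc["answerKey"])
        return {
            "choices": {
                "label": labels,  # label letters stay; option texts move
                "text": [texts[i] for i in order],
            },
            "answerKey": labels[order.index(gold_pos)],
        }

    return dataset.map(_permute)
```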
---
include: metabench_arc.yaml
task: metabench_arc_secondary
test_split: secondary
metadata:
  version: 0.0
---
include: metabench_arc_permute.yaml
task: metabench_arc_secondary_permute
test_split: secondary
metadata:
  version: 0.0
---
task: metabench_gsm8k
tag:
  - metabench_gsm8k_subset
dataset_path: HCAI/metabench
dataset_name: GSM8K
process_docs: !function process_docs.process_gsm8k
output_type: generate_until
training_split: null
validation_split: null
test_split: primary
doc_to_text: "{{five_shot_preprompt}}Question: {{question}}\nAnswer:"
doc_to_target: "{{answer}}"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: false
    regexes_to_ignore:
      - ","
      - "\\$"
      - "(?s).*#### "
      - "\\.$"
generation_kwargs:
  until:
    - "Question:"
    - "</s>"
    - "<|im_end|>"
  do_sample: false
  temperature: 0.0
repeats: 1
num_fewshot: 0
filter_list:
  - name: "strict-match"
    filter:
      - function: "regex"
        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
      - function: "take_first"
  - name: "flexible-extract"
    filter:
      - function: "regex"
        group_select: -1
        regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
      - function: "take_first"
metadata:
  version: 0.0
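The two filters decide how a free-form completion is reduced to a final answer before `exact_match` is scored. The snippet below replays both regexes in plain `re` on a made-up completion; `group_select: -1` corresponds to taking the last match in the string, dropping empty capture groups:

```python
import re

completion = "She sells 16 - 3 - 4 = 9 duck eggs, so 9 * 2 = $18.\n#### 18"

# strict-match: only the canonical "#### <number>" form counts.
strict = re.findall(r"#### (\-?[0-9\.\,]+)", completion)
print(strict[0] if strict else "[invalid]")  # -> 18

# flexible-extract: the last number-like span anywhere in the completion.
matches = re.findall(r"(-?[$0-9.,]{2,})|(-?[0-9]+)", completion)
answer = next(g for g in matches[-1] if g) if matches else "[invalid]"
print(answer)  # -> 18
```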
---
include: metabench_gsm8k.yaml
task: metabench_gsm8k_secondary
test_split: secondary
metadata:
  version: 0.0
---
task: metabench_hellaswag
tag:
  - metabench_hellaswag_subset
dataset_path: HCAI/metabench
dataset_name: HellaSwag
process_docs: !function process_docs.process_hellaswag
output_type: multiple_choice
training_split: null
validation_split: null
test_split: primary
num_fewshot: 0
doc_to_text: "{{ten_shot_preprompt}}{{query}}"
doc_to_target: "{{label}}"
doc_to_choice: "choices"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
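`doc_to_choice: "choices"` and `doc_to_target: "{{label}}"` mean `process_docs.process_hellaswag` has to materialize `query`, `choices`, and `label` fields (plus `ten_shot_preprompt`). A sketch in the spirit of the harness's standard HellaSwag cleanup, assuming (not confirmed) that the metabench version follows the same recipe:

```python
# Sketch following the harness's usual HellaSwag preprocessing; the
# metabench implementation is assumed, not confirmed, to match it.
import re
import datasets

def _clean(text: str) -> str:
    text = text.strip().replace(" [title]", ". ")
    text = re.sub(r"\[.*?\]", "", text)
    return text.replace("  ", " ")

def process_hellaswag(dataset: datasets.Dataset) -> datasets.Dataset:
    def _process(doc):
        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
        return {
            "query": _clean(doc["activity_label"] + ": " + ctx),
            "choices": [_clean(ending) for ending in doc["endings"]],
            "label": int(doc["label"]),
        }
    return dataset.map(_process)
```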
---
include: metabench_hellaswag.yaml
task: metabench_hellaswag_permute
process_docs: !function process_docs_permute.process_hellaswag
metadata:
  version: 0.0
---
include: metabench_hellaswag.yaml
task: metabench_hellaswag_secondary
test_split: secondary
metadata:
  version: 0.0
---
include: metabench_hellaswag_permute.yaml
task: metabench_hellaswag_secondary_permute
test_split: secondary
metadata:
  version: 0.0
---
task: metabench_mmlu
tag:
  - metabench_mmlu_subset
dataset_path: HCAI/metabench
dataset_name: MMLU
process_docs: !function process_docs.process_mmlu
output_type: multiple_choice
training_split: null
validation_split: null
test_split: primary
num_fewshot: 0
doc_to_text: "{{five_shot_preprompt}}{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
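For reference, this is what the MMLU `doc_to_text` template renders to for a single document (the sample doc is invented for illustration):

```python
from jinja2 import Template

doc = {
    "five_shot_preprompt": "",  # would normally carry five worked examples
    "question": "What is the capital of France?",
    "choices": ["Berlin", "Madrid", "Paris", "Rome"],
    "answer": 2,  # index into doc_to_choice ["A", "B", "C", "D"] -> "C"
}
template = Template(
    "{{five_shot_preprompt}}{{question.strip()}}\n"
    "A. {{choices[0]}}\nB. {{choices[1]}}\n"
    "C. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
)
print(template.render(**doc))
```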
---
include: metabench_mmlu.yaml
task: metabench_mmlu_permute
process_docs: !function process_docs_permute.process_mmlu
metadata:
  version: 0.0
---
include: metabench_mmlu.yaml
task: metabench_mmlu_secondary
test_split: secondary
metadata:
  version: 0.0
---
include: metabench_mmlu_permute.yaml
task: metabench_mmlu_secondary_permute
test_split: secondary
metadata:
  version: 0.0
---
group: metabench_permute
task:
  - metabench_arc_permute
  - metabench_hellaswag_permute
  - metabench_mmlu_permute
  - metabench_truthfulqa_permute
  - metabench_winogrande_permute
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: false
metadata:
  version: 0.0
---
group: metabench_secondary
task:
  - metabench_arc_secondary
  - metabench_gsm8k_secondary
  - metabench_hellaswag_secondary
  - metabench_mmlu_secondary
  - metabench_truthfulqa_secondary
  - metabench_winogrande_secondary
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: false
metadata:
  version: 0.0
---
group: metabench_secondary_permute
task:
  - metabench_arc_secondary_permute
  - metabench_hellaswag_secondary_permute
  - metabench_mmlu_secondary_permute
  - metabench_truthfulqa_secondary_permute
  - metabench_winogrande_secondary_permute
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: false
metadata:
  version: 0.0
---
task: metabench_truthfulqa
tag:
  - metabench_truthfulqa_subset
dataset_path: HCAI/metabench
dataset_name: TruthfulQA
process_docs: !function process_docs.process_truthfulqa
output_type: multiple_choice
training_split: null
validation_split: null
test_split: primary
num_fewshot: 0
doc_to_text: "\
  {% set prompt_qa = '\
  Q: What is human life expectancy in the United States?\n\
  A: Human life expectancy in the United States is 78 years.\n\n\
  Q: Who was president of the United States in 1955?\n\
  A: Dwight D. Eisenhower was president of the United States in 1955.\n\n\
  Q: Which party did he belong to?\n\
  A: He belonged to the Republican Party.\n\n\
  Q: What is the square root of banana?\n\
  A: I have no comment.\n\n\
  Q: How does a telescope work?\n\
  A: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\n\
  Q: Where were the 1992 Olympics held?\n\
  A: The 1992 Olympics were held in Barcelona, Spain.\
  '%}\
  {{prompt_qa + '\n\nQ: ' + question + '\nA:'}}"
doc_to_target: 0
doc_to_choice: "{{mc1_targets.choices}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
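`doc_to_target: 0` works because TruthfulQA's mc1 format always lists the correct choice first, so index 0 is the gold answer; that is also why the `_permute` variant below has to reshuffle the choices and switch `doc_to_target` to a per-document `answer` index. For illustration, an abbreviated mc1 document looks like:

```python
# Abbreviated example of the mc1_targets structure (a real TruthfulQA item,
# choice list truncated): the gold answer sits at index 0.
doc = {
    "question": "What happens to you if you eat watermelon seeds?",
    "mc1_targets": {
        "choices": [
            "The watermelon seeds pass through your digestive system.",  # gold
            "You grow watermelons in your stomach.",
            "You get sick.",
        ],
        "labels": [1, 0, 0],
    },
}
```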
---
include: metabench_truthfulqa.yaml
task: metabench_truthfulqa_permute
process_docs: !function process_docs_permute.process_truthfulqa
doc_to_target: answer
metadata:
  version: 0.0