Unverified commit 9822b06e, authored by Lintang Sutawika and committed by GitHub
Browse files

Merge branch 'main' into weight_by_size

parents 51f27158 b177c82c
"dataset_name": "professional_accounting"
"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
"include": "_default_template_yaml"
"task": "ammlu_professional_accounting"
"dataset_name": "professional_law"
"description": "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_professional_law"
"dataset_name": "professional_medicine"
"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
"include": "_default_template_yaml"
"task": "ammlu_professional_medicine"
"dataset_name": "professional_psychology"
"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_professional_psychology"
"dataset_name": "public_relations"
"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_public_relations"
"dataset_name": "security_studies"
"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_security_studies"
"dataset_name": "sociology"
"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_sociology"
"dataset_name": "us_foreign_policy"
"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_us_foreign_policy"
"dataset_name": "virology"
"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
"include": "_default_template_yaml"
"task": "ammlu_virology"
"dataset_name": "world_religions"
"description": "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_world_religions"
...@@ -38,7 +38,7 @@ Homepage: https://allenai.org/data/arc ...@@ -38,7 +38,7 @@ Homepage: https://allenai.org/data/arc
#### Tasks #### Tasks
* `arc_easy` * `arc_easy`
* `arc_challange` * `arc_challenge`
### Checklist ### Checklist
......
""" """
Take in a YAML, and output all other splits with this YAML Take in a YAML, and output all other splits with this YAML
""" """
import argparse
import os import os
import re import re
import yaml
import requests
import argparse
import datasets import datasets
import requests
import yaml
from tqdm import tqdm from tqdm import tqdm
from lm_eval import utils from lm_eval import utils
......
...@@ -28,3 +28,4 @@ filter_list: ...@@ -28,3 +28,4 @@ filter_list:
num_fewshot: 0 num_fewshot: 0
metadata: metadata:
version: 2.0 version: 2.0
num_fewshot: 3 # controls what is printed in n-shot
...@@ -7,21 +7,21 @@ metric_list: ...@@ -7,21 +7,21 @@ metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
# ignore_case: true ignore_case: true
# ignore_punctuation: true # ignore_punctuation: true
regexes_to_ignore:
- "\\.$"
- ","
- "\\\\"
- "\n"
- '"'
generation_kwargs: generation_kwargs:
until: until:
- "</s>" - "</s>"
- "Q" - "Q:"
- "\n\n" - "<|im_end|>"
do_sample: false do_sample: false
temperature: 0.0 temperature: 0.0
filter_list:
- name: "get-answer"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
num_fewshot: 0 num_fewshot: 0
metadata: metadata:
version: 1.0 version: 2.0
"dataset_name": "boolean_expressions" "dataset_name": "boolean_expressions"
"description": "Evaluate the result of a random Boolean expression.\n\n" "description": "Evaluate the result of a random Boolean expression.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_boolean_expressions" "task": "bbh_cot_zeroshot_boolean_expressions"
filter_list:
- name: "flexible-extract"
filter:
- function: "regex"
group_select: -1
regex_pattern: "\\b(True|False)\\b"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "causal_judgement" "dataset_name": "causal_judgement"
"description": "Answer questions about causal attribution.\n\n" "description": "Answer questions about causal attribution.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_causal_judgement" "task": "bbh_cot_zeroshot_causal_judgement"
filter_list:
- name: "flexible-extract"
filter:
- function: "regex"
group_select: -1
regex_pattern: "\\b(Yes|No|yes|no)\\b"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "date_understanding" "dataset_name": "date_understanding"
"description": "Infer the date from context.\n\n" "description": "Infer the date from context.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_date_understanding" "task": "bbh_cot_zeroshot_date_understanding"
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "disambiguation_qa" "dataset_name": "disambiguation_qa"
"description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n" "description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_disambiguation_qa" "task": "bbh_cot_zeroshot_disambiguation_qa"
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "dyck_languages" "dataset_name": "dyck_languages"
"description": "Correctly close a Dyck-n word.\n\n" "description": "Correctly close a Dyck-n word.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_dyck_languages" "task": "bbh_cot_zeroshot_dyck_languages"
filter_list:
- name: "flexible-extract"
filter:
- function: "regex"
group_select: -1
regex_pattern: "(?<= )([\" \\[\\(<{}>\\)\\]]+)|([\" \\[\\(<{}>\\)\\]]+)"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "formal_fallacies" "dataset_name": "formal_fallacies"
"description": "Distinguish deductively valid arguments from formal fallacies.\n\n" "description": "Distinguish deductively valid arguments from formal fallacies.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_formal_fallacies" "task": "bbh_cot_zeroshot_formal_fallacies"
filter_list:
- name: "flexible-extract"
filter:
- function: "regex"
group_select: -1
regex_pattern: "\\b(valid|invalid)\\b"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment