Unverified Commit 9822b06e authored by Lintang Sutawika, committed by GitHub

Merge branch 'main' into weight_by_size

parents 51f27158 b177c82c
"dataset_name": "professional_accounting"
"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
"include": "_default_template_yaml"
"task": "ammlu_professional_accounting"
"dataset_name": "professional_law"
"description": "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_professional_law"
"dataset_name": "professional_medicine"
"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
"include": "_default_template_yaml"
"task": "ammlu_professional_medicine"
"dataset_name": "professional_psychology"
"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_professional_psychology"
"dataset_name": "public_relations"
"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_public_relations"
"dataset_name": "security_studies"
"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_security_studies"
"dataset_name": "sociology"
"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_sociology"
"dataset_name": "us_foreign_policy"
"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_us_foreign_policy"
"dataset_name": "virology"
"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
"include": "_default_template_yaml"
"task": "ammlu_virology"
"dataset_name": "world_religions"
"description": "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
"include": "_default_template_yaml"
"task": "ammlu_world_religions"
......@@ -38,7 +38,7 @@ Homepage: https://allenai.org/data/arc
#### Tasks
* `arc_easy`
* `arc_challange`
* `arc_challenge`
### Checklist
......
"""
Take in a base YAML template and output the task YAMLs for all other splits derived from it.
"""
import argparse
import os
import re

import datasets
import requests
import yaml
from tqdm import tqdm
from lm_eval import utils
......
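Only the docstring and imports of the generation script survive in this hunk. For context, here is a minimal sketch of how such a script could emit the per-subject YAMLs shown earlier; the SUBJECTS mapping, flags, and output layout are assumptions for illustration, not the harness's actual _generate_configs.py logic:

# Sketch of a per-subject config generator in the spirit of the script above.
# SUBJECTS and its category strings are illustrative assumptions.
import argparse
import os

import yaml

SUBJECTS = {
    "professional_accounting": "علوم أخرى",
    "sociology": "العلوم الإجتماعية",
}

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_yaml_path", default="_default_template_yaml")
    parser.add_argument("--save_dir", default=".")
    parser.add_argument("--task_prefix", default="ammlu")
    args = parser.parse_args()

    for subject, category in SUBJECTS.items():
        task_yaml = {
            "include": args.base_yaml_path,
            "task": f"{args.task_prefix}_{subject}",
            "dataset_name": subject,
            "description": f"قم بعملية التقييم في مجال {category} \n\n",
        }
        out_path = os.path.join(args.save_dir, f"{args.task_prefix}_{subject}.yaml")
        # Write one task YAML per subject, preserving the Arabic description.
        with open(out_path, "w", encoding="utf-8") as f:
            yaml.dump(task_yaml, f, allow_unicode=True, default_flow_style=False)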
......@@ -28,3 +28,4 @@ filter_list:
num_fewshot: 0
metadata:
  version: 2.0
  num_fewshot: 3 # controls the n-shot value reported in the results table
......@@ -7,21 +7,21 @@ metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    # ignore_case: true
    ignore_case: true
    # ignore_punctuation: true
    regexes_to_ignore:
      - "\\.$"
      - ","
      - "\\\\"
      - "\n"
      - '"'
generation_kwargs:
  until:
    - "</s>"
    - "Q"
    - "\n\n"
    - "Q:"
    - "<|im_end|>"
  do_sample: false
  temperature: 0.0
filter_list:
  - name: "get-answer"
    filter:
      - function: "regex"
        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
      - function: "take_first"
num_fewshot: 0
metadata:
  version: 1.0
  version: 2.0
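The template changes above uncomment ignore_case for exact_match, replace the bare "Q" stop sequence with "Q:", and bump the version to 2.0. To make the scoring path concrete, the following standalone sketch re-implements with plain re what the "get-answer" regex filter plus the exact_match normalization settings (ignore_case, regexes_to_ignore) do to a model completion; it is an illustration, not the harness's own filter code:

# Plain-re illustration of the template's answer extraction and normalization.
import re

ANSWER_PATTERN = (
    r"((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)"
    r"|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
)
REGEXES_TO_IGNORE = [r"\.$", ",", r"\\", "\n", '"']

def extract_answer(completion: str) -> str:
    # "regex" filter followed by "take_first": keep the first match.
    matches = re.findall(ANSWER_PATTERN, completion)
    return matches[0][0] if matches else completion

def normalize(text: str) -> str:
    # Mirrors exact_match with ignore_case plus regexes_to_ignore applied.
    for pattern in REGEXES_TO_IGNORE:
        text = re.sub(pattern, "", text)
    return text.lower().strip()

completion = "Let's think step by step. 2 + 2 = 4. The answer is 4."
print(normalize(extract_answer(completion)))  # -> "4"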
"dataset_name": "boolean_expressions"
"description": "Evaluate the result of a random Boolean expression.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_boolean_expressions"
filter_list:
  - name: "flexible-extract"
    filter:
      - function: "regex"
        group_select: -1
        regex_pattern: "\\b(True|False)\\b"
      - function: "take_first"
  - name: "strict-match"
    filter:
      - function: "regex"
        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
      - function: "take_first"
"dataset_name": "causal_judgement"
"description": "Answer questions about causal attribution.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_causal_judgement"
filter_list:
  - name: "flexible-extract"
    filter:
      - function: "regex"
        group_select: -1
        regex_pattern: "\\b(Yes|No|yes|no)\\b"
      - function: "take_first"
  - name: "strict-match"
    filter:
      - function: "regex"
        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
      - function: "take_first"
"dataset_name": "date_understanding"
"description": "Infer the date from context.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_date_understanding"
filter_list:
  - name: "flexible-extract"
    filter:
      - function: !function utils.MultiChoiceRegexFilter
        group_select: -1
        ignore_case: true
        ignore_punctuation: true
        regex_pattern: "(\\([A-Z]\\))"
      - function: "take_first"
  - name: "strict-match"
    filter:
      - function: "regex"
        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
      - function: "take_first"
"dataset_name": "disambiguation_qa"
"description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_disambiguation_qa"
filter_list:
  - name: "flexible-extract"
    filter:
      - function: !function utils.MultiChoiceRegexFilter
        group_select: -1
        ignore_case: true
        ignore_punctuation: true
        regex_pattern: "(\\([A-Z]\\))"
      - function: "take_first"
  - name: "strict-match"
    filter:
      - function: "regex"
        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
      - function: "take_first"
"dataset_name": "dyck_languages"
"description": "Correctly close a Dyck-n word.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_dyck_languages"
filter_list:
  - name: "flexible-extract"
    filter:
      - function: "regex"
        group_select: -1
        regex_pattern: "(?<= )([\" \\[\\(<{}>\\)\\]]+)|([\" \\[\\(<{}>\\)\\]]+)"
      - function: "take_first"
  - name: "strict-match"
    filter:
      - function: "regex"
        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
      - function: "take_first"
"dataset_name": "formal_fallacies"
"description": "Distinguish deductively valid arguments from formal fallacies.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_formal_fallacies"
filter_list:
  - name: "flexible-extract"
    filter:
      - function: "regex"
        group_select: -1
        regex_pattern: "\\b(valid|invalid)\\b"
      - function: "take_first"
  - name: "strict-match"
    filter:
      - function: "regex"
        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
      - function: "take_first"