Unverified commit 12b6eeb5, authored by Ramiro R. C. and committed by GitHub
Browse files

fixed mmlu generative response extraction (#2503)



* fixed mmlu generative response extraction

* updated file version | added args to exact_match

* fix

* fix

* pre-commit

* fix groups

---------
Co-authored-by: Baber <baber@hey.com>
parent 88144079
......@@ -13,46 +13,48 @@ from tqdm import tqdm
eval_logger = logging.getLogger("lm-eval")
SUBJECTS = {'Islamic Studies': 'humanities',
'Driving Test': 'other',
'Natural Science (Middle School)': 'stem',
'Natural Science (Primary School)': 'stem',
'History (Primary School)': 'humanities',
'History (Middle School)': 'humanities',
'History (High School)': 'humanities',
'General Knowledge': 'other',
'General Knowledge (Primary School)': 'other',
'General Knowledge (Middle School)': 'other',
'Law (Professional)': 'humanities',
'Physics (High School)': 'stem',
'Social Science (Middle School)': 'social_science',
'Social Science (Primary School)': 'social_science',
'Management (University)': 'other',
'Arabic Language (Primary School)': 'language',
'Arabic Language (Middle School)': 'language',
'Arabic Language (High School)': 'language',
'Political Science (University)': 'social_science',
'Philosophy (High School)': 'humanities',
'Accounting (University)': 'social_science',
'Computer Science (University)': 'stem',
'Computer Science (Middle School)': 'stem',
'Computer Science (Primary School)': 'stem',
'Computer Science (High School)': 'stem',
'Geography (Primary School)': 'social_science',
'Geography (Middle School)': 'social_science',
'Geography (High School)': 'social_science',
'Math (Primary School)': 'stem',
'Biology (High School)': 'stem',
'Economics (University)': 'social_science',
'Economics (Middle School)': 'social_science',
'Economics (High School)': 'social_science',
'Arabic Language (General)': 'language',
'Arabic Language (Grammar)': 'language',
'Islamic Studies (High School)': 'humanities',
'Islamic Studies (Middle School)': 'humanities',
'Islamic Studies (Primary School)': 'humanities',
'Civics (Middle School)': 'social_science',
'Civics (High School)': 'social_science'}
SUBJECTS = {
"Islamic Studies": "humanities",
"Driving Test": "other",
"Natural Science (Middle School)": "stem",
"Natural Science (Primary School)": "stem",
"History (Primary School)": "humanities",
"History (Middle School)": "humanities",
"History (High School)": "humanities",
"General Knowledge": "other",
"General Knowledge (Primary School)": "other",
"General Knowledge (Middle School)": "other",
"Law (Professional)": "humanities",
"Physics (High School)": "stem",
"Social Science (Middle School)": "social_science",
"Social Science (Primary School)": "social_science",
"Management (University)": "other",
"Arabic Language (Primary School)": "language",
"Arabic Language (Middle School)": "language",
"Arabic Language (High School)": "language",
"Political Science (University)": "social_science",
"Philosophy (High School)": "humanities",
"Accounting (University)": "social_science",
"Computer Science (University)": "stem",
"Computer Science (Middle School)": "stem",
"Computer Science (Primary School)": "stem",
"Computer Science (High School)": "stem",
"Geography (Primary School)": "social_science",
"Geography (Middle School)": "social_science",
"Geography (High School)": "social_science",
"Math (Primary School)": "stem",
"Biology (High School)": "stem",
"Economics (University)": "social_science",
"Economics (Middle School)": "social_science",
"Economics (High School)": "social_science",
"Arabic Language (General)": "language",
"Arabic Language (Grammar)": "language",
"Islamic Studies (High School)": "humanities",
"Islamic Studies (Middle School)": "humanities",
"Islamic Studies (Primary School)": "humanities",
"Civics (Middle School)": "social_science",
"Civics (High School)": "social_science",
}
def parse_args():
......
# noqa
"""
Take in a YAML, and output all "other" splits with this YAML
"""
......
......@@ -14,7 +14,21 @@ metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_punctuation: true
ignore_case: true
filter_list:
- name: get_response
filter:
# Filter everything after the first break line
- function: "regex"
regex_pattern: "^(.*?)(?=\\n|$)"
# Remove leading white spaces
- function: remove_whitespace
# function to ignore right white spaces or line breaks
- function: "regex"
regex_pattern: "^(.*?)\\s*$"
- function: take_first
metadata:
version: 2.0
version: 3.0
dataset_kwargs:
trust_remote_code: true
......@@ -5,29 +5,29 @@ task:
task:
- mmlu_stem_generative
aggregate_metric_list:
- metric: acc
weight_by_size: True
- metric: exact_match
weight_by_size: true
- group: other
task:
- mmlu_other_generative
aggregate_metric_list:
- metric: acc
weight_by_size: True
- metric: exact_match
weight_by_size: true
- group: social sciences
task:
- mmlu_social_sciences_generative
aggregate_metric_list:
- metric: acc
weight_by_size: True
- metric: exact_match
weight_by_size: true
- group: humanities
task:
- mmlu_humanities_generative
aggregate_metric_list:
- metric: acc
weight_by_size: True
- metric: exact_match
weight_by_size: true
aggregate_metric_list:
- aggregation: mean
metric: exact_match
weight_by_size: True
weight_by_size: true
metadata:
version: 2
version: 3
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment