fixed mmlu generative response extraction (#2503)

* fixed mmlu generative response extraction
* updated file version | added args to exact_match
* fix
* fix
* pre-commit
* fix groups

---------

Co-authored-by: Baber <baber@hey.com>

fixed mmlu generative response extraction (#2503)
* fixed mmlu generative response extraction
* updated file version | added args to exact_match
* fix
* fix
* pre-commit
* fix groups

---------

Co-authored-by: Baber <baber@hey.com>
12b6eeb5 · Ramiro R. C. · GitHub · 88144079 · 12b6eeb5 · 12b6eeb5
Unverified Commit 12b6eeb5 authored Jan 20, 2025 by Ramiro R. C. Committed by GitHub Jan 21, 2025
4 changed files
--- a/lm_eval/tasks/arabicmmlu/_generate_configs.py
+++ b/lm_eval/tasks/arabicmmlu/_generate_configs.py
@@ -13,46 +13,48 @@ from tqdm import tqdm
 eval_logger = logging.getLogger("lm-eval")


-SUBJECTS = {'Islamic Studies': 'humanities',
- 'Driving Test': 'other',
- 'Natural Science (Middle School)': 'stem',
- 'Natural Science (Primary School)': 'stem',
- 'History (Primary School)': 'humanities',
- 'History (Middle School)': 'humanities',
- 'History (High School)': 'humanities',
- 'General Knowledge': 'other',
- 'General Knowledge (Primary School)': 'other',
- 'General Knowledge (Middle School)': 'other',
- 'Law (Professional)': 'humanities',
- 'Physics (High School)': 'stem',
- 'Social Science (Middle School)': 'social_science',
- 'Social Science (Primary School)': 'social_science',
- 'Management (University)': 'other',
- 'Arabic Language (Primary School)': 'language',
- 'Arabic Language (Middle School)': 'language',
- 'Arabic Language (High School)': 'language',
- 'Political Science (University)': 'social_science',
- 'Philosophy (High School)': 'humanities',
- 'Accounting (University)': 'social_science',
- 'Computer Science (University)': 'stem',
- 'Computer Science (Middle School)': 'stem',
- 'Computer Science (Primary School)': 'stem',
- 'Computer Science (High School)': 'stem',
- 'Geography (Primary School)': 'social_science',
- 'Geography (Middle School)': 'social_science',
- 'Geography (High School)': 'social_science',
- 'Math (Primary School)': 'stem',
- 'Biology (High School)': 'stem',
- 'Economics (University)': 'social_science',
- 'Economics (Middle School)': 'social_science',
- 'Economics (High School)': 'social_science',
- 'Arabic Language (General)': 'language',
- 'Arabic Language (Grammar)': 'language',
- 'Islamic Studies (High School)': 'humanities',
- 'Islamic Studies (Middle School)': 'humanities',
- 'Islamic Studies (Primary School)': 'humanities',
- 'Civics (Middle School)': 'social_science',
- 'Civics (High School)': 'social_science'}
+SUBJECTS = {
+    "Islamic Studies": "humanities",
+    "Driving Test": "other",
+    "Natural Science (Middle School)": "stem",
+    "Natural Science (Primary School)": "stem",
+    "History (Primary School)": "humanities",
+    "History (Middle School)": "humanities",
+    "History (High School)": "humanities",
+    "General Knowledge": "other",
+    "General Knowledge (Primary School)": "other",
+    "General Knowledge (Middle School)": "other",
+    "Law (Professional)": "humanities",
+    "Physics (High School)": "stem",
+    "Social Science (Middle School)": "social_science",
+    "Social Science (Primary School)": "social_science",
+    "Management (University)": "other",
+    "Arabic Language (Primary School)": "language",
+    "Arabic Language (Middle School)": "language",
+    "Arabic Language (High School)": "language",
+    "Political Science (University)": "social_science",
+    "Philosophy (High School)": "humanities",
+    "Accounting (University)": "social_science",
+    "Computer Science (University)": "stem",
+    "Computer Science (Middle School)": "stem",
+    "Computer Science (Primary School)": "stem",
+    "Computer Science (High School)": "stem",
+    "Geography (Primary School)": "social_science",
+    "Geography (Middle School)": "social_science",
+    "Geography (High School)": "social_science",
+    "Math (Primary School)": "stem",
+    "Biology (High School)": "stem",
+    "Economics (University)": "social_science",
+    "Economics (Middle School)": "social_science",
+    "Economics (High School)": "social_science",
+    "Arabic Language (General)": "language",
+    "Arabic Language (Grammar)": "language",
+    "Islamic Studies (High School)": "humanities",
+    "Islamic Studies (Middle School)": "humanities",
+    "Islamic Studies (Primary School)": "humanities",
+    "Civics (Middle School)": "social_science",
+    "Civics (High School)": "social_science",
+}


 def parse_args():

--- a/lm_eval/tasks/mmlu/_generate_configs.py
+++ b/lm_eval/tasks/mmlu/_generate_configs.py
+# noqa
 """
 Take in a YAML, and output all "other" splits with this YAML
 """

--- a/lm_eval/tasks/mmlu/generative/_default_template_yaml
+++ b/lm_eval/tasks/mmlu/generative/_default_template_yaml
@@ -14,7 +14,21 @@ metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
+    ignore_punctuation: true
+    ignore_case: true
+filter_list:
+  - name: get_response
+    filter:
+      # Keep only the first line of the response (drop everything after the first line break)
+      - function: "regex"
+        regex_pattern: "^(.*?)(?=\\n|$)"
+      # Remove leading white spaces
+      - function: remove_whitespace
+      # Strip trailing whitespace and line breaks
+      - function: "regex"
+        regex_pattern: "^(.*?)\\s*$"
+      - function: take_first
 metadata:
-  version: 2.0
+  version: 3.0
 dataset_kwargs:
  trust_remote_code: true
--- a/lm_eval/tasks/mmlu/generative/_mmlu.yaml
+++ b/lm_eval/tasks/mmlu/generative/_mmlu.yaml
@@ -5,29 +5,29 @@ task:
    task:
      - mmlu_stem_generative
    aggregate_metric_list:
-      - metric: acc
-        weight_by_size: True
+      - metric: exact_match
+        weight_by_size: true
  - group: other
    task:
      - mmlu_other_generative
    aggregate_metric_list:
-      - metric: acc
-        weight_by_size: True
+      - metric: exact_match
+        weight_by_size: true
  - group: social sciences
    task:
      - mmlu_social_sciences_generative
    aggregate_metric_list:
-      - metric: acc
-        weight_by_size: True
+      - metric: exact_match
+        weight_by_size: true
  - group: humanities
    task:
      - mmlu_humanities_generative
    aggregate_metric_list:
-      - metric: acc
-        weight_by_size: True
+      - metric: exact_match
+        weight_by_size: true
 aggregate_metric_list:
  - aggregation: mean
    metric: exact_match
-    weight_by_size: True
+    weight_by_size: true
 metadata:
-  version: 2
+  version: 3