Merge branch 'main' into mathvista

8fada609 · Baber · 0007b74a · 1208afd3 · 8fada609 · 8fada609
Commit 8fada609 authored Jan 29, 2025 by Baber
20 changed files
--- a/lm_eval/models/vllm_causallms.py
+++ b/lm_eval/models/vllm_causallms.py
@@ -76,9 +76,9 @@ class VLLM(TemplateLM):
            )

        assert "cuda" in device or device is None, "vLLM only supports CUDA"
-        assert (
-            max_length is None or max_model_len is None
-        ), "Either max_length or max_model_len may be provided, but not both"
+        assert max_length is None or max_model_len is None, (
+            "Either max_length or max_model_len may be provided, but not both"
+        )

        self._max_length = max_model_len if max_model_len is not None else max_length
        self.tensor_parallel_size = int(tensor_parallel_size)
@@ -142,9 +142,9 @@ class VLLM(TemplateLM):
        self._max_gen_toks = max_gen_toks

        if lora_local_path is not None:
-            assert parse_version(version("vllm")) > parse_version(
-                "0.3.0"
-            ), "lora adapters only compatible with vllm > v0.3.0."
+            assert parse_version(version("vllm")) > parse_version("0.3.0"), (
+                "lora adapters only compatible with vllm > v0.3.0."
+            )
            self.lora_request = LoRARequest("finetuned", 1, lora_local_path)
        else:
            self.lora_request = None

--- a/lm_eval/models/vllm_vlms.py
+++ b/lm_eval/models/vllm_vlms.py
@@ -271,7 +271,9 @@ class VLLM_VLM(VLLM):
                left_truncate_len=max_ctx_len,
            )

-            cont = self._model_generate(inputs, stop=until, generate=True, **kwargs)
+            cont = self._model_generate(
+                inputs, stop=until, generate=True, max_tokens=max_gen_toks, **kwargs
+            )

            for output, context in zip(cont, contexts):
                generated_text = output.outputs[0].text

--- a/lm_eval/tasks/README.md
+++ b/lm_eval/tasks/README.md
@@ -6,7 +6,7 @@
 For more information, including a full list of task names and their precise meanings or sources, follow the links provided to the individual README.md files for each subfolder.

 | Task Family                                                              | Description | Language(s)                                                                                                                   |
-|-------------|-------------|-------------|
+|--------------------------------------------------------------------------|-------------|-------------------------------------------------------------------------------------------------------------------------------|
 | [aclue](aclue/README.md)                                                 | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese                                                                                                               |
 | [aexams](aexams/README.md)                                               | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic                                                                                                                        |
 | [agieval](agieval/README.md)                                             | Tasks involving historical data or questions related to history and historical texts. | English, Chinese                                                                                                              |
@@ -44,7 +44,7 @@
 | [eus_trivia](eus_trivia/README.md)                                       | Trivia and knowledge testing tasks in the Basque language. | Basque                                                                                                                        |
 | [fda](fda/README.md)                                                     | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English                                                                                                                       |
 | [fld](fld/README.md)                                                     | Tasks involving free-form and directed dialogue understanding. | English                                                                                                                       |
-| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French|
+| [french_bench](french_bench/README.md)                                   | Set of tasks designed to assess language model performance in French. | French                                                                                                                        |
 | [galician_bench](galician_bench/README.md)                               | Collection of tasks in Galician encompassing various evaluation areas. | Galician                                                                                                                      |
 | [global_mmlu](global_mmlu/README.md)                                     | Collection of culturally sensitive and culturally agnostic MMLU tasks in 15 languages with human translations or post-edits. | Multiple (15 languages)                                                                                                       |
 | [glue](glue/README.md)                                                   | General Language Understanding Evaluation benchmark to test broad language abilities. | English                                                                                                                       |
@@ -55,6 +55,8 @@
 | [hellaswag](hellaswag/README.md)                                         | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English                                                                                                                       |
 | [hendrycks_ethics](hendrycks_ethics/README.md)                           | Tasks designed to evaluate the ethical reasoning capabilities of models. | English                                                                                                                       |
 | [hendrycks_math](hendrycks_math/README.md)                               | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English                                                                                                                       |
+| [histoires_morales](histoires_morales/README.md)                         | A dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | French (Some MT)                                                                                                              |
+| [hrm8k](hrm8k/README.md)                                                 | A challenging bilingual math reasoning benchmark for Korean and English. | Korean (Some MT), English (Some MT)                                                                                           |
 | [humaneval](humaneval/README.md)                                         | Code generation task that measure functional correctness for synthesizing programs from docstrings. | Python                                                                                                                        |
 | [ifeval](ifeval/README.md)                                               | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English                                                                                                                       |
 | [inverse_scaling](inverse_scaling/README.md)                             | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English                                                                                                                       |
@@ -85,6 +87,7 @@
 | [mmlu_pro](mmlu_pro/README.md)                                           | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English                                                                                                                       |
 | [mmlusr](mmlusr/README.md)                                               | Variation of MMLU designed to be more rigorous. | English                                                                                                                       |
 | model_written_evals                                                      | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. |                                                                                                                               |
+| [moral_stories](moral_stories/README.md)                                 | A crowd-sourced dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | English                                                                                                                       |
 | [mutual](mutual/README.md)                                               | A retrieval-based dataset for multi-turn dialogue reasoning. | English                                                                                                                       |
 | [nq_open](nq_open/README.md)                                             | Open domain question answering tasks based on the Natural Questions dataset. | English                                                                                                                       |
 | [okapi/arc_multilingual](okapi/arc_multilingual/README.md)               | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.**                                                                               |

--- a/lm_eval/tasks/arabicmmlu/_arabicmmlu.yaml
+++ b/lm_eval/tasks/arabicmmlu/_arabicmmlu.yaml
@@ -9,4 +9,4 @@ aggregate_metric_list:
  - metric: acc
    weight_by_size: True
 metadata:
-  version: 0
+  version: 1
--- a/lm_eval/tasks/arabicmmlu/_arabicmmlu_humanities.yaml
+++ b/lm_eval/tasks/arabicmmlu/_arabicmmlu_humanities.yaml
@@ -6,4 +6,4 @@ aggregate_metric_list:
  - metric: acc
    weight_by_size: True
 metadata:
-  version: 0
+  version: 1
--- a/lm_eval/tasks/arabicmmlu/_arabicmmlu_language.yaml
+++ b/lm_eval/tasks/arabicmmlu/_arabicmmlu_language.yaml
@@ -6,4 +6,4 @@ aggregate_metric_list:
  - metric: acc
    weight_by_size: True
 metadata:
-  version: 0
+  version: 1
--- a/lm_eval/tasks/arabicmmlu/_arabicmmlu_other.yaml
+++ b/lm_eval/tasks/arabicmmlu/_arabicmmlu_other.yaml
@@ -6,4 +6,4 @@ aggregate_metric_list:
  - metric: acc
    weight_by_size: True
 metadata:
-  version: 0
+  version: 1
--- a/lm_eval/tasks/arabicmmlu/_arabicmmlu_social_science.yaml
+++ b/lm_eval/tasks/arabicmmlu/_arabicmmlu_social_science.yaml
@@ -6,4 +6,4 @@ aggregate_metric_list:
  - metric: acc
    weight_by_size: True
 metadata:
-  version: 0
+  version: 1
--- a/lm_eval/tasks/arabicmmlu/_arabicmmlu_stem.yaml
+++ b/lm_eval/tasks/arabicmmlu/_arabicmmlu_stem.yaml
@@ -6,4 +6,4 @@ aggregate_metric_list:
  - metric: acc
    weight_by_size: True
 metadata:
-  version: 0
+  version: 1
--- a/lm_eval/tasks/arabicmmlu/_default_arabicmmlu_template_yaml
+++ b/lm_eval/tasks/arabicmmlu/_default_arabicmmlu_template_yaml
-dataset_path: yazeed7/ArabicMMLU
+dataset_path: MBZUAI/ArabicMMLU
 test_split: test
 fewshot_split: dev
 fewshot_config:
@@ -12,4 +12,4 @@ metric_list:
    aggregation: mean
    higher_is_better: true
 metadata:
-  version: 0.0
+  version: 1.0
--- a/lm_eval/tasks/arabicmmlu/_generate_configs.py
+++ b/lm_eval/tasks/arabicmmlu/_generate_configs.py
@@ -14,46 +14,46 @@ eval_logger = logging.getLogger("lm-eval")


 SUBJECTS = {
-    "Driving Test": "other",
-    "High Geography": "social_science",
-    "High History": "humanities",
    "Islamic Studies": "humanities",
-    "Univ Accounting": "social_science",
-    "Primary General Knowledge": "other",
-    "Univ Political Science": "social_science",
-    "Primary Math": "stem",
-    "Middle General Knowledge": "other",
-    "High Biology": "stem",
-    "Primary Natural Science": "stem",
-    "High Economics": "social_science",
-    "Middle Natural Science": "stem",
-    "Middle Geography": "social_science",
-    "Primary Social Science": "social_science",
-    "Middle Computer Science": "stem",
-    "Middle Islamic Studies": "humanities",
-    "Primary Computer Science": "stem",
-    "High Physics": "stem",
-    "Middle Social Science": "social_science",
-    "Middle Civics": "social_science",
-    "High Computer Science": "stem",
+    "Driving Test": "other",
+    "Natural Science (Middle School)": "stem",
+    "Natural Science (Primary School)": "stem",
+    "History (Primary School)": "humanities",
+    "History (Middle School)": "humanities",
+    "History (High School)": "humanities",
    "General Knowledge": "other",
-    "High Civics": "social_science",
-    "Prof Law": "humanities",
-    "High Islamic Studies": "humanities",
-    "Primary Arabic Language": "language",
-    "High Arabic Language": "language",
-    "Arabic Language (Grammar)": "language",
-    "Primary History": "humanities",
-    "Middle History": "humanities",
-    "Univ Economics": "social_science",
+    "General Knowledge (Primary School)": "other",
+    "General Knowledge (Middle School)": "other",
+    "Law (Professional)": "humanities",
+    "Physics (High School)": "stem",
+    "Social Science (Middle School)": "social_science",
+    "Social Science (Primary School)": "social_science",
+    "Management (University)": "other",
+    "Arabic Language (Primary School)": "language",
+    "Arabic Language (Middle School)": "language",
+    "Arabic Language (High School)": "language",
+    "Political Science (University)": "social_science",
+    "Philosophy (High School)": "humanities",
+    "Accounting (University)": "social_science",
+    "Computer Science (University)": "stem",
+    "Computer Science (Middle School)": "stem",
+    "Computer Science (Primary School)": "stem",
+    "Computer Science (High School)": "stem",
+    "Geography (Primary School)": "social_science",
+    "Geography (Middle School)": "social_science",
+    "Geography (High School)": "social_science",
+    "Math (Primary School)": "stem",
+    "Biology (High School)": "stem",
+    "Economics (University)": "social_science",
+    "Economics (Middle School)": "social_science",
+    "Economics (High School)": "social_science",
    "Arabic Language (General)": "language",
-    "Univ Computer Science": "stem",
-    "Primary Islamic Studies": "humanities",
-    "Primary Geography": "social_science",
-    "High Philosophy": "humanities",
-    "Middle Arabic Language": "language",
-    "Middle Economics": "social_science",
-    "Univ Management": "other",
+    "Arabic Language (Grammar)": "language",
+    "Islamic Studies (High School)": "humanities",
+    "Islamic Studies (Middle School)": "humanities",
+    "Islamic Studies (Primary School)": "humanities",
+    "Civics (Middle School)": "social_science",
+    "Civics (High School)": "social_science",
 }


@@ -69,8 +69,9 @@ if __name__ == "__main__":

    # get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
    base_yaml_name = os.path.split(args.base_yaml_path)[-1]
-    with open(args.base_yaml_path, encoding="utf-8") as f:
-        base_yaml = yaml.full_load(f)
+
+    # with open(args.base_yaml_path, encoding="utf-8") as f:
+    #     base_yaml = yaml.full_load(f)

    ALL_CATEGORIES = []
    for subject, category in tqdm(SUBJECTS.items()):
@@ -81,8 +82,8 @@ if __name__ == "__main__":

        yaml_dict = {
            "include": base_yaml_name,
-            "tag": f"arabicmmlu_{category}",
-            "task": f"arabicmmlu_{subject.lower().replace(' ', '_')}",
+            "tag": f"arabicmmlu_{category}_tasks",
+            "task": f"arabicmmlu_{subject.lower().replace(' ', '_').replace('(', '').replace(')', '')}",
            "task_alias": subject,
            "dataset_name": subject,
            # "description": description,

--- a/lm_eval/tasks/arabicmmlu/arabicmmlu_middle_civics.yaml
+++ b/lm_eval/tasks/arabicmmlu/arabicmmlu_middle_civics.yaml
-"dataset_name": "Middle Civics"
-"tag": "arabicmmlu_social_science_tasks"
+"dataset_name": "Accounting (University)"
 "include": "_default_arabicmmlu_template_yaml"
-"task": "arabicmmlu_middle_civics"
-"task_alias": "Middle Civics"
+"tag": "arabicmmlu_social_science_tasks"
+"task": "arabicmmlu_accounting_university"
+"task_alias": "Accounting (University)"
--- a/lm_eval/tasks/arabicmmlu/arabicmmlu_arabic_language_general.yaml
+++ b/lm_eval/tasks/arabicmmlu/arabicmmlu_arabic_language_general.yaml
 "dataset_name": "Arabic Language (General)"
-"tag": "arabicmmlu_language_tasks"
 "include": "_default_arabicmmlu_template_yaml"
-"task": "arabicmmlu_arabic_language_(general)"
+"tag": "arabicmmlu_language_tasks"
+"task": "arabicmmlu_arabic_language_general"
 "task_alias": "Arabic Language (General)"
--- a/lm_eval/tasks/arabicmmlu/arabicmmlu_arabic_language_grammar.yaml
+++ b/lm_eval/tasks/arabicmmlu/arabicmmlu_arabic_language_grammar.yaml
 "dataset_name": "Arabic Language (Grammar)"
-"tag": "arabicmmlu_language_tasks"
 "include": "_default_arabicmmlu_template_yaml"
-"task": "arabicmmlu_arabic_language_(grammar)"
+"tag": "arabicmmlu_language_tasks"
+"task": "arabicmmlu_arabic_language_grammar"
 "task_alias": "Arabic Language (Grammar)"
--- a/lm_eval/tasks/arabicmmlu/arabicmmlu_high_arabic_language.yaml
+++ b/lm_eval/tasks/arabicmmlu/arabicmmlu_high_arabic_language.yaml
-"dataset_name": "High Arabic Language"
-"tag": "arabicmmlu_language_tasks"
+"dataset_name": "Arabic Language (High School)"
 "include": "_default_arabicmmlu_template_yaml"
-"task": "arabicmmlu_high_arabic_language"
-"task_alias": "High Arabic Language"
+"tag": "arabicmmlu_language_tasks"
+"task": "arabicmmlu_arabic_language_high_school"
+"task_alias": "Arabic Language (High School)"
--- a/lm_eval/tasks/arabicmmlu/arabicmmlu_middle_arabic_language.yaml
+++ b/lm_eval/tasks/arabicmmlu/arabicmmlu_middle_arabic_language.yaml
-"dataset_name": "Middle Arabic Language"
-"tag": "arabicmmlu_language_tasks"
+"dataset_name": "Arabic Language (Middle School)"
 "include": "_default_arabicmmlu_template_yaml"
-"task": "arabicmmlu_middle_arabic_language"
-"task_alias": "Middle Arabic Language"
+"tag": "arabicmmlu_language_tasks"
+"task": "arabicmmlu_arabic_language_middle_school"
+"task_alias": "Arabic Language (Middle School)"
--- a/lm_eval/tasks/arabicmmlu/arabicmmlu_primary_arabic_language.yaml
+++ b/lm_eval/tasks/arabicmmlu/arabicmmlu_primary_arabic_language.yaml
-"dataset_name": "Primary Arabic Language"
-"tag": "arabicmmlu_language_tasks"
+"dataset_name": "Arabic Language (Primary School)"
 "include": "_default_arabicmmlu_template_yaml"
-"task": "arabicmmlu_primary_arabic_language"
-"task_alias": "Primary Arabic Language"
+"tag": "arabicmmlu_language_tasks"
+"task": "arabicmmlu_arabic_language_primary_school"
+"task_alias": "Arabic Language (Primary School)"
--- a/lm_eval/tasks/arabicmmlu/arabicmmlu_high_physics.yaml
+++ b/lm_eval/tasks/arabicmmlu/arabicmmlu_high_physics.yaml
-"dataset_name": "High Physics"
-"tag": "arabicmmlu_stem_tasks"
+"dataset_name": "Biology (High School)"
 "include": "_default_arabicmmlu_template_yaml"
-"task": "arabicmmlu_high_physics"
-"task_alias": "High Physics"
+"tag": "arabicmmlu_stem_tasks"
+"task": "arabicmmlu_biology_high_school"
+"task_alias": "Biology (High School)"
--- a/lm_eval/tasks/arabicmmlu/arabicmmlu_high_economics.yaml
+++ b/lm_eval/tasks/arabicmmlu/arabicmmlu_high_economics.yaml
-"dataset_name": "High Economics"
-"tag": "arabicmmlu_social_science_tasks"
+"dataset_name": "Civics (High School)"
 "include": "_default_arabicmmlu_template_yaml"
-"task": "arabicmmlu_high_economics"
-"task_alias": "High Economics"
+"tag": "arabicmmlu_social_science_tasks"
+"task": "arabicmmlu_civics_high_school"
+"task_alias": "Civics (High School)"
--- a/lm_eval/tasks/arabicmmlu/arabicmmlu_high_geography.yaml
+++ b/lm_eval/tasks/arabicmmlu/arabicmmlu_high_geography.yaml
-"dataset_name": "High Geography"
-"tag": "arabicmmlu_social_science_tasks"
+"dataset_name": "Civics (Middle School)"
 "include": "_default_arabicmmlu_template_yaml"
-"task": "arabicmmlu_high_geography"
-"task_alias": "High Geography"
+"tag": "arabicmmlu_social_science_tasks"
+"task": "arabicmmlu_civics_middle_school"
+"task_alias": "Civics (Middle School)"