Merge branch 'main' into mathvista

# Conflicts: # lm_eval/models/openai_completions.py

Merge branch 'main' into mathvista
# Conflicts: # lm_eval/models/openai_completions.py
2106fbeb · Baber · 4354fe46 · 703fbffd · 2106fbeb · 2106fbeb
Commit 2106fbeb authored Jan 15, 2025 by Baber
20 changed files
--- a/lm_eval/models/vllm_vlms.py
+++ b/lm_eval/models/vllm_vlms.py
@@ -7,7 +7,12 @@ from tqdm import tqdm

 from lm_eval.api.instance import Instance
 from lm_eval.api.registry import register_model
-from lm_eval.models.utils import Collator, replace_placeholders, undistribute
+from lm_eval.models.utils import (
+    Collator,
+    handle_stop_sequences,
+    replace_placeholders,
+    undistribute,
+)
 from lm_eval.models.vllm_causallms import VLLM
 from lm_eval.utils import eval_logger

@@ -139,7 +144,9 @@ class VLLM_VLM(VLLM):
            )
        return outputs

-    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
+    def apply_chat_template(
+        self, chat_history: List[Dict[str, str]], add_generation_prompt=True
+    ) -> str:
        self.chat_applied = True
        if not self.interleave:
            for content in chat_history:
@@ -189,7 +196,9 @@ class VLLM_VLM(VLLM):
                    )

        return self.processor.apply_chat_template(
-            chat_history, add_generation_prompt=True
+            chat_history,
+            add_generation_prompt=add_generation_prompt,
+            continue_final_message=not add_generation_prompt,
        )

    def generate_until(
@@ -225,7 +234,7 @@ class VLLM_VLM(VLLM):
            group_fn=lambda x: x[1],
        )
        chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
-
+        eos = self.tokenizer.decode(self.eot_token_id)
        for chunk in chunks:
            contexts, all_gen_kwargs, aux_arguments = zip(*chunk)

@@ -241,27 +250,14 @@ class VLLM_VLM(VLLM):
            # this is safe to assume because the `grouper` object ensures it.
            gen_kwargs = all_gen_kwargs[0]
            # unpack our keyword arguments.
-            until = None
            if isinstance(gen_kwargs, dict):
                kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
-                if "until" in kwargs.keys():
-                    until = kwargs.pop("until")
-                    if isinstance(until, str):
-                        until = [until]
-                    elif not isinstance(until, list):
-                        raise ValueError(
-                            f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
-                        )
+                # add EOS token to stop sequences
+                until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
            else:
                raise ValueError(
                    f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                )
-            # add EOS token to stop sequences
-            eos = self.tokenizer.decode(self.eot_token_id)
-            if not until:
-                until = [eos]
-            else:
-                until.append(eos)
            if "max_gen_toks" in kwargs.keys():
                max_gen_toks = kwargs.pop("max_gen_toks")
            else:

--- a/lm_eval/prompts/__init__.py
+++ b/lm_eval/prompts/__init__.py
@@ -29,8 +29,8 @@ def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None
    if category_name == "promptsource":
        try:
            from promptsource.templates import DatasetTemplates
-        except ModuleNotFoundError:
-            raise Exception(
+        except ModuleNotFoundError as exception:
+            raise type(exception)(
                "Tried to load a Promptsource template, but promptsource is not installed ",
                "please install promptsource via pip install lm-eval[promptsource] or pip install -e .[promptsource]",
            )
@@ -118,7 +118,7 @@ class PromptString:

        # TODO need a way to process doc_to_choice
        if "doc_to_choice" in self.prompt_string:
-            raise Exception("Not yet implemented to accept doc_to_choice")
+            raise NotImplementedError("Not yet implemented to accept doc_to_choice")

        text_string = utils.apply_template(doc_to_text, doc)
        target_string = utils.apply_template(doc_to_target, doc)

--- a/lm_eval/tasks/README.md
+++ b/lm_eval/tasks/README.md
@@ -14,6 +14,7 @@
 | [arabic_leaderboard_complete](arabic_leaderboard_complete/README.md) | A full version of the tasks in the Open Arabic LLM Leaderboard, focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
 | [arabic_leaderboard_light](arabic_leaderboard_light/README.md) | A light version of the tasks in the Open Arabic LLM Leaderboard (i.e., 10% samples of the test set in the original benchmarks), focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
 | [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
+| [AraDiCE](aradice/README.md) | A collection of multiple tasks carefully designed to evaluate dialectal and cultural capabilities in large language models (LLMs). | Arabic |
 | [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions.  | English |
 | [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
 | [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
@@ -45,6 +46,7 @@
 | [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English |
 | [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French|
 | [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. | Galician |
+| [global_mmlu](global_mmlu/README.md) | Collection of culturally sensitive and culturally agnostic MMLU tasks in 15 languages with human translations or post-edits. | Multiple (15 languages) |
 | [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
 | [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English |
 | [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
@@ -53,8 +55,11 @@
 | [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English |
 | [hendrycks_ethics](hendrycks_ethics/README.md)     | Tasks designed to evaluate the ethical reasoning capabilities of models. | English |
 | [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
+| [humaneval](humaneval/README.md) | Code generation task that measures functional correctness for synthesizing programs from docstrings. | Python |
 | [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English |
 | [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English |
+| [japanese_leaderboard](japanese_leaderboard/README.md) | Japanese language understanding tasks to benchmark model performance on various linguistic aspects. | Japanese |
+| [kbl](kbl/README.md) | Korean Benchmark for Legal Language Understanding. | Korean |
 | [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean |
 | [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean |
 | [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
@@ -67,13 +72,17 @@
 | [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
 | [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
 | [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
+| [mbpp](mbpp/README.md) | A benchmark designed to measure the ability to synthesize short Python programs from natural language descriptions. | Python |
 | [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English |
 | [med_concepts_qa](med_concepts_qa/README.md) | Benchmark for evaluating LLMs on their abilities to interpret medical codes and distinguish between medical concept. | English |
+| [metabench](metabench/README.md) | Distilled versions of six popular benchmarks which are highly predictive of overall benchmark performance and of a single general ability latent trait. | English |
 | medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English |
 | medqa | Multiple choice question answering based on the United States Medical License Exams. | |
 | [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu |
 | [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English |
-| mmlu | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
+| [mlqa](mlqa/README.md) | MultiLingual Question Answering benchmark dataset for evaluating cross-lingual question answering performance. | English, Arabic, German, Spanish, Hindi, Vietnamese, Simplified Chinese |
+| [mmlu](mmlu/README.md) | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
+| [mmlu_pro](mmlu_pro/README.md) | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English |
 | [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English |
 | model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
 | [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English |
@@ -97,6 +106,7 @@
 | [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English |
 | realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. | |
 | [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English |
+| [score](score/README.md) | Systematic consistency and robustness evaluation for LLMs on 3 datasets (MMLU-Pro, AGIEval, and MATH). | English |
 | [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English |
 | [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning.  | English |
 | [spanish_bench](spanish_bench/README.md) | Collection of tasks in Spanish encompassing various evaluation areas. | Spanish |
@@ -124,5 +134,6 @@
 | [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese |
 | [xnli](xnli/README.md) | Cross-Lingual Natural Language Inference to test understanding across different languages. | Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese |
 | [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque |
+| [xquad](xquad/README.md) | Cross-lingual Question Answering Dataset in multiple languages. | Arabic, German, Greek, English, Spanish, Hindi, Romanian, Russian, Thai, Turkish, Vietnamese, Chinese |
 | [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese |
 | [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese |
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU.yaml
+group: AraDiCE_ArabicMMLU_egy
+task:
+- AraDiCE_ArabicMMLU_humanities_egy
+- AraDiCE_ArabicMMLU_language_egy
+- AraDiCE_ArabicMMLU_social-science_egy
+- AraDiCE_ArabicMMLU_stem_egy
+- AraDiCE_ArabicMMLU_other_egy
+aggregate_metric_list:
+  - metric: acc
+    weight_by_size: True
+  - metric: acc_norm
+    weight_by_size: True
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_humanities_history.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_humanities_history.yaml
+"dataset_name": "high_humanities_history"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_humanities_egy"
+"task": "AraDiCE_ArabicMMLU_high_humanities_history_egy"
+"task_alias": "high humanities history"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_humanities_islamic-studies.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_humanities_islamic-studies.yaml
+"dataset_name": "high_humanities_islamic-studies"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_humanities_egy"
+"task": "AraDiCE_ArabicMMLU_high_humanities_islamic-studies_egy"
+"task_alias": "high humanities islamic-studies"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_humanities_philosophy.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_humanities_philosophy.yaml
+"dataset_name": "high_humanities_philosophy"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_humanities_egy"
+"task": "AraDiCE_ArabicMMLU_high_humanities_philosophy_egy"
+"task_alias": "high humanities philosophy"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_language_arabic-language.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_language_arabic-language.yaml
+"dataset_name": "high_language_arabic-language"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_language_egy"
+"task": "AraDiCE_ArabicMMLU_high_language_arabic-language_egy"
+"task_alias": "high language arabic-language"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_social-science_civics.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_social-science_civics.yaml
+"dataset_name": "high_social-science_civics"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_social-science_egy"
+"task": "AraDiCE_ArabicMMLU_high_social-science_civics_egy"
+"task_alias": "high social-science civics"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_social-science_economics.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_social-science_economics.yaml
+"dataset_name": "high_social-science_economics"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_social-science_egy"
+"task": "AraDiCE_ArabicMMLU_high_social-science_economics_egy"
+"task_alias": "high social-science economics"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_social-science_geography.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_social-science_geography.yaml
+"dataset_name": "high_social-science_geography"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_social-science_egy"
+"task": "AraDiCE_ArabicMMLU_high_social-science_geography_egy"
+"task_alias": "high social-science geography"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_stem_biology.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_stem_biology.yaml
+"dataset_name": "high_stem_biology"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_stem_egy"
+"task": "AraDiCE_ArabicMMLU_high_stem_biology_egy"
+"task_alias": "high stem biology"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_stem_computer-science.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_stem_computer-science.yaml
+"dataset_name": "high_stem_computer-science"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_stem_egy"
+"task": "AraDiCE_ArabicMMLU_high_stem_computer-science_egy"
+"task_alias": "high stem computer-science"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_stem_physics.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_high_stem_physics.yaml
+"dataset_name": "high_stem_physics"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_stem_egy"
+"task": "AraDiCE_ArabicMMLU_high_stem_physics_egy"
+"task_alias": "high stem physics"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_humanities_history.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_humanities_history.yaml
+"dataset_name": "middle_humanities_history"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_humanities_egy"
+"task": "AraDiCE_ArabicMMLU_middle_humanities_history_egy"
+"task_alias": "middle humanities history"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_humanities_islamic-studies.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_humanities_islamic-studies.yaml
+"dataset_name": "middle_humanities_islamic-studies"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_humanities_egy"
+"task": "AraDiCE_ArabicMMLU_middle_humanities_islamic-studies_egy"
+"task_alias": "middle humanities islamic-studies"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_language_arabic-language.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_language_arabic-language.yaml
+"dataset_name": "middle_language_arabic-language"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_language_egy"
+"task": "AraDiCE_ArabicMMLU_middle_language_arabic-language_egy"
+"task_alias": "middle language arabic-language"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_other_general-knowledge.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_other_general-knowledge.yaml
+"dataset_name": "middle_other_general-knowledge"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_other_egy"
+"task": "AraDiCE_ArabicMMLU_middle_other_general-knowledge_egy"
+"task_alias": "middle other general-knowledge"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_social-science_civics.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_social-science_civics.yaml
+"dataset_name": "middle_social-science_civics"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_social-science_egy"
+"task": "AraDiCE_ArabicMMLU_middle_social-science_civics_egy"
+"task_alias": "middle social-science civics"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_social-science_economics.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_middle_social-science_economics.yaml
+"dataset_name": "middle_social-science_economics"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_social-science_egy"
+"task": "AraDiCE_ArabicMMLU_middle_social-science_economics_egy"
+"task_alias": "middle social-science economics"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"