Unverified Commit 3d1b8f43 authored by Lintang Sutawika, committed by GitHub

Merge branch 'main' into group-agg-rework

parents e200c24e d855d0ba
@@ -5,6 +5,7 @@ import itertools
import time
from functools import wraps
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
@@ -24,6 +25,11 @@ import transformers
from lm_eval.utils import eval_logger
if TYPE_CHECKING:
    from transformers import PreTrainedTokenizerBase
    from transformers.configuration_utils import PretrainedConfig
def chunks(iter, n: int = 0, fn=None):
"""
Divides an iterable into chunks of specified size or based on a given function.
@@ -613,3 +619,48 @@ class Collator:
if arr:
yield arr
def configure_pad_token(
    tokenizer: "PreTrainedTokenizerBase",
    model_config: Optional["PretrainedConfig"] = None,
) -> "PreTrainedTokenizerBase":
    """
    This function checks whether the (Hugging Face) tokenizer has a padding token and sets one if not present.
    Some tokenizers require special handling.

    Args:
        tokenizer: The tokenizer for which the padding token is to be handled.
        model_config: The configuration of the model. Default is None.

    Returns:
        The tokenizer after the padding token has been handled.

    Raises:
        AssertionError: If the tokenizer is of type RWKVWorldTokenizer or Rwkv5Tokenizer and the padding token id is not 0.
    """
    if tokenizer.pad_token:
        pass
    elif tokenizer.unk_token:
        tokenizer.pad_token_id = tokenizer.unk_token_id
    elif tokenizer.eos_token:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    else:
        # handle special cases
        if model_config and getattr(model_config, "model_type", None) == "qwen":
            # Qwen's trust_remote_code tokenizer does not allow for adding special tokens
            tokenizer.pad_token = "<|endoftext|>"
        elif (
            tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
            or tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
        ):
            # The RWKV world tokenizer does not allow adding special tokens or setting the pad token (which is set as 0).
            # The additional tokenizer name check is needed, as there exist rwkv4 models with a neox tokenizer.
            # ---
            # Note that the world tokenizer class name might change for the final Hugging Face merge:
            # https://github.com/huggingface/transformers/pull/26963
            assert tokenizer.pad_token_id == 0
        else:
            tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

    return tokenizer
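A minimal usage sketch of the new helper (the model name is an illustrative choice, not part of the diff):

from transformers import AutoTokenizer

from lm_eval.models.utils import configure_pad_token

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # example model that ships without a pad token
tokenizer = configure_pad_token(tokenizer)
# pad falls back to unk/eos, which for GPT-2 all resolve to <|endoftext|> (id 50256)
assert tokenizer.pad_token_id == tokenizer.eos_token_id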
import copy
from importlib.metadata import version
from importlib.util import find_spec
from typing import List, Literal, Optional, Tuple, Union
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
from more_itertools import distribute
from packaging.version import parse as parse_version
@@ -10,7 +10,7 @@ from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator, undistribute
from lm_eval.models.utils import Collator, configure_pad_token, undistribute
from lm_eval.utils import (
eval_logger,
get_rolling_token_windows,
@@ -26,6 +26,8 @@ try:
except ModuleNotFoundError:
pass
if TYPE_CHECKING:
    pass
eval_logger = eval_logger
@@ -118,7 +120,14 @@ class VLLM(TemplateLM):
trust_remote_code=trust_remote_code,
tokenizer_revision=tokenizer_revision,
)
self.tokenizer = configure_pad_token(self.tokenizer)
self.add_bos_token = add_bos_token
if "gemma" in pretrained.lower():
self.add_bos_token = True
eval_logger.info(
"Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
)
self.custom_prefix_token_id = prefix_token_id
if prefix_token_id is not None:
eval_logger.info(
@@ -170,23 +179,46 @@ class VLLM(TemplateLM):
def max_gen_toks(self):
return self._max_gen_toks
    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
        """
        Applies a chat template to a list of chat-history turns between user and model.
        """
        return self.tokenizer.apply_chat_template(
            chat_history, tokenize=False, add_generation_prompt=True
        )

    @property
    def chat_template(self) -> str:
        if self.tokenizer.chat_template is not None:
            return self.tokenizer.chat_template
        return self.tokenizer.default_chat_template

    @property
    def tokenizer_name(self) -> str:
        return self.tokenizer.name_or_path.replace("/", "__")
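For reference, a hedged sketch of what apply_chat_template above produces, using a plain Hugging Face tokenizer (the checkpoint is only an example of a model that ships a chat template):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # example checkpoint
chat_history = [{"role": "user", "content": "What is the capital of France?"}]
prompt = tok.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)
print(prompt)  # templated string ending with the assistant generation prompt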
def tok_encode(
self,
string: str,
left_truncate_len=None,
add_special_tokens=None,
truncation=False,
):
""" """
string: Union[str, List[str]],
left_truncate_len: int = None,
add_special_tokens: bool = False,
truncation: bool = False,
) -> Union[List[int], List[List[int]]]:
if not add_special_tokens:
add_special_tokens = False or self.add_bos_token
encoding = self.tokenizer.encode(
string, add_special_tokens=add_special_tokens, truncation=truncation
)
encoding: Union[List[List[int]], List[int]] = self.tokenizer(
string,
add_special_tokens=add_special_tokens,
truncation=truncation,
return_attention_mask=False,
).input_ids
# left-truncate the encoded context to be at most `left_truncate_len` tokens long
if left_truncate_len:
encoding = encoding[-left_truncate_len:]
if not isinstance(string, str):
encoding = [enc[-left_truncate_len:] for enc in encoding]
else:
encoding = encoding[-left_truncate_len:]
return encoding
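The isinstance branch above matters: calling the tokenizer on a list returns a list of token lists, so slicing the result directly would drop whole sequences rather than leading tokens. A small sketch of the distinction (token ids are made up):

batch = [[1, 2, 3, 4], [5, 6]]  # batched encoding: List[List[int]]
single = [1, 2, 3, 4]           # single encoding: List[int]

left_truncate_len = 2
assert [enc[-left_truncate_len:] for enc in batch] == [[3, 4], [5, 6]]  # per-sequence tails
assert batch[-left_truncate_len:] == [[1, 2, 3, 4], [5, 6]]  # naive slice keeps whole sequences
assert single[-left_truncate_len:] == [3, 4]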
@@ -203,7 +235,7 @@ class VLLM(TemplateLM):
sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
else:
sampling_params = SamplingParams(
temperature=0, prompt_logprobs=1, max_tokens=1
temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
)
if self.data_parallel_size > 1:
# vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
@@ -284,7 +316,9 @@ class VLLM(TemplateLM):
# batch tokenize contexts
context, all_gen_kwargs = zip(*(req.args for req in requests))
context_encoding = self.tokenizer(context, add_special_tokens=False).input_ids
context_encoding: List[List[int]] = self.tok_encode(
context, add_special_tokens=self.add_bos_token
)
requests = [
((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs)
]
@@ -493,7 +527,10 @@ class VLLM(TemplateLM):
def modify_gen_kwargs(kwargs: dict) -> dict:
# sampling_params
do_sample = kwargs.pop("do_sample", None)
if do_sample is False or "temperature" not in kwargs:
if do_sample is False and "temperature" not in kwargs:
eval_logger.debug(
"Got `do_sample=False` and no temperature value, setting VLLM temperature to 0.0 ..."
)
kwargs["temperature"] = 0.0
# hf defaults
kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
......
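Taken together, a standalone re-creation of the gen_kwargs normalization shown in the hunk above (hedged: it covers only the lines visible here, not the full method):

def modify_gen_kwargs(kwargs: dict) -> dict:
    # sampling_params
    do_sample = kwargs.pop("do_sample", None)
    if do_sample is False and "temperature" not in kwargs:
        kwargs["temperature"] = 0.0  # greedy decoding in vLLM terms
    # hf defaults
    kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
    return kwargs

print(modify_gen_kwargs({"do_sample": False}))
# {'temperature': 0.0, 'skip_special_tokens': False}
print(modify_gen_kwargs({"do_sample": False, "temperature": 0.7}))
# {'temperature': 0.7, 'skip_special_tokens': False} -- an explicit temperature now wins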
@@ -10,8 +10,8 @@
| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
| [agieval](agieval/README.md) | Tasks derived from human-centric standardized exams, such as college entrance and law school admission tests. | English, Chinese |
| [ammlu](ammlu/README.md) | Arabic version of MMLU. | Arabic |
| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English |
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
@@ -20,11 +20,13 @@
| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
| [bertaqa](bertaqa/README.md) | Trivia questions about local Basque culture, in English and Basque. | English, Basque, Basque (MT) |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language models' linguistic capabilities. | English |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
@@ -47,12 +49,15 @@
| [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English |
| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
| [ifeval](ifeval/README.md) | Instruction-following evaluation tasks that check compliance with verifiable instructions. | English |
| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English |
| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean |
| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean |
| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
| [lambada](lambada/README.md) | Tasks designed to predict the endings of text passages, testing language prediction skills. | English |
| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English |
| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. | German, English, Spanish, French, Italian |
| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). These tasks are static and will not change over time. | English |
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
@@ -70,6 +75,7 @@
| okapi/mmlu_multilingual | Machine-translated versions of MMLU across many languages. | Multiple (34 languages) |
| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Machine-translated versions of TruthfulQA across many languages. | Multiple (31 languages) |
| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
......
@@ -17,27 +17,43 @@ class TaskManager:
"""
def __init__(self, verbosity="INFO", include_path: Optional[str] = None) -> None:
def __init__(
self,
verbosity="INFO",
include_path: Optional[Union[str, List]] = None,
include_defaults: bool = True,
) -> None:
self.verbosity = verbosity
self.include_path = include_path
self.logger = utils.eval_logger
self.logger.setLevel(getattr(logging, f"{verbosity}"))
self._task_index = self.initialize_tasks(include_path=include_path)
self._task_index = self.initialize_tasks(
include_path=include_path, include_defaults=include_defaults
)
self._all_tasks = sorted(list(self._task_index.keys()))
self.task_group_map = collections.defaultdict(list)
def initialize_tasks(self, include_path: Optional[str] = None):
def initialize_tasks(
self,
include_path: Optional[Union[str, List]] = None,
include_defaults: bool = True,
):
"""Creates a dictionary of tasks index.
:param include_path: str = None
An additional path to be searched for tasks
:param include_path: Union[str, List] = None
An additional path to be searched for tasks recursively.
Can provide more than one such path as a list.
:param include_defaults: bool = True
If set to false, default tasks (those in lm_eval/tasks/) are not indexed.
:return
Dictionary with task names as keys and task metadata as values
"""
all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
if include_defaults:
all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
else:
all_paths = []
if include_path is not None:
if isinstance(include_path, str):
include_path = [include_path]
@@ -318,8 +334,13 @@ class TaskManager:
"""
# TODO: remove group in next release
print_info = True
ignore_dirs = [
"__pycache__",
".ipynb_checkpoints",
]
tasks_and_groups = collections.defaultdict()
for root, _, file_list in os.walk(task_dir):
for root, dirs, file_list in os.walk(task_dir):
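# prune in place: os.walk will not descend into the ignored directories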
dirs[:] = [d for d in dirs if d not in ignore_dirs]
for f in file_list:
if f.endswith(".yaml"):
yaml_path = os.path.join(root, f)
......
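A hedged example of the new constructor options (the paths are placeholders, and `all_tasks` is assumed to expose the sorted index built above):

from lm_eval.tasks import TaskManager

# index two custom task directories and skip the defaults in lm_eval/tasks/
task_manager = TaskManager(
    include_path=["/path/to/my_tasks", "/path/to/more_tasks"],
    include_defaults=False,
)
print(task_manager.all_tasks)  # only tasks found under the two custom paths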
"""
Take in a base YAML and generate the derived task YAMLs from it
"""
import argparse
import os
......
# ArabicMMLU

### Paper

ArabicMMLU: Measuring massive multitask language understanding in Arabic

This dataset has been translated from the original MMLU with the help of GPT-4.

The original data: [MMLU](https://arxiv.org/pdf/2009.03300v3.pdf)

The translation was done with the AceGPT researchers: [AceGPT](https://arxiv.org/abs/2309.12053)

ArabicMMLU is a comprehensive evaluation benchmark specifically designed to evaluate the knowledge and reasoning abilities of LLMs within the context of Arabic language and culture.
ArabicMMLU covers a wide range of subjects, comprising 57 topics that span from elementary to advanced professional levels.

Homepage: [AceGPT Homepage](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic)

### Citation

### Groups and Tasks

#### Groups

- `ammlu`: All 57 subjects of the ArabicMMLU dataset, evaluated following the methodology in MMLU's original implementation.

#### Tasks

The following tasks evaluate subjects in the ArabicMMLU dataset using loglikelihood-based multiple-choice scoring:
- `ammlu_{subject_english}`

### Checklist

For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
  * [x] Have you referenced the original paper that introduced the task?
  * [x] If yes, does the original paper provide a reference implementation?
    * [x] Yes, original implementation contributed by author of the benchmark

If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?

# Arabic COPA

### Paper

Original Title: `COPA`

The Choice Of Plausible Alternatives (COPA) evaluation provides researchers with a tool for assessing progress in open-domain commonsense causal reasoning.

[Homepage](https://people.ict.usc.edu/~gordon/copa.html)

AlGhafa has translated this dataset to Arabic: [AlGhafa](https://aclanthology.org/2023.arabicnlp-1.21.pdf)

The link to the Arabic version of the dataset: [COPA Arabic](https://gitlab.com/tiiuae/alghafa/-/tree/main/arabic-eval/copa_ar)

### Citation

### Groups and Tasks

#### Groups

* Not part of a group yet.

#### Tasks

* `copa_ar`

### Checklist

For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
  * [x] Have you referenced the original paper that introduced the task?
  * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?

If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
......
group: ammlu
dataset_path: Hennara/ammlu
test_split: test
fewshot_split: dev
fewshot_config:
  sampler: first_n
output_type: multiple_choice
doc_to_text: "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nالجواب:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer)}}"
metric_list:
- metric: acc
  aggregation: mean
  higher_is_better: true
metadata:
  version: 0.0

task: copa_ar
dataset_path: Hennara/copa_ar
dataset_name: null
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
doc_to_text: "السؤال: {{query}}\nالجواب:"
doc_to_choice: "{{[sol1, sol2]}}"
doc_to_target: label
should_decontaminate: true
doc_to_decontamination_query: query
metric_list:
- metric: acc
  aggregation: mean
  higher_is_better: true
metadata:
  version: 1.0
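To make the new config concrete, here is a hedged sketch of how these fields turn one document into a multiple-choice instance (the sample document is invented, and the harness renders Jinja through its own pipeline rather than raw jinja2):

from jinja2 import Template

doc = {
    "query": "سؤال تجريبي",    # "a test question"
    "sol1": "الخيار الأول",    # "the first option"
    "sol2": "الخيار الثاني",   # "the second option"
    "label": 0,
}

prompt = Template("السؤال: {{query}}\nالجواب:").render(**doc)  # doc_to_text
choices = [doc["sol1"], doc["sol2"]]                           # doc_to_choice
gold = doc["label"]                                            # doc_to_target

print(prompt)
print(choices[gold])  # the continuation whose loglikelihood is scored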
# Arabic PIQA
### Paper
Original Title: `PIQA: Reasoning about Physical Commonsense in Natural Language`
Original paper: [PIQA](https://arxiv.org/abs/1911.11641)
Physical Interaction: Question Answering (PIQA) is a physical commonsense
reasoning task and a corresponding benchmark dataset. PIQA was designed to investigate
the physical knowledge of existing models. To what extent are current approaches
actually learning about the world?
[Homepage](https://yonatanbisk.com/piqa)
AlGhafa has translated this dataset to Arabic: [AlGhafa](https://aclanthology.org/2023.arabicnlp-1.21.pdf)
The link to the Arabic version of the dataset: [PIQA Arabic](https://gitlab.com/tiiuae/alghafa/-/tree/main/arabic-eval/pica_ar)
### Citation
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `piqa_ar`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
task: piqa_ar
dataset_path: Hennara/pica_ar
dataset_name: null
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
doc_to_text: "السؤال: {{goal}}\nالجواب:"
doc_to_choice: "{{[sol1, sol2]}}"
doc_to_target: label
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
import yaml
from tqdm import tqdm
SUBJECTS = {
"abstract_algebra": "ألعلوم وتقنية المعلومات و الرياضيات",
"anatomy": "ألعلوم وتقنية المعلومات و الرياضيات",
"astronomy": "ألعلوم وتقنية المعلومات و الرياضيات",
"business_ethics": "علوم أخرى",
"clinical_knowledge": "علوم أخرى",
"college_biology": "ألعلوم وتقنية المعلومات و الرياضيات",
"college_chemistry": "ألعلوم وتقنية المعلومات و الرياضيات",
"college_computer_science": "ألعلوم وتقنية المعلومات و الرياضيات",
"college_mathematics": "ألعلوم وتقنية المعلومات و الرياضيات",
"college_medicine": "علوم أخرى",
"college_physics": "ألعلوم وتقنية المعلومات و الرياضيات",
"computer_security": "ألعلوم وتقنية المعلومات و الرياضيات",
"conceptual_physics": "ألعلوم وتقنية المعلومات و الرياضيات",
"econometrics": "العلوم الإجتماعية",
"electrical_engineering": "ألعلوم وتقنية المعلومات و الرياضيات",
"elementary_mathematics": "ألعلوم وتقنية المعلومات و الرياضيات",
"formal_logic": "العلوم الانسانية",
"global_facts": "علوم أخرى",
"high_school_biology": "ألعلوم وتقنية المعلومات و الرياضيات",
"high_school_chemistry": "ألعلوم وتقنية المعلومات و الرياضيات",
"high_school_computer_science": "ألعلوم وتقنية المعلومات و الرياضيات",
"high_school_european_history": "العلوم الانسانية",
"high_school_geography": "العلوم الإجتماعية",
"high_school_government_and_politics": "العلوم الإجتماعية",
"high_school_macroeconomics": "العلوم الإجتماعية",
"high_school_mathematics": "ألعلوم وتقنية المعلومات و الرياضيات",
"high_school_microeconomics": "العلوم الإجتماعية",
"high_school_physics": "ألعلوم وتقنية المعلومات و الرياضيات",
"high_school_psychology": "العلوم الإجتماعية",
"high_school_statistics": "ألعلوم وتقنية المعلومات و الرياضيات",
"high_school_us_history": "العلوم الانسانية",
"high_school_world_history": "العلوم الانسانية",
"human_aging": "علوم أخرى",
"human_sexuality": "العلوم الإجتماعية",
"international_law": "العلوم الانسانية",
"jurisprudence": "العلوم الانسانية",
"logical_fallacies": "العلوم الانسانية",
"machine_learning": "ألعلوم وتقنية المعلومات و الرياضيات",
"management": "علوم أخرى",
"marketing": "علوم أخرى",
"medical_genetics": "علوم أخرى",
"miscellaneous": "علوم أخرى",
"moral_disputes": "العلوم الانسانية",
"moral_scenarios": "العلوم الانسانية",
"nutrition": "علوم أخرى",
"philosophy": "العلوم الانسانية",
"prehistory": "العلوم الانسانية",
"professional_accounting": "علوم أخرى",
"professional_law": "العلوم الانسانية",
"professional_medicine": "علوم أخرى",
"professional_psychology": "العلوم الإجتماعية",
"public_relations": "العلوم الإجتماعية",
"security_studies": "العلوم الإجتماعية",
"sociology": "العلوم الإجتماعية",
"us_foreign_policy": "العلوم الإجتماعية",
"virology": "علوم أخرى",
"world_religions": "العلوم الانسانية",
}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", required=True)
parser.add_argument("--save_prefix_path", default="ammlu")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path, encoding="utf-8") as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path, encoding="utf-8") as f:
cot_file = json.load(f)
for subject_eng, category in tqdm(SUBJECTS.items()):
if args.cot_prompt_path is not None:
description = cot_file[subject_eng]
else:
description = f"فم بعملية التقييم في مجال {category} \n\n"
yaml_dict = {
"include": base_yaml_name,
"task": f"ammlu_{args.task_prefix}_{subject_eng}"
if args.task_prefix != ""
else f"ammlu_{subject_eng}",
"dataset_name": subject_eng,
"description": description,
}
file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml"
print(f"Saving yaml for subset {subject_eng} to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
width=float("inf"),
allow_unicode=True,
default_style='"',
)
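For completeness, a hedged invocation sketch (run from the task folder; the filenames mirror the argparse defaults above):

import subprocess

subprocess.run(
    [
        "python",
        "_generate_configs.py",
        "--base_yaml_path", "_default_template_yaml",
        "--save_prefix_path", "ammlu",
    ],
    check=True,
)  # writes ammlu_<subject>.yaml for each of the 57 subjects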
"dataset_name": "abstract_algebra"
"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
"include": "_default_template_yaml"
"task": "ammlu_abstract_algebra"
"dataset_name": "anatomy"
"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
"include": "_default_template_yaml"
"task": "ammlu_anatomy"
"dataset_name": "astronomy"
"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
"include": "_default_template_yaml"
"task": "ammlu_astronomy"
"dataset_name": "business_ethics"
"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
"include": "_default_template_yaml"
"task": "ammlu_business_ethics"
"dataset_name": "clinical_knowledge"
"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
"include": "_default_template_yaml"
"task": "ammlu_clinical_knowledge"
"dataset_name": "college_biology"
"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
"include": "_default_template_yaml"
"task": "ammlu_college_biology"
"dataset_name": "college_chemistry"
"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
"include": "_default_template_yaml"
"task": "ammlu_college_chemistry"
"dataset_name": "college_computer_science"
"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
"include": "_default_template_yaml"
"task": "ammlu_college_computer_science"
"dataset_name": "college_mathematics"
"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
"include": "_default_template_yaml"
"task": "ammlu_college_mathematics"
"dataset_name": "college_medicine"
"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
"include": "_default_template_yaml"
"task": "ammlu_college_medicine"