Merge branch 'EleutherAI:main' into main

da211969 · Jess · GitHub · 1b97e487 · 801322e0 · da211969
Unverified Commit da211969 authored Jun 28, 2024 by Jess Committed by GitHub Jun 28, 2024
20 changed files
--- a/lm_eval/models/anthropic_llms.py
+++ b/lm_eval/models/anthropic_llms.py
@@ -307,7 +307,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
        # defaults to os.environ.get("ANTHROPIC_API_KEY")
        self.client = anthropic.Anthropic()
        self.temperature = temperature
-        self.max_token = max_tokens
+        self.max_tokens = max_tokens
        self.tokenizer = self.client.get_tokenizer()
        self.kwargs = kwargs

--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -2,7 +2,7 @@ import copy
 import os
 from datetime import timedelta
 from pathlib import Path
-from typing import List, Literal, Optional, Tuple, Union
+from typing import Dict, List, Literal, Optional, Tuple, Union
 import torch
 import torch.nn.functional as F
@@ -44,13 +44,13 @@ def _get_accelerate_args(
    max_memory_per_gpu: Optional[Union[int, str]] = None,
    max_cpu_memory: Optional[Union[int, str]] = None,
    offload_folder: Optional[str] = "./offload",
+    gpus: Optional[int] = None,
 ) -> dict:
    """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
    max_memory = {}
    if max_memory_per_gpu is not None:
        max_memory_per_gpu_map = {
-            device_idx: max_memory_per_gpu
+            device_idx: max_memory_per_gpu for device_idx in range(gpus)
-            for device_idx in range(torch.cuda.device_count())
        }
        max_memory.update(max_memory_per_gpu_map)
    if max_cpu_memory is not None:
@@ -153,12 +153,16 @@ class HFLM(TemplateLM):
            if accelerator.num_processes > 1:
                self.accelerator = accelerator
+            if "npu" in accelerator.device.type:
+                gpus = torch.npu.device_count()
            if not (parallelize or accelerator.num_processes > 1):
                # use user-passed device
                device_list = set(
                    ["cuda", "cpu"]
-                    + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
+                    + [f"cuda:{i}" for i in range(gpus)]
                    + ["mps", "mps:0"]
+                    + [f"npu:{i}" for i in range(gpus)]
                )
                if device and device in device_list:
                    self._device = torch.device(device)
@@ -199,6 +203,15 @@ class HFLM(TemplateLM):
            config=self.config, backend=backend, trust_remote_code=trust_remote_code
        )
+        # load tokenizer so we know tokenizer vocabulary size before loading model and PEFT
+        self._create_tokenizer(
+            pretrained,
+            tokenizer,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            use_fast_tokenizer=use_fast_tokenizer,
+        )
        # if we passed `pretrained` as a string, initialize our model now
        if isinstance(pretrained, str):
            self._create_model(
@@ -207,6 +220,7 @@ class HFLM(TemplateLM):
                dtype=dtype,
                trust_remote_code=trust_remote_code,
                parallelize=parallelize,
+                gpus=gpus,
                device_map_option=device_map_option,
                max_memory_per_gpu=max_memory_per_gpu,
                max_cpu_memory=max_cpu_memory,
@@ -235,14 +249,6 @@ class HFLM(TemplateLM):
                        "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
                    )
-        self._create_tokenizer(
-            pretrained,
-            tokenizer,
-            revision=revision,
-            trust_remote_code=trust_remote_code,
-            use_fast_tokenizer=use_fast_tokenizer,
-        )
        self.truncation = truncation
        self.logits_cache = logits_cache
        self.vocab_size = self.tokenizer.vocab_size
@@ -321,6 +327,7 @@ class HFLM(TemplateLM):
                        in [
                            DistributedType.FSDP,
                            DistributedType.MULTI_GPU,
+                            DistributedType.MULTI_NPU,
                        ]
                    ), "Unsupported distributed type provided. Only DDP and FSDP are supported."
                    if accelerator.distributed_type == DistributedType.FSDP:
@@ -329,9 +336,7 @@ class HFLM(TemplateLM):
                        self._model = accelerator.prepare_model(
                            self.model, evaluation_mode=True
                        )
-                    self._device = torch.device(
+                    self._device = torch.device(f"{accelerator.device}")
-                        f"cuda:{accelerator.local_process_index}"
-                    )
                    self.accelerator = accelerator
                    if self.accelerator.is_local_main_process:
@@ -414,6 +419,16 @@ class HFLM(TemplateLM):
    def world_size(self):
        return self._world_size
+    @property
+    def tokenizer_name(self) -> str:
+        return self.tokenizer.name_or_path.replace("/", "__")
+    @property
+    def chat_template(self) -> str:
+        if self.tokenizer.chat_template is not None:
+            return self.tokenizer.chat_template
+        return self.tokenizer.default_chat_template
    def _get_backend(
        self,
        config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
@@ -488,6 +503,7 @@ class HFLM(TemplateLM):
        # only used if `parallelize=True`.
        # (accelerate naive PP (device_map) options)
        parallelize: Optional[bool] = False,
+        gpus: Optional[int] = None,
        device_map_option: Optional[str] = "auto",
        max_memory_per_gpu: Optional[Union[int, str]] = None,
        max_cpu_memory: Optional[Union[int, str]] = None,
@@ -519,6 +535,7 @@ class HFLM(TemplateLM):
                    max_memory_per_gpu,
                    max_cpu_memory,
                    offload_folder,
+                    gpus,
                )
            )
        elif "device_map" not in model_kwargs:
@@ -527,9 +544,7 @@ class HFLM(TemplateLM):
            # for quantized models now seems to be device_map="auto"
            # which breaks data-parallel mode.
            if hasattr(self, "accelerator"):
-                model_kwargs.update(
+                model_kwargs.update({"device_map": {"": f"{self.accelerator.device}"}})
-                    {"device_map": {"": f"cuda:{self.accelerator.local_process_index}"}}
-                )
            else:
                model_kwargs.update({"device_map": {"": str(self.device)}})
@@ -579,6 +594,12 @@ class HFLM(TemplateLM):
            if model_kwargs.get("load_in_4bit", None):
                if version.parse(PEFT_VERSION) < version.parse("0.4.0"):
                    raise AssertionError("load_in_4bit requires peft >= 0.4.0")
+            if self._model.config.vocab_size != len(self.tokenizer):
+                # resize model for LoRAs with added tokens
+                self._model.resize_token_embeddings(len(self.tokenizer))
+                eval_logger.info(
+                    f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
+                )
            self._model = PeftModel.from_pretrained(
                self._model, peft, revision=revision
            )
@@ -1279,6 +1300,14 @@ class HFLM(TemplateLM):
        return res
+    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
+        """
+        Method to apply a chat template to a list of chat history between user and model.
+        """
+        return self.tokenizer.apply_chat_template(
+            chat_history, tokenize=False, add_generation_prompt=True
+        )
    def get_model_info(self) -> dict:
        """
        Method to get Hugging Face model information for experiment reproducibility.

--- a/lm_eval/models/neuron_optimum.py
+++ b/lm_eval/models/neuron_optimum.py
@@ -288,7 +288,7 @@ class NEURON_HF(TemplateLM):
        self.vocab_size = self.tokenizer.vocab_size
        self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
-        self.add_bos_token = self.add_bos_token
+        self.add_bos_token = add_bos_token
        self._max_length = max_length

--- a/lm_eval/models/textsynth.py
+++ b/lm_eval/models/textsynth.py
-""" TextSynth API
+"""TextSynth API
 Implementation provided by Fabrice Bellard:
    https://github.com/EleutherAI/lm-evaluation-harness/issues/295
@@ -11,6 +11,7 @@ Example usage:
 Homepage: https://textsynth.com/index.html
 """
 import logging
 import os

--- a/lm_eval/models/vllm_causallms.py
+++ b/lm_eval/models/vllm_causallms.py
@@ -21,9 +21,7 @@ from lm_eval.utils import (
 try:
    import ray
    from vllm import LLM, SamplingParams
+    from vllm.lora.request import LoRARequest
-    if parse_version(version("vllm")) > parse_version("0.3.0"):
-        from vllm.lora.request import LoRARequest
    from vllm.transformers_utils.tokenizer import get_tokenizer
 except ModuleNotFoundError:
    pass
@@ -102,9 +100,6 @@ class VLLM(TemplateLM):
        if self.data_parallel_size <= 1:
            self.model = LLM(**self.model_args)
        else:
-            assert parse_version(version("vllm")) < parse_version(
-                "0.3.3"
-            ), "data_parallel is only compatible with vllm < v0.3.3."
            eval_logger.warning(
                "You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
            )
@@ -124,6 +119,12 @@ class VLLM(TemplateLM):
            tokenizer_revision=tokenizer_revision,
        )
        self.add_bos_token = add_bos_token
+        if "gemma" in pretrained.lower():
+            self.add_bos_token = True
+            eval_logger.info(
+                "Found 'gemma' in model name, a BOS token will be used as Gemma underperforms without it."
+            )
        self.custom_prefix_token_id = prefix_token_id
        if prefix_token_id is not None:
            eval_logger.info(
@@ -498,7 +499,10 @@ class VLLM(TemplateLM):
    def modify_gen_kwargs(kwargs: dict) -> dict:
        # sampling_params
        do_sample = kwargs.pop("do_sample", None)
-        if do_sample is False or "temperature" not in kwargs:
+        if do_sample is False and "temperature" not in kwargs:
+            eval_logger.debug(
+                "Got `do_sample=False` and no temperature value, setting VLLM temperature to 0.0 ..."
+            )
            kwargs["temperature"] = 0.0
        # hf defaults
        kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)

--- a/lm_eval/tasks/README.md
+++ b/lm_eval/tasks/README.md
+# Tasks
+ A list of supported tasks and task groupings can be viewed with `lm-eval --tasks list`.
+ For more information, including a full list of task names and their precise meanings or sources, follow the links provided to the individual README.md files for each subfolder.
+| Task Family | Description | Language(s) |
+|-------------|-------------|-------------|
+| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
+| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
+| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
+| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
+| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
+| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions.  | English |
+| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
+| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
+| [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English |
+| [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque |
+| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
+| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
+| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
+| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
+| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
+| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
+| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
+| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
+| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
+| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
+| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
+| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
+| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
+| csatqa | Tasks related to SAT and other standardized testing questions for academic assessment. | Korean |
+| [drop](drop/README.md) | Tasks requiring numerical reasoning, reading comprehension, and question answering. | English |
+| [eq_bench](eq_bench/README.md) | Tasks focused on equality and ethics in question answering and decision-making. | English |
+| [eus_exams](eus_exams/README.md) | Tasks based on various professional and academic exams in the Basque language. | Basque |
+| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque |
+| [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque |
+| [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque |
+| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
+| [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English |
+| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French |
+| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
+| [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English |
+| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
+| [haerae](haerae/README.md) | Tasks focused on assessing detailed factual and historical knowledge. | Korean |
+| [headqa](headqa/README.md) | A high-level education-based question answering dataset to test specialized knowledge. | Spanish, English |
+| [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English |
+| [hendrycks_ethics](hendrycks_ethics/README.md)     | Tasks designed to evaluate the ethical reasoning capabilities of models. | English |
+| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
+| [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English |
+| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean |
+| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean |
+| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
+| [lambada](lambada/README.md) | Tasks designed to predict the endings of text passages, testing language prediction skills. | English |
+| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English |
+| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
+| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
+| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
+| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
+| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
+| [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English |
+| medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English |
+| medqa | Multiple choice question answering based on the United States Medical License Exams. | |
+| [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu |
+| [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English |
+| mmlu | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
+| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
+| [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English |
+| [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English |
+| [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
+| [okapi/hellaswag_multilingual](okapi/hellaswag_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (30 languages) |
+| okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) |
+| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) |
+| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
+| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
+| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
+| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
+| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
+| [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English |
+| [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish |
+| [prost](prost/README.md) | Tasks requiring understanding of professional standards and ethics in various domains. | English |
+| [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English |
+| [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English |
+| [qasper](qasper/README.md) | Question Answering dataset based on academic papers, testing in-depth scientific knowledge. | English |
+| [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English |
+| realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. | |
+| [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English |
+| [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English |
+| [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning.  | English |
+| [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. | English |
+| [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English |
+| [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English |
+| [super_glue](super_glue/README.md) | A suite of challenging tasks designed to test a range of language understanding skills. | English |
+| [swag](swag/README.md) | Situations With Adversarial Generations, predicting the next event in videos. | English |
+| [swde](swde/README.md) | Information extraction tasks from semi-structured web pages. | English |
+| [tinyBenchmarks](tinyBenchmarks/README.md) | Evaluation of large language models with fewer examples using tiny versions of popular benchmarks. | English |
+| [tmmluplus](tmmluplus/README.md) | An extended set of tasks under the TMMLU framework for broader academic assessments. | Traditional Chinese |
+| [toxigen](toxigen/README.md) | Tasks designed to evaluate language models on their propensity to generate toxic content. | English |
+| [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese |
+| [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English |
+| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English |
+| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English |
+| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English |
+| [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English |
+| [wikitext](wikitext/README.md) | Tasks based on text from Wikipedia articles to assess language modeling and generation. | English |
+| [winogrande](winogrande/README.md) | A large-scale dataset for coreference resolution, inspired by the Winograd Schema Challenge. | English |
+| [wmdp](wmdp/README.md) | A benchmark with the objective of minimizing performance, based on potentially-sensitive multiple-choice knowledge questions. | English |
+| [wmt2016](wmt2016/README.md) | Tasks from the WMT 2016 shared task, focusing on translation between multiple languages. | English, Czech, German, Finnish, Russian, Romanian, Turkish |
+| [wsc273](wsc273/README.md) | The Winograd Schema Challenge, a test of commonsense reasoning and coreference resolution. | English |
+| [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese |
+| [xnli](xnli/README.md) | Cross-Lingual Natural Language Inference to test understanding across different languages. | Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese |
+| [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque |
+| [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese |
+| [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese |
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -14,27 +14,43 @@ class TaskManager:
    """
-    def __init__(self, verbosity="INFO", include_path: Optional[str] = None) -> None:
+    def __init__(
+        self,
+        verbosity="INFO",
+        include_path: Optional[Union[str, List]] = None,
+        include_defaults: bool = True,
+    ) -> None:
        self.verbosity = verbosity
        self.include_path = include_path
        self.logger = utils.eval_logger
        self.logger.setLevel(getattr(logging, f"{verbosity}"))
-        self._task_index = self.initialize_tasks(include_path=include_path)
+        self._task_index = self.initialize_tasks(
+            include_path=include_path, include_defaults=include_defaults
+        )
        self._all_tasks = sorted(list(self._task_index.keys()))
        self.task_group_map = collections.defaultdict(list)
-    def initialize_tasks(self, include_path: Optional[str] = None):
+    def initialize_tasks(
+        self,
+        include_path: Optional[Union[str, List]] = None,
+        include_defaults: bool = True,
+    ):
        """Creates a dictionary of tasks index.
-        :param include_path: str = None
+        :param include_path: Union[str, List] = None
-            An additional path to be searched for tasks
+            An additional path to be searched for tasks recursively.
+            Can provide more than one such path as a list.
+        :param include_defaults: bool = True
+            If set to false, default tasks (those in lm_eval/tasks/) are not indexed.
        :return
            Dictionary of task names as key and task metadata
        """
-        all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
+        if include_defaults:
+            all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
+        else:
+            all_paths = []
        if include_path is not None:
            if isinstance(include_path, str):
                include_path = [include_path]
@@ -296,8 +312,13 @@ class TaskManager:
        :return
            Dictionary of task names as key and task metadata
        """
+        ignore_dirs = [
+            "__pycache__",
+            ".ipynb_checkpoints",
+        ]
        tasks_and_groups = collections.defaultdict()
-        for root, _, file_list in os.walk(task_dir):
+        for root, dirs, file_list in os.walk(task_dir):
+            dirs[:] = [d for d in dirs if d not in ignore_dirs]
            for f in file_list:
                if f.endswith(".yaml"):
                    yaml_path = os.path.join(root, f)
@@ -413,7 +434,9 @@ def get_task_dict(
        )
    string_task_name_list = [task for task in task_name_list if isinstance(task, str)]
-    others_task_name_list = [task for task in task_name_list if ~isinstance(task, str)]
+    others_task_name_list = [
+        task for task in task_name_list if not isinstance(task, str)
+    ]
    if len(string_task_name_list) > 0:
        if task_manager is None:
            task_manager = TaskManager()

--- a/lm_eval/tasks/aclue/_generate_configs.py
+++ b/lm_eval/tasks/aclue/_generate_configs.py
 """
 Take in a YAML, and output all other splits with this YAML
 """
 import argparse
 import os

--- a/lm_eval/tasks/ammlu/README.md
+++ b/lm_eval/tasks/ammlu/README.md
-# ArabicMMLU
+# Arabic COPA
 ### Paper
-ArabicMMLU: Measuring massive multitask language understanding in Arabic
+Original Title: `COPA`
-This dataset has been translated from the original MMLU with the help of GPT-4.
-The original data [MMLU](https://arxiv.org/pdf/2009.03300v3.pdf)
-The translation has been done with AceGPT researchers [AceGPT](https://arxiv.org/abs/2309.12053)
-ArabicMMLU is a comprehensive evaluation benchmark specifically designed to evaluate the knowledge and reasoning abilities of LLMs within the context of Arabic language and culture.
+The Choice Of Plausible Alternatives (COPA) evaluation provides researchers with a tool for assessing progress in open-domain commonsense causal reasoning.
-ArabicMMLU covers a wide range of subjects, comprising 57 topics that span from elementary to advanced professional levels.
-Homepage: [AceGPT Homepage](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic)
+[Homepage](https://people.ict.usc.edu/~gordon/copa.html)
-### Citation
+AlGhafa has translated this dataset to Arabic: [AlGhafa](https://aclanthology.org/2023.arabicnlp-1.21.pdf)
+The link to the Arabic version of the dataset: [COPA_ar](https://gitlab.com/tiiuae/alghafa/-/tree/main/arabic-eval/copa_ar)
+### Citation
 ### Groups and Tasks
 #### Groups
- `ammlu`: All 57 subjects of the ArabicMMLU dataset, evaluated following the methodology in MMLU's original implementation.
+* Not part of a group yet.
 #### Tasks
+* `copa_ar`
-The following tasks evaluate subjects in the ArabicMMLU dataset using loglikelihood-based multiple-choice scoring:
- `ammlu_{subject_english}`
 ### Checklist
+For adding novel benchmarks/datasets to the library:
 * [x] Is the task an existing benchmark in the literature?
  * [x] Have you referenced the original paper that introduced the task?
-  * [x] If yes, does the original paper provide a reference implementation?
+  * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
-    * [x] Yes, original implementation contributed by author of the benchmark
 If other tasks on this dataset are already supported:
 * [x] Is the "Main" variant of this task clearly denoted?

--- a/lm_eval/tasks/ammlu/_default_template_yaml
+++ b/lm_eval/tasks/ammlu/_default_template_yaml
-group: ammlu
+task: copa_ar
-dataset_path: Hennara/ammlu
+dataset_path: Hennara/copa_ar
-test_split: test
+dataset_name: null
-fewshot_split: dev
-fewshot_config:
-  sampler: first_n
 output_type: multiple_choice
-doc_to_text: "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nالجواب："
+training_split: null
-doc_to_choice: ["A", "B", "C", "D"]
+validation_split: null
-doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer)}}"
+test_split: test
+doc_to_text: "السؤال: {{query}}\nالجواب:"
+doc_to_choice: "{{[sol1, sol2]}}"
+doc_to_target: label
+should_decontaminate: true
+doc_to_decontamination_query: query
 metric_list:
  - metric: acc
    aggregation: mean
@@ -16,4 +18,4 @@ metric_list:
    aggregation: mean
    higher_is_better: true
 metadata:
-  version: 0.0
+  version: 1.0
--- a/lm_eval/tasks/alghafa/piqa_ar/README.md
+++ b/lm_eval/tasks/alghafa/piqa_ar/README.md
+# Arabic PIQA
+### Paper
+Original Title: `PIQA: Reasoning about Physical Commonsense in Natural Language`
+Original paper: [PIQA](https://arxiv.org/abs/1911.11641)
+Physical Interaction: Question Answering (PIQA) is a physical commonsense
+reasoning task and a corresponding benchmark dataset. PIQA was designed to investigate
+the physical knowledge of existing models. To what extent are current approaches
+actually learning about the world?
+[Homepage](https://yonatanbisk.com/piqa)
+AlGhafa has translated this dataset to Arabic: [AlGhafa](https://aclanthology.org/2023.arabicnlp-1.21.pdf)
+The link to the Arabic version of the dataset: [PIQA](https://gitlab.com/tiiuae/alghafa/-/tree/main/arabic-eval/pica_ar)
+### Citation
+### Groups and Tasks
+#### Groups
+* Not part of a group yet.
+#### Tasks
+* `piqa_ar`
+### Checklist
+For adding novel benchmarks/datasets to the library:
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
--- a/lm_eval/tasks/alghafa/piqa_ar/piqa_ar.yaml
+++ b/lm_eval/tasks/alghafa/piqa_ar/piqa_ar.yaml
+task: piqa_ar
+dataset_path: Hennara/pica_ar
+dataset_name: null
+output_type: multiple_choice
+training_split: null
+validation_split: null
+test_split: test
+doc_to_text: "السؤال: {{goal}}\nالجواب:"
+doc_to_choice: "{{[sol1, sol2]}}"
+doc_to_target: label
+should_decontaminate: true
+doc_to_decontamination_query: goal
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/ammlu/_generate_configs.py
+++ b/lm_eval/tasks/ammlu/_generate_configs.py
-"""
-Take in a YAML, and output all other splits with this YAML
-"""
-import argparse
-import os
-import yaml
-from tqdm import tqdm
-SUBJECTS = {
-    "abstract_algebra": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "anatomy": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "astronomy": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "business_ethics": "علوم أخرى",
-    "clinical_knowledge": "علوم أخرى",
-    "college_biology": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "college_chemistry": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "college_computer_science": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "college_mathematics": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "college_medicine": "علوم أخرى",
-    "college_physics": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "computer_security": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "conceptual_physics": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "econometrics": "العلوم الإجتماعية",
-    "electrical_engineering": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "elementary_mathematics": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "formal_logic": "العلوم الانسانية",
-    "global_facts": "علوم أخرى",
-    "high_school_biology": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "high_school_chemistry": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "high_school_computer_science": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "high_school_european_history": "العلوم الانسانية",
-    "high_school_geography": "العلوم الإجتماعية",
-    "high_school_government_and_politics": "العلوم الإجتماعية",
-    "high_school_macroeconomics": "العلوم الإجتماعية",
-    "high_school_mathematics": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "high_school_microeconomics": "العلوم الإجتماعية",
-    "high_school_physics": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "high_school_psychology": "العلوم الإجتماعية",
-    "high_school_statistics": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "high_school_us_history": "العلوم الانسانية",
-    "high_school_world_history": "العلوم الانسانية",
-    "human_aging": "علوم أخرى",
-    "human_sexuality": "العلوم الإجتماعية",
-    "international_law": "العلوم الانسانية",
-    "jurisprudence": "العلوم الانسانية",
-    "logical_fallacies": "العلوم الانسانية",
-    "machine_learning": "ألعلوم وتقنية المعلومات و الرياضيات",
-    "management": "علوم أخرى",
-    "marketing": "علوم أخرى",
-    "medical_genetics": "علوم أخرى",
-    "miscellaneous": "علوم أخرى",
-    "moral_disputes": "العلوم الانسانية",
-    "moral_scenarios": "العلوم الانسانية",
-    "nutrition": "علوم أخرى",
-    "philosophy": "العلوم الانسانية",
-    "prehistory": "العلوم الانسانية",
-    "professional_accounting": "علوم أخرى",
-    "professional_law": "العلوم الانسانية",
-    "professional_medicine": "علوم أخرى",
-    "professional_psychology": "العلوم الإجتماعية",
-    "public_relations": "العلوم الإجتماعية",
-    "security_studies": "العلوم الإجتماعية",
-    "sociology": "العلوم الإجتماعية",
-    "us_foreign_policy": "العلوم الإجتماعية",
-    "virology": "علوم أخرى",
-    "world_religions": "العلوم الانسانية",
-}
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--base_yaml_path", required=True)
-    parser.add_argument("--save_prefix_path", default="ammlu")
-    parser.add_argument("--cot_prompt_path", default=None)
-    parser.add_argument("--task_prefix", default="")
-    return parser.parse_args()
-if __name__ == "__main__":
-    args = parse_args()
-    # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
-    base_yaml_name = os.path.split(args.base_yaml_path)[-1]
-    with open(args.base_yaml_path, encoding="utf-8") as f:
-        base_yaml = yaml.full_load(f)
-    if args.cot_prompt_path is not None:
-        import json
-        with open(args.cot_prompt_path, encoding="utf-8") as f:
-            cot_file = json.load(f)
-    for subject_eng, category in tqdm(SUBJECTS.items()):
-        if args.cot_prompt_path is not None:
-            description = cot_file[subject_eng]
-        else:
-            description = f"فم بعملية التقييم في مجال {category} \n\n"
-        yaml_dict = {
-            "include": base_yaml_name,
-            "task": f"ammlu_{args.task_prefix}_{subject_eng}"
-            if args.task_prefix != ""
-            else f"ammlu_{subject_eng}",
-            "dataset_name": subject_eng,
-            "description": description,
-        }
-        file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml"
-        print(f"Saving yaml for subset {subject_eng} to {file_save_path}")
-        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
-            yaml.dump(
-                yaml_dict,
-                yaml_file,
-                width=float("inf"),
-                allow_unicode=True,
-                default_style='"',
-            )
--- a/lm_eval/tasks/ammlu/ammlu_abstract_algebra.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_abstract_algebra.yaml
-"dataset_name": "abstract_algebra"
-"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
-"include": "_default_template_yaml"
-"task": "ammlu_abstract_algebra"
--- a/lm_eval/tasks/ammlu/ammlu_anatomy.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_anatomy.yaml
-"dataset_name": "anatomy"
-"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
-"include": "_default_template_yaml"
-"task": "ammlu_anatomy"
--- a/lm_eval/tasks/ammlu/ammlu_astronomy.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_astronomy.yaml
-"dataset_name": "astronomy"
-"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
-"include": "_default_template_yaml"
-"task": "ammlu_astronomy"
--- a/lm_eval/tasks/ammlu/ammlu_business_ethics.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_business_ethics.yaml
-"dataset_name": "business_ethics"
-"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
-"include": "_default_template_yaml"
-"task": "ammlu_business_ethics"
--- a/lm_eval/tasks/ammlu/ammlu_clinical_knowledge.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_clinical_knowledge.yaml
-"dataset_name": "clinical_knowledge"
-"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
-"include": "_default_template_yaml"
-"task": "ammlu_clinical_knowledge"
--- a/lm_eval/tasks/ammlu/ammlu_college_biology.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_college_biology.yaml
-"dataset_name": "college_biology"
-"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
-"include": "_default_template_yaml"
-"task": "ammlu_college_biology"
--- a/lm_eval/tasks/ammlu/ammlu_college_chemistry.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_college_chemistry.yaml
-"dataset_name": "college_chemistry"
-"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
-"include": "_default_template_yaml"
-"task": "ammlu_college_chemistry"