Commit 02e841ce authored by lintangsutawika

Merge branch 'main' of https://github.com/EleutherAI/lm-evaluation-harness into t5v2-alt-plus

parents 90ad5db7 e74ec966
......@@ -105,7 +105,7 @@ class OpenaiCompletionsLM(TemplateLM):
except ModuleNotFoundError:
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
please install these via `pip install lm-eval[openai]` or `pip install -e .\"[openai]\"`",
)
self.model = model
self.base_url = base_url
......@@ -231,7 +231,7 @@ class OpenaiCompletionsLM(TemplateLM):
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res)
def generate_until(self, requests) -> List[str]:
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
if not requests:
return []
res = []
......@@ -258,7 +258,8 @@ class OpenaiCompletionsLM(TemplateLM):
# todo: more intelligent batching for heterogeneous `until`
for chunk, request_args in tqdm(
list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size))
list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)),
disable=disable_tqdm,
):
inps = []
self._max_gen_toks = request_args.get("max_gen_toks", self.max_gen_toks)
......@@ -308,10 +309,12 @@ class OpenaiCompletionsLM(TemplateLM):
# Isn't used because we override generate_until
raise NotImplementedError()
def loglikelihood_rolling(self, requests) -> List[float]:
def loglikelihood_rolling(
self, requests, disable_tqdm: bool = False
) -> List[float]:
loglikelihoods = []
for (string,) in tqdm([req.args for req in requests]):
for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
......@@ -398,7 +401,7 @@ class OpenaiChatCompletionsLM(LM):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def generate_until(self, requests) -> List[str]:
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
res = defaultdict(list)
re_ords = {}
......@@ -412,7 +415,7 @@ class OpenaiChatCompletionsLM(LM):
[req.args for req in reqs], lambda x: (-len(x[0]), x[0])
)
pbar = tqdm(total=len(requests), disable=(self.rank != 0))
pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
for key, re_ord in re_ords.items():
# n needs to be 1 because messages in
# chat completion are not batch but
......@@ -471,8 +474,8 @@ class OpenaiChatCompletionsLM(LM):
return grouper.get_original(res)
def loglikelihood(self, requests):
def loglikelihood(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.")
def loglikelihood_rolling(self, requests):
def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.")
......@@ -95,9 +95,9 @@ class TextSynthLM(LM):
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
def loglikelihood(self, requests):
def loglikelihood(self, requests, disable_tqdm: bool = False):
res = []
for context, continuation in tqdm(requests):
for context, continuation in tqdm(requests, disable=disable_tqdm):
response = textsynth_completion(
url=self.api_url + "/v1/engines/" + self.engine + "/logprob",
headers={"Authorization": "Bearer " + self.api_key},
......@@ -119,7 +119,7 @@ class TextSynthLM(LM):
assert False
return res
def loglikelihood_rolling(self, requests):
def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
# TODO: The TextSynth API does not support tokenized inputs so we cannot
# manually partition long contexts into smaller rolling windows as
# done for other models derived from `BaseLM`. Override this method
......@@ -129,12 +129,12 @@ class TextSynthLM(LM):
"input tokenization support from TextSynth."
)
def generate_until(self, requests):
def generate_until(self, requests, disable_tqdm: bool = False):
if not requests:
return []
res = []
for request in tqdm(requests):
for request in tqdm(requests, disable=disable_tqdm):
inp = request[0]
request_args = request[1]
until = request_args["until"]
......
import collections
import fnmatch
import gc
import itertools
import time
from functools import wraps
from typing import (
......@@ -262,55 +263,44 @@ def stop_sequences_criteria(
)
def divide(iterable, n) -> List[Iterator]:
"""Divide the elements from *iterable* into *n* parts, maintaining
order.
def undistribute(iterable):
"""
Undoes https://more-itertools.readthedocs.io/en/stable/api.html#more_itertools.distribute .
>>> group_1, group_2 = divide([1, 2, 3, 4, 5, 6], 2)
Re-interleaves results that have been split using more_itertools.distribute:
>>> group_1, group_2 = distribute(2, [1, 2, 3, 4, 5, 6])
>>> list(group_1)
[1, 2, 3]
[1, 3, 5]
>>> list(group_2)
[4, 5, 6]
[2, 4, 6]
>>> undistribute([group_1, group_2])
[1, 2, 3, 4, 5, 6]
If the length of *iterable* is not evenly divisible by *n*, then the
length of the returned iterables will not be identical:
Handles non-uniform component lengths:
>>> children = divide([1, 2, 3, 4, 5, 6, 7], 3)
>>> children = distribute(3, [1, 2, 3, 4, 5, 6, 7])
>>> [list(c) for c in children]
[[1, 2, 3], [4, 5], [6, 7]]
[[1, 4, 7], [2, 5], [3, 6]]
>>> undistribute(children)
[1, 2, 3, 4, 5, 6, 7]
If the length of the iterable is smaller than n, then the last returned
iterables will be empty:
Also handles when some iterables are empty:
>>> children = divide([1, 2, 3], 5)
>>> children = distribute(5, [1, 2, 3])
>>> [list(c) for c in children]
[[1], [2], [3], [], []]
This function will exhaust the iterable before returning and may require
significant storage. If order is not important, see :func:`distribute`,
which does not first pull the iterable into memory.
>>> undistribute(children)
[1, 2, 3]
"""
if n < 1:
raise ValueError("n must be at least 1")
try:
iterable[:0]
except TypeError:
seq = tuple(iterable)
else:
seq = iterable
q, r = divmod(len(seq), n)
ret = []
stop = 0
for i in range(1, n + 1):
start = stop
stop += q + 1 if i <= r else q
ret.append(iter(seq[start:stop]))
return ret
return [
x
for x in itertools.chain.from_iterable(
itertools.zip_longest(*[list(x) for x in iterable])
)
if x is not None
]
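For a quick illustration of the round-trip described in the docstring above, here is a minimal, self-contained sketch; it re-declares `undistribute` so the snippet runs on its own, and `more_itertools` is already a dependency of this code path:

```python
import itertools
from more_itertools import distribute

def undistribute(iterable):
    # Re-interleave the sub-iterables produced by distribute(). zip_longest
    # pads exhausted (shorter) groups with None, which is then dropped;
    # this assumes the payload itself never contains None.
    return [
        x
        for x in itertools.chain.from_iterable(
            itertools.zip_longest(*[list(x) for x in iterable])
        )
        if x is not None
    ]

groups = distribute(3, [1, 2, 3, 4, 5, 6, 7])  # -> [1, 4, 7], [2, 5], [3, 6]
assert undistribute(groups) == [1, 2, 3, 4, 5, 6, 7]
```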
def retry_on_specific_exceptions(
......
import copy
from importlib.metadata import version
from importlib.util import find_spec
from typing import List, Literal, Optional, Tuple, Union
from more_itertools import distribute
from packaging.version import parse as parse_version
from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator, divide
from lm_eval.models.utils import Collator, undistribute
from lm_eval.utils import (
eval_logger,
get_rolling_token_windows,
......@@ -17,7 +20,6 @@ from lm_eval.utils import (
try:
import ray
from ray.util.multiprocessing import Pool
from vllm import LLM, SamplingParams
from vllm.transformers_utils.tokenizer import get_tokenizer
except ModuleNotFoundError:
......@@ -26,14 +28,6 @@ except ModuleNotFoundError:
eval_logger = eval_logger
# adapted from https://github.com/vllm-project/vllm/issues/367#issuecomment-1788341727
def run_inference_one_model(
model_args: dict, sampling_params, requests: List[List[int]]
):
llm = LLM(**model_args)
return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
@register_model("vllm")
class VLLM(TemplateLM):
_DEFAULT_MAX_LENGTH = 2048
......@@ -60,6 +54,7 @@ class VLLM(TemplateLM):
gpu_memory_utilization: float = 0.9,
device: str = "cuda",
data_parallel_size: int = 1,
**kwargs,
):
super().__init__()
......@@ -92,6 +87,7 @@ class VLLM(TemplateLM):
"quantization": quantization,
"seed": int(seed),
}
self.model_args.update(kwargs)
self.batch_size = (
"auto"
if isinstance(batch_size, str) and "auto" in batch_size
......@@ -100,6 +96,12 @@ class VLLM(TemplateLM):
if self.data_parallel_size <= 1:
self.model = LLM(**self.model_args)
else:
assert parse_version(version("vllm")) < parse_version(
"0.3.3"
), "data_parallel is only compatible with vllm < v0.3.3."
eval_logger.warning(
"You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
)
self.model_args["worker_use_ray"] = True
self.batch_size = "auto"
eval_logger.info("Manual batching is not compatible with data parallelism.")
......@@ -181,15 +183,30 @@ class VLLM(TemplateLM):
temperature=0, prompt_logprobs=1, max_tokens=1
)
if self.data_parallel_size > 1:
requests = [list(x) for x in divide(requests, self.data_parallel_size)]
inputs = [(self.model_args, sampling_params, req) for req in requests]
# vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
# also seems to only work with decorator and not with ray.remote() fn
# see https://github.com/vllm-project/vllm/issues/973
# note: this has changed on 0.3.3, and it only works now if num_gpus are set.
# but then tensor_parallel breaks
@ray.remote
def run_inference_one_model(
model_args: dict, sampling_params, requests: List[List[int]]
):
llm = LLM(**model_args)
return llm.generate(
prompt_token_ids=requests, sampling_params=sampling_params
)
with Pool(self.data_parallel_size) as pool:
results = pool.starmap(run_inference_one_model, inputs)
# dispatch requests to all self.data_parallel_size workers, in interleaved fashion
# interleaved important to balance context lengths across workers
requests = [list(x) for x in distribute(self.data_parallel_size, requests)]
inputs = ((self.model_args, sampling_params, req) for req in requests)
object_refs = [run_inference_one_model.remote(*x) for x in inputs]
results = ray.get(object_refs)
# Invoke ray.shutdown() to prevent hang-ups if subsequent calls required.
ray.shutdown()
# flatten results
return [item for sublist in results for item in sublist]
return undistribute(results)
outputs = self.model.generate(
prompt_token_ids=requests,
......@@ -198,10 +215,12 @@ class VLLM(TemplateLM):
)
return outputs
def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
def loglikelihood_rolling(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[float]:
loglikelihoods = []
for (string,) in tqdm([req.args for req in requests]):
for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
rolling_token_windows = list(
map(
make_disjoint_window,
......@@ -227,7 +246,9 @@ class VLLM(TemplateLM):
loglikelihoods.append(string_nll)
return loglikelihoods
def generate_until(self, requests: List[Instance]) -> List[str]:
def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
res = []
# batch tokenize contexts
......@@ -256,7 +277,7 @@ class VLLM(TemplateLM):
pbar = tqdm(
total=len(requests),
disable=(self.rank != 0),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running generate_until requests",
)
# for each different set of kwargs, we execute all requests, by batch.
......@@ -282,8 +303,12 @@ class VLLM(TemplateLM):
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {gen_kwargs}"
)
# add EOS token to stop sequences
eos = self.tokenizer.decode(self.eot_token_id)
if not until:
until = [self.tokenizer.decode(self.eot_token_id)]
until = [eos]
else:
until.append(eos)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
......@@ -390,6 +415,26 @@ class VLLM(TemplateLM):
# The first entry of prompt_logprobs is None because the model has no previous tokens to condition on.
continuation_logprobs_dicts = outputs.prompt_logprobs
def coerce_logprob_to_num(logprob):
# vLLM changed the return type of logprobs from float
# to a Logprob object storing the float value + extra data
# (https://github.com/vllm-project/vllm/pull/3065).
# If we are dealing with vllm's Logprob object, return
# the logprob value stored as an attribute. Otherwise,
# return the object itself (which should be a float
# for older versions of vLLM).
return getattr(logprob, "logprob", logprob)
continuation_logprobs_dicts = [
{
token: coerce_logprob_to_num(logprob)
for token, logprob in logprob_dict.items()
}
if logprob_dict is not None
else None
for logprob_dict in continuation_logprobs_dicts
]
# Calculate continuation_logprobs
# assume ctxlen always >= 1
continuation_logprobs = sum(
......
import abc
import collections
import logging
import os
from functools import partial
from typing import Dict, List, Union
from typing import Dict, List, Mapping, Optional, Union
from lm_eval import utils
from lm_eval.api.task import ConfigurableTask, Task
......@@ -15,7 +14,7 @@ class TaskManager:
"""
def __init__(self, verbosity="INFO", include_path=None) -> None:
def __init__(self, verbosity="INFO", include_path: Optional[str] = None) -> None:
self.verbosity = verbosity
self.include_path = include_path
self.logger = utils.eval_logger
......@@ -26,8 +25,8 @@ class TaskManager:
self.task_group_map = collections.defaultdict(list)
def initialize_tasks(self, include_path: str = None):
"""Creates an dictionary of tasks index.
def initialize_tasks(self, include_path: Optional[str] = None):
"""Creates a dictionary of tasks index.
:param include_path: str = None
An additional path to be searched for tasks
......@@ -59,7 +58,7 @@ class TaskManager:
def match_tasks(self, task_list):
return utils.pattern_match(task_list, self.all_tasks)
def _name_is_registered(self, name):
def _name_is_registered(self, name) -> bool:
if name in self.all_tasks:
return True
return False
......@@ -69,7 +68,7 @@ class TaskManager:
return True
return False
def _name_is_group(self, name):
def _name_is_group(self, name) -> bool:
if self._name_is_registered(name) and (
self.task_index[name]["type"] == "group"
):
......@@ -83,27 +82,29 @@ class TaskManager:
return True
return False
def _config_is_task(self, config):
def _config_is_task(self, config) -> bool:
if ("task" in config) and isinstance(config["task"], str):
return True
return False
def _config_is_group(self, config):
def _config_is_group(self, config) -> bool:
if ("task" in config) and isinstance(config["task"], list):
return True
return False
def _config_is_python_task(self, config):
def _config_is_python_task(self, config) -> bool:
if "class" in config:
return True
return False
def _get_yaml_path(self, name):
assert name in self.task_index
if name not in self.task_index:
raise ValueError
return self.task_index[name]["yaml_path"]
def _get_config(self, name):
assert name in self.task_index
if name not in self.task_index:
raise ValueError
yaml_path = self._get_yaml_path(name)
if yaml_path == -1:
return {}
......@@ -111,7 +112,8 @@ class TaskManager:
return utils.load_yaml_config(yaml_path, mode="full")
def _get_tasklist(self, name):
assert self._name_is_task(name) is False
if self._name_is_task(name):
raise ValueError
return self.task_index[name]["task"]
def _process_alias(self, config, group=None):
......@@ -125,14 +127,15 @@ class TaskManager:
def _load_individual_task_or_group(
self,
name_or_config: Union[str, dict] = None,
parent_name: str = None,
update_config: dict = None,
yaml_path: str = None,
) -> ConfigurableTask:
name_or_config: Optional[Union[str, dict]] = None,
parent_name: Optional[str] = None,
update_config: Optional[dict] = None,
yaml_path: Optional[str] = None,
) -> Mapping:
def load_task(config, task, group=None, yaml_path=None):
if "include" in config:
assert yaml_path is not None
if yaml_path is None:
raise ValueError
config.update(
utils.load_yaml_config(
yaml_path,
......@@ -166,7 +169,7 @@ class TaskManager:
# This checks if we're at the root.
if parent_name is None:
group_config = self._get_config(name_or_config)
if set(group_config.keys()) > set(["task", "group"]):
if set(group_config.keys()) > {"task", "group"}:
update_config = {
k: v
for k, v in group_config.items()
......@@ -228,7 +231,7 @@ class TaskManager:
else:
group_name = name_or_config["group"]
subtask_list = name_or_config["task"]
if set(name_or_config.keys()) > set(["task", "group"]):
if set(name_or_config.keys()) > {"task", "group"}:
update_config = {
k: v
for k, v in name_or_config.items()
......@@ -251,7 +254,7 @@ class TaskManager:
}
return all_subtasks
def load_task_or_group(self, task_list: Union[str, list] = None) -> dict:
def load_task_or_group(self, task_list: Optional[Union[str, list]] = None) -> dict:
"""Loads a dictionary of task objects from a list
:param task_list: Union[str, list] = None
......@@ -272,7 +275,7 @@ class TaskManager:
return self._load_individual_task_or_group(config)
def _get_task_and_group(self, task_dir: str):
"""Creates an dictionary of tasks index with the following metadata,
"""Creates a dictionary of tasks index with the following metadata,
- `type`, that can be either `task`, `python_task`, or `group`.
`task` refers to regular task configs, `python_task` are special
yaml files that consist only of `task` and `class` parameters.
......@@ -358,7 +361,8 @@ def include_path(task_dir):
logger.setLevel(getattr(logging, "INFO"))
logger.info(
"To still use tasks loaded from args.include_path,"
"see an example of the new TaskManager API in https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
"see an example of the new TaskManager API in "
"https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
)
return 0
......@@ -397,7 +401,8 @@ def get_task_name_from_object(task_object):
def get_task_dict(
task_name_list: List[Union[str, Dict, Task]], task_manager: TaskManager = None
task_name_list: List[Union[str, Dict, Task]],
task_manager: Optional[TaskManager] = None,
):
"""Creates a dictionary of task objects from either a name of task, config, or prepared Task object.
......@@ -442,9 +447,10 @@ def get_task_dict(
get_task_name_from_object(task_element): task_element,
}
assert set(task_name_from_string_dict.keys()).isdisjoint(
if not set(task_name_from_string_dict.keys()).isdisjoint(
set(task_name_from_object_dict.keys())
)
):
raise ValueError
return {
**task_name_from_string_dict,
......
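As a point of reference for the `TaskManager`/`get_task_dict` changes above, here is a minimal usage sketch based on the signatures in this diff (the task names are examples from the configs added in this merge):

```python
from lm_eval.tasks import TaskManager, get_task_dict

# Index all available task and group YAMLs; include_path may point to a
# directory of additional task configs (None keeps only the built-in tasks).
task_manager = TaskManager(verbosity="INFO", include_path=None)

# Resolve task/group names into task objects. Per the diff above, a ValueError
# is raised if a name is supplied both as a string and as a prepared Task.
task_dict = get_task_dict(["aexams", "agieval_lsat_ar"], task_manager)
```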
# Arabic EXAMS
### Paper
EXAMS is a resource specializing in multilingual high school exam questions.
Original paper: [EXAMS](https://aclanthology.org/2020.emnlp-main.438/)
The Arabic EXAMS dataset includes five subjects
- Islamic studies
- Biology
- Physics
- Science
- Social
Original dataset: [EXAMS-QA](https://github.com/mhardalov/exams-qa)
EXAMS is a benchmark dataset for cross-lingual and multilingual question answering for high school examinations. It contains 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others. EXAMS offers a unique fine-grained evaluation framework across multiple languages and subjects.
Homepage for Arabic EXAMS: [EXAMS Arabic Homepage](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/EXAMS_Arabic)
### Citation
### Groups and Tasks
#### Groups
- `aexams`: Arabic EXAMS group, covering Islamic Studies, Biology, Science, Physics, and Social.
#### Tasks
The following tasks evaluate subjects in Arabic EXAMS dataset using loglikelihood-based multiple-choice scoring:
- `aexams_IslamicStudies`
- `aexams_Biology`
- `aexams_Science`
- `aexams_Physics`
- `aexams_Social`
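A minimal example of running these tasks through the Python API (the model and settings below are placeholders, not a recommended setup):

```python
import lm_eval

# Evaluate the whole Arabic EXAMS group; individual task names such as
# "aexams_Biology" can be passed instead.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",  # placeholder; substitute any HF causal LM
    tasks=["aexams"],
    num_fewshot=5,
)
print(results["results"])
```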
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation?
* [x] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
group: aexams
dataset_path: Hennara/aexams
test_split: test
fewshot_split: dev
fewshot_config:
sampler: first_n
output_type: multiple_choice
doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nالجواب:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 0.0
"dataset_name": "Biology"
"description": "قم بالإجابة على مايلي في مجال العلوم الحيوية\n\n"
"include": "_default_template_yaml"
"task": "aexams_Biology"
"dataset_name": "IslamicStudies"
"description": "قم بالإجابة على مايلي في مجال العلوم الإسلامية \n\n"
"include": "_default_template_yaml"
"task": "aexams_IslamicStudies"
"dataset_name": "Physics"
"description": "قم بالإجابة على مايلي في مجال الفيزياء \n\n"
"include": "_default_template_yaml"
"task": "aexams_Physics"
"dataset_name": "Science"
"description": "قم بالإجابة على مايلي في مجال العلوم \n\n"
"include": "_default_template_yaml"
"task": "aexams_Science"
"dataset_name": "Social"
"description": "قم بالإجابة على مايلي في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "aexams_Social"
# AGIEval
### Paper
Title: AGIEval: A Human-Centric Benchmark for Evaluating Foundation Models
Abstract: https://arxiv.org/abs/2304.06364
AGIEval is a human-centric benchmark specifically designed to evaluate the general abilities of foundation models in tasks pertinent to human cognition and problem-solving.
This benchmark is derived from 20 official, public, and high-standard admission and qualification exams intended for general human test-takers, such as general college admission tests (e.g., Chinese College Entrance Exam (Gaokao) and American SAT), law school admission tests, math competitions, lawyer qualification tests, and national civil service exams.
Homepage: https://github.com/ruixiangcui/AGIEval
### Citation
```
@misc{zhong2023agieval,
title={AGIEval: A Human-Centric Benchmark for Evaluating Foundation Models},
author={Wanjun Zhong and Ruixiang Cui and Yiduo Guo and Yaobo Liang and Shuai Lu and Yanlin Wang and Amin Saied and Weizhu Chen and Nan Duan},
year={2023},
eprint={2304.06364},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
Please make sure to cite all the individual datasets in your paper when you use them. We provide the relevant citation information below:
```
@inproceedings{ling-etal-2017-program,
title = "Program Induction by Rationale Generation: Learning to Solve and Explain Algebraic Word Problems",
author = "Ling, Wang and
Yogatama, Dani and
Dyer, Chris and
Blunsom, Phil",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2017",
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P17-1015",
doi = "10.18653/v1/P17-1015",
pages = "158--167",
abstract = "Solving algebraic word problems requires executing a series of arithmetic operations{---}a program{---}to obtain a final answer. However, since programs can be arbitrarily complicated, inducing them directly from question-answer pairs is a formidable challenge. To make this task more feasible, we solve these problems by generating answer rationales, sequences of natural language and human-readable mathematical expressions that derive the final answer through a series of small steps. Although rationales do not explicitly specify programs, they provide a scaffolding for their structure via intermediate milestones. To evaluate our approach, we have created a new 100,000-sample dataset of questions, answers and rationales. Experimental results show that indirect supervision of program learning via answer rationales is a promising strategy for inducing arithmetic programs.",
}
@inproceedings{hendrycksmath2021,
title={Measuring Mathematical Problem Solving With the MATH Dataset},
author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
journal={NeurIPS},
year={2021}
}
@inproceedings{Liu2020LogiQAAC,
title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning},
author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},
booktitle={International Joint Conference on Artificial Intelligence},
year={2020}
}
@inproceedings{zhong2019jec,
title={JEC-QA: A Legal-Domain Question Answering Dataset},
author={Zhong, Haoxi and Xiao, Chaojun and Tu, Cunchao and Zhang, Tianyang and Liu, Zhiyuan and Sun, Maosong},
booktitle={Proceedings of AAAI},
year={2020},
}
@article{Wang2021FromLT,
title={From LSAT: The Progress and Challenges of Complex Reasoning},
author={Siyuan Wang and Zhongkun Liu and Wanjun Zhong and Ming Zhou and Zhongyu Wei and Zhumin Chen and Nan Duan},
journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
year={2021},
volume={30},
pages={2201-2216}
}
```
### Groups and Tasks
#### Groups
- `agieval`: Evaluates all tasks listed below.
- `agieval_en`: Evaluates all English subtasks: `agieval_aqua_rat`, `agieval_gaokao_english`, `agieval_logiqa_en`, `agieval_lsat_*`, `agieval_sat_*`, `agieval_math`
- `agieval_cn`: Evaluates all Chinese subtasks:
`agieval_gaokao_biology`, `agieval_gaokao_chemistry`, `agieval_gaokao_chinese`, `agieval_gaokao_geography`,
`agieval_gaokao_history`, `agieval_gaokao_mathqa`, `agieval_gaokao_mathcloze`, `agieval_gaokao_physics`, `agieval_jec_qa_ca`, `agieval_jec_qa_kd`, `agieval_logiqa_zh`
- `agieval_nous`: Evaluates a specific subset of AGIEval tasks (multiple-choice and English-only), namely those reported in https://github.com/teknium1/LLM-Benchmark-Logs/blob/main/benchmark-logs/Mistral-7B-Base.md
#### Tasks
- `agieval_aqua_rat`
- `agieval_gaokao_biology`
- `agieval_gaokao_chemistry`
- `agieval_gaokao_chinese`
- `agieval_gaokao_english`
- `agieval_gaokao_geography`
- `agieval_gaokao_history`
- `agieval_gaokao_mathqa`
- `agieval_gaokao_mathcloze`
- `agieval_gaokao_physics`
- `agieval_jec_qa_ca`
- `agieval_jec_qa_kd`
- `agieval_logiqa_en`
- `agieval_logiqa_zh`
- `agieval_lsat_ar`
- `agieval_lsat_lr`
- `agieval_lsat_rc`
- `agieval_sat_en`
- `agieval_sat_en_without_passage`
- `agieval_sat_math`
- `agieval_math`
group:
- agieval
- agieval_en
- agieval_nous
task: agieval_aqua_rat
dataset_path: hails/agieval-aqua-rat
dataset_name: null
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "{{choices}}"
process_results: !function utils.process_results_mcqa
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
include: aqua-rat.yaml
group:
- agieval
- agieval_cn
task: agieval_gaokao_biology
dataset_path: hails/agieval-gaokao-biology
include: aqua-rat.yaml
group:
- agieval
- agieval_cn
task: agieval_gaokao_chemistry
dataset_path: hails/agieval-gaokao-chemistry
include: aqua-rat.yaml
group:
- agieval
- agieval_cn
task: agieval_gaokao_chinese
dataset_path: hails/agieval-gaokao-chinese
include: aqua-rat.yaml
group:
- agieval
- agieval_en # categorizing as EN because the AGIEval codebase lists this as in `english_qa_tasks`
task: agieval_gaokao_english
dataset_path: hails/agieval-gaokao-english
include: aqua-rat.yaml
group:
- agieval
- agieval_cn
task: agieval_gaokao_geography
dataset_path: hails/agieval-gaokao-geography
include: aqua-rat.yaml
group:
- agieval
- agieval_cn
task: agieval_gaokao_history
dataset_path: hails/agieval-gaokao-history