Commit 89b6bdb3 authored by Baber

Merge branch 'main' into ai2d

parents 59053d58 144a1e58
@@ -4,7 +4,7 @@ from typing import List
from lm_eval.api.filter import FilterEnsemble
from lm_eval.api.registry import get_filter
-from . import extraction, selection, transformation
+from . import custom, extraction, selection, transformation
def build_filter_ensemble(
...
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("custom")
class CustomFilter(Filter):
"""
Custom filter that applies a custom, user-defined function to the model responses.
"""
def __init__(self, **kwargs) -> None:
self.filter_fn = kwargs.pop("filter_fn")
super().__init__(**kwargs)
def apply(self, resps, docs):
return self.filter_fn(resps, docs)
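A minimal usage sketch for the new CustomFilter; the filter_fn below and the direct instantiation are illustrative, not part of this commit.

```python
# filter_fn is user-defined; CustomFilter just pops it from kwargs and applies it.
from lm_eval.filters.custom import CustomFilter

def keep_first_line(resps, docs):
    # resps: one list of model responses per document
    return [[r.splitlines()[0] if r else r for r in doc_resps] for doc_resps in resps]

first_line_filter = CustomFilter(filter_fn=keep_first_line)
print(first_line_filter.apply([["42\nscratch work", "7"]], docs=[{}]))  # [['42', '7']]
```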
@@ -8,12 +8,17 @@ from lm_eval.api.registry import register_filter
@register_filter("regex")
class RegexFilter(Filter):
-""" """
+"""A filter that extracts values from text using regex pattern matching.
+This filter applies a regex pattern to each model response and extracts matched values.
+If no match is found, returns a fallback value. Useful for extracting structured data
+(like numbers) from unstructured model outputs.
+"""
def __init__(
self,
regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
-group_select=0,
+group_select: int = 0,
fallback: str = "[invalid]",
) -> None:
"""
@@ -25,7 +30,7 @@ class RegexFilter(Filter):
self.group_select = group_select
self.fallback = fallback
-def apply(self, resps, docs):
+def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
# here, we assume we have a list, in which each element is
# a list of model responses for some particular input/target pair.
# so we process each of these (same input/target response sets)
@@ -55,12 +60,9 @@ class RegexFilter(Filter):
@register_filter("remove_whitespace")
class WhitespaceFilter(Filter):
-""" """
+"""Filters out leading whitespace from responses."""
-def __init__(self) -> None:
-pass
-def apply(self, resps, docs):
+def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
def filter_set(inst):
filtered_resp = []
for resp in inst:
@@ -105,7 +107,7 @@ class MultiChoiceRegexFilter(RegexFilter):
self.ignore_punctuation = ignore_punctuation
self.regexes_to_ignore = regexes_to_ignore
-def apply(self, resps, docs):
+def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
# here, we assume we have a list, in which each element is
# a list of model responses for some particular input/target pair.
# so we process each of these (same input/target response sets)
@@ -164,7 +166,7 @@ class MultiChoiceRegexFilter(RegexFilter):
fallback_regex = re.compile("|".join(fallback_regexes))
without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile(
-f":[\s]*({without_paren_fallback_regex})"
+rf":[\s]*({without_paren_fallback_regex})"
)
filtered = []
...
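A standalone sketch of the extraction behaviour the new RegexFilter docstring describes, using the same default pattern, group_select and fallback; this mirrors, but is not, the class code.

```python
import re

pattern = re.compile(r"#### (\-?[0-9\.\,]+)")  # default regex_pattern
fallback = "[invalid]"

def extract(resp: str) -> str:
    matches = pattern.findall(resp)
    if matches:
        m = matches[0]  # group_select=0 -> take the first match
        return m[0] if isinstance(m, tuple) else m
    return fallback

print(extract("Working... #### 42"))   # 42
print(extract("no answer marker"))     # [invalid]
```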
@@ -34,9 +34,9 @@ class TakeKFilter(Filter):
# need resp to be subscriptable to check below
resps = list(resps)
# check we have at least k responses per doc, else we can't take the first k
-assert (
-len(resps[0]) >= self.k
-), f"Need at least {self.k} responses per doc to take first {self.k}, but got {len(resps[0])} only! Please increase TaskConfig.repeats ."
+assert len(resps[0]) >= self.k, (
+f"Need at least {self.k} responses per doc to take first {self.k}, but got {len(resps[0])} only! Please increase TaskConfig.repeats ."
+)
return map(lambda r: r[: self.k], resps)
...
@@ -43,9 +43,9 @@ class MapFilter(Filter):
"""
if mapping_dict is None:
mapping_dict = {}
-assert isinstance(
-mapping_dict, dict
-), "Provided mapping_dict is not a dictionary"
+assert isinstance(mapping_dict, dict), (
+"Provided mapping_dict is not a dictionary"
+)
self.mapping_dict = mapping_dict
self.default_value = default_value
...
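A hedged sketch of the kind of lookup the MapFilter arguments control; the mapping_dict and default_value values are illustrative, and the list handling is simplified relative to the real apply method.

```python
mapping_dict = {"yes": 1, "no": 0}
default_value = -1

resps = [["yes", "maybe"], ["no"]]
mapped = [[mapping_dict.get(r, default_value) for r in doc_resps] for doc_resps in resps]
print(mapped)  # [[1, -1], [0]]
```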
@@ -488,7 +488,7 @@ class EvaluationTracker:
else:
dataset_summary += f"{self.general_config_tracker.model_name}\n"
dataset_summary += (
-f"The dataset is composed of {len(card_metadata)-1} configuration(s), each one corresponding to one of the evaluated task.\n\n"
+f"The dataset is composed of {len(card_metadata) - 1} configuration(s), each one corresponding to one of the evaluated task.\n\n"
f"The dataset has been created from {len(results_files)} run(s). Each run can be found as a specific split in each "
'configuration, the split being named using the timestamp of the run.The "train" split is always pointing to the latest results.\n\n'
'An additional configuration "results" store all the aggregated results of the run.\n\n'
@@ -501,7 +501,7 @@ class EvaluationTracker:
)
dataset_summary += (
"## Latest results\n\n"
-f'These are the [latest results from run {latest_datetime}]({last_results_file_path.replace("/resolve/", "/blob/")}) '
+f"These are the [latest results from run {latest_datetime}]({last_results_file_path.replace('/resolve/', '/blob/')}) "
"(note that there might be results for other tasks in the repos if successive evals didn't cover the same tasks. "
'You find each in the results and the "latest" split for each eval):\n\n'
f"```python\n{results_string}\n```"
...
@@ -48,6 +48,9 @@ class WandbLogger:
self.wandb_args: Dict[str, Any] = kwargs
+# pop the step key from the args to save for all logging calls
+self.step = self.wandb_args.pop("step", None)
# initialize a W&B run
if wandb.run is None:
self.run = wandb.init(**self.wandb_args)
@@ -152,11 +155,11 @@ class WandbLogger:
# log the complete eval result to W&B Table
table = make_table(["Tasks"] + columns, "results")
-self.run.log({"evaluation/eval_results": table})
+self.run.log({"evaluation/eval_results": table}, step=self.step)
if "groups" in self.results.keys():
table = make_table(["Groups"] + columns, "groups")
-self.run.log({"evaluation/group_eval_results": table})
+self.run.log({"evaluation/group_eval_results": table}, step=self.step)
def _log_results_as_artifact(self) -> None:
"""Log results as JSON artifact to W&B."""
@@ -174,13 +177,13 @@ class WandbLogger:
"""Log evaluation results to W&B."""
# Log configs to wandb
configs = self._get_config()
-self.run.config.update(configs)
+self.run.config.update(configs, allow_val_change=self.step is not None)
wandb_summary, self.wandb_results = self._sanitize_results_dict()
# update wandb.run.summary with items that were removed
self.run.summary.update(wandb_summary)
# Log the evaluation metrics to wandb
-self.run.log(self.wandb_results)
+self.run.log(self.wandb_results, step=self.step)
# Log the evaluation metrics as W&B Table
self._log_results_as_table()
# Log the results dict as json to W&B Artifacts
@@ -222,7 +225,7 @@ class WandbLogger:
instance = [x["arguments"][0][0] for x in data]
labels = [x["arguments"][0][1] for x in data]
resps = [
-f'log probability of continuation is {x["resps"][0][0][0]} '
+f"log probability of continuation is {x['resps'][0][0][0]} "
+ "\n\n"
+ "continuation will {} generated with greedy sampling".format(
"not be" if not x["resps"][0][0][1] else "be"
@@ -230,7 +233,7 @@ class WandbLogger:
for x in data
]
filtered_resps = [
-f'log probability of continuation is {x["filtered_resps"][0][0]} '
+f"log probability of continuation is {x['filtered_resps'][0][0]} "
+ "\n\n"
+ "continuation will {} generated with greedy sampling".format(
"not be" if not x["filtered_resps"][0][1] else "be"
@@ -329,7 +332,7 @@ class WandbLogger:
# log the samples as a W&B Table
df = self._generate_dataset(eval_preds, self.task_configs.get(task_name))
-self.run.log({f"{task_name}_eval_results": df})
+self.run.log({f"{task_name}_eval_results": df}, step=self.step)
# log the samples as a json file as W&B Artifact
self._log_samples_as_artifact(eval_preds, task_name)
@@ -348,4 +351,4 @@ class WandbLogger:
# log the samples as a json file as W&B Artifact
self._log_samples_as_artifact(eval_preds, task_name)
-self.run.log({f"{group}_eval_results": grouped_df})
+self.run.log({f"{group}_eval_results": grouped_df}, step=self.step)
@@ -11,6 +11,7 @@ from . import (
neuralmagic,
neuron_optimum,
openai_completions,
+optimum_ipex,
optimum_lm,
textsynth,
vllm_causallms,
...
@@ -21,7 +21,7 @@ from typing import (
try:
import requests
-from aiohttp import ClientSession, TCPConnector
+from aiohttp import ClientSession, ClientTimeout, TCPConnector
from tenacity import RetryError, retry, stop_after_attempt, wait_exponential
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio
@@ -81,6 +81,8 @@ class TemplateAPI(TemplateLM):
use_fast_tokenizer: bool = True,
verify_certificate: bool = True,
eos_string: str = None,
# timeout in seconds
timeout: int = 300,
**kwargs,
) -> None:
super().__init__()
@@ -126,6 +128,7 @@ class TemplateAPI(TemplateLM):
self.max_retries = int(max_retries)
self.verify_certificate = verify_certificate
self._eos_string = eos_string
self.timeout = int(timeout)
eval_logger.info(f"Using tokenizer {self.tokenizer_backend}")
if self.tokenizer_backend is None:
@@ -192,9 +195,9 @@ class TemplateAPI(TemplateLM):
"""Helper method to transform the prompt into the expected API input format. messages consist of batched requests"""
if isinstance(messages[0], JsonChatStr):
# for chat completions we need to decode the json string to list[dict,...]
-assert (
-self._batch_size == 1
-), "non-tokenized chat requests are only supported with batch_size=1"
+assert self._batch_size == 1, (
+"non-tokenized chat requests are only supported with batch_size=1"
+)
# list[dict["role":..., "content":...],...]
return json.loads(messages[0].prompt)
@@ -250,12 +253,15 @@ class TemplateAPI(TemplateLM):
return ""
def apply_chat_template(
-self, chat_history: List[Dict[str, str]]
+self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
) -> Union[str, JsonChatStr]:
"""Applies a chat template to a list of chat history between user and model."""
if self.tokenizer_backend == "huggingface" and self.tokenized_requests:
return self.tokenizer.apply_chat_template(
-chat_history, tokenize=False, add_generation_prompt=True
+chat_history,
tokenize=False,
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
)
else:
# bit of a hack. We'll load back before sending to the API
@@ -445,9 +451,13 @@ class TemplateAPI(TemplateLM):
for chunk in chunks:
for cache_key, context_enc, continuation_enc in chunk:
# max_length - 1 as we always have 1 token for generation
-inp = (context_enc + continuation_enc)[-(self.max_length) :]
+inp = (context_enc + continuation_enc)[-self.max_length :]
if len(inp) < len(context_enc + continuation_enc):
eval_logger.warning(
f"Context length ({len(context_enc)}) + continuation length ({len(continuation_enc)}) > max_length ({self.max_length}). Left truncating context."
)
ctxlen = len(context_enc) - max(
-0, len(context_enc) + len(continuation_enc) - (self.max_length)
+0, len(context_enc) + len(continuation_enc) - self.max_length
)
inputs.append(inp)
@@ -466,7 +476,9 @@ class TemplateAPI(TemplateLM):
) -> Union[List[List[str]], List[List[Tuple[float, bool]]]]:
ctxlens = ctxlens if ctxlens else [None] * len(requests)
conn = TCPConnector(limit=self._concurrent)
-async with ClientSession(connector=conn) as session:
+async with ClientSession(
+connector=conn, timeout=ClientTimeout(total=self.timeout)
+) as session:
retry_: Callable[..., Awaitable[Any]] = retry(
stop=stop_after_attempt(self.max_retries),
wait=wait_exponential(multiplier=0.5, min=1, max=10),
@@ -494,9 +506,9 @@ class TemplateAPI(TemplateLM):
return await tqdm_asyncio.gather(*tasks, desc="Requesting API")
def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
-assert (
-self.tokenizer is not None
-), "Tokenizer is required for loglikelihood tasks to compute context lengths."
+assert self.tokenizer is not None, (
+"Tokenizer is required for loglikelihood tasks to compute context lengths."
+)
res = []
def _collate(req: LogLikelihoodInputs):
@@ -589,6 +601,24 @@ class TemplateAPI(TemplateLM):
pbar = tqdm(desc="Requesting API", total=len(requests))
for chunk in chunked:
contexts, all_gen_kwargs, encodings_list = zip(*chunk)
if self.tokenized_requests:
max_gen_toks = all_gen_kwargs[0].get(
"max_gen_toks", self._max_gen_toks
)
max_context_len = self.max_length - max_gen_toks
encodings_list = [x[-max_context_len:] for x in encodings_list]
if any(
len(x) + max_gen_toks > self.max_length for x in encodings_list
):
eval_logger.warning(
f"Some contexts exceeded (max length: ({self.max_length}) - max_gen_toks: ({max_gen_toks}). They were left truncated."
)
else:
eval_logger.info(
"Tokenized requests are disabled. Context + generation length is not checked."
)
req = encodings_list if self.tokenized_requests else contexts
outputs = retry(
stop=stop_after_attempt(self.max_retries),
@@ -620,6 +650,24 @@ class TemplateAPI(TemplateLM):
else:
for chunk in chunked:
contexts, all_gen_kwargs, encodings_list = zip(*chunk)
if self.tokenized_requests:
max_gen_toks = all_gen_kwargs[0].get(
"max_gen_toks", self._max_gen_toks
)
max_context_len = self.max_length - max_gen_toks
encodings_list = [x[-max_context_len:] for x in encodings_list]
if any(
len(x) + max_gen_toks > self.max_length for x in encodings_list
):
eval_logger.warning(
f"Some contexts exceeded (max length: ({self.max_length}) - max_gen_toks ({max_gen_toks}). They were left truncated."
)
else:
eval_logger.info(
"Tokenized requests are disabled. Context + generation length is not checked."
)
req = encodings_list if self.tokenized_requests else contexts
results = itertools.chain.from_iterable(
asyncio.run(
...
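A hedged sketch of passing the new timeout (seconds) through model_args; the model name, base_url and task below are placeholders.

```python
import lm_eval

# Requests that receive no response within `timeout` seconds now fail instead of hanging.
results = lm_eval.simple_evaluate(
    model="local-completions",
    model_args="model=my-model,base_url=http://localhost:8000/v1/completions,timeout=600",
    tasks=["gsm8k"],
)
```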
@@ -51,9 +51,9 @@ class HFMultimodalLM(HFLM):
# modify init behavior.
super().__init__(pretrained, **kwargs)
-assert (
-self.batch_size != "auto"
-), "Batch size 'auto' is not yet supported for hf-multimodal models."
+assert self.batch_size != "auto", (
+"Batch size 'auto' is not yet supported for hf-multimodal models."
+)
self.chat_applied: bool = False
# TODO: phi-3.5 "image placeholders" are <image_1>, <image_2>, ... in order. how to handle this case
@@ -73,9 +73,9 @@ class HFMultimodalLM(HFLM):
or getattr(self.config, "image_token_index", None)
)
)
-assert (
-self.image_token_id is not None
-), "Must have a non-None image_token_id to evaluate a Hugging Face AutoModelForVision2Seq model. Please pass `image_token_id` in `--model_args` if model's config does not already specify one."
+assert self.image_token_id is not None, (
+"Must have a non-None image_token_id to evaluate a Hugging Face AutoModelForVision2Seq model. Please pass `image_token_id` in `--model_args` if model's config does not already specify one."
+)
# get the string this token ID corresponds to
self.image_token = self.tok_decode(
[self.image_token_id], skip_special_tokens=False
@@ -200,7 +200,9 @@ class HFMultimodalLM(HFLM):
return context_enc, continuation_enc, image_enc
-def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
+def apply_chat_template(
+self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
+) -> str:
self.chat_applied = True
if not self.interleave:
for content in chat_history:
@@ -250,7 +252,9 @@ class HFMultimodalLM(HFLM):
)
return self.processor.apply_chat_template(
-chat_history, add_generation_prompt=True
+chat_history,
+add_generation_prompt=add_generation_prompt,
+continue_final_message=not add_generation_prompt,
)
def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
...
@@ -90,6 +90,7 @@ class HFLM(TemplateLM):
delta: Optional[str] = None,
autogptq: Optional[Union[bool, str]] = False,
gptqmodel: Optional[bool] = False,
+gguf_file: Optional[str] = None,
**kwargs,
) -> None:
super().__init__()
@@ -98,7 +99,9 @@ class HFLM(TemplateLM):
eval_logger.warning(
"`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way."
)
-assert not parallelize, "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`"
+assert not parallelize, (
+"`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`"
+)
self._model = pretrained
self._device = self._model.device
self._config = self._model.config
@@ -164,6 +167,7 @@ class HFLM(TemplateLM):
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
+gguf_file=gguf_file,
)
# determine which of 'causal' and 'seq2seq' backends to use for HF models
@@ -178,6 +182,7 @@ class HFLM(TemplateLM):
revision=revision,
trust_remote_code=trust_remote_code,
use_fast_tokenizer=use_fast_tokenizer,
+gguf_file=gguf_file,
)
# if we passed `pretrained` as a string, initialize our model now
@@ -196,6 +201,7 @@ class HFLM(TemplateLM):
delta=delta,
autogptq=autogptq,
gptqmodel=gptqmodel,
+gguf_file=gguf_file,
**kwargs,
)
@@ -508,12 +514,14 @@ class HFLM(TemplateLM):
pretrained: str,
revision: str = "main",
trust_remote_code: bool = False,
+gguf_file: Optional[str] = None,
) -> None:
"""Return the model config for HuggingFace models"""
self._config = transformers.AutoConfig.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
+gguf_file=gguf_file,
)
def _create_model(
@@ -535,6 +543,7 @@ class HFLM(TemplateLM):
delta: Optional[str] = None,
autogptq: Optional[Union[bool, str]] = False,
gptqmodel: Optional[bool] = False,
+gguf_file: Optional[str] = None,
**kwargs,
) -> None:
"""
@@ -564,9 +573,9 @@ class HFLM(TemplateLM):
if not autogptq and not gptqmodel:
if model_kwargs.get("load_in_4bit", None):
-assert (
-transformers.__version__ >= "4.30.0"
-), "load_in_4bit requires transformers >= 4.30.0"
+assert transformers.__version__ >= "4.30.0", (
+"load_in_4bit requires transformers >= 4.30.0"
+)
if transformers.__version__ >= "4.30.0":
if model_kwargs.get("load_in_4bit", None):
if model_kwargs.get("bnb_4bit_compute_dtype", None):
@@ -579,6 +588,7 @@ class HFLM(TemplateLM):
revision=revision,
torch_dtype=get_dtype(dtype),
trust_remote_code=trust_remote_code,
+gguf_file=gguf_file,
**model_kwargs,
)
else:
@@ -676,6 +686,7 @@ class HFLM(TemplateLM):
revision: Optional[str] = "main",
trust_remote_code: Optional[bool] = False,
use_fast_tokenizer: Optional[bool] = True,
+gguf_file: Optional[str] = None,
) -> None:
"""
Helper method during initialization.
@@ -683,14 +694,21 @@ class HFLM(TemplateLM):
Create a tokenizer object corresponding to the correct
tokenizer for value of `pretrained`, or use the pre-initialized tokenizer passed.
"""
+kwargs = {
+"revision": revision,
+"trust_remote_code": trust_remote_code,
+}
+# gguf format embeds tokenizer and is not compatible with hf tokenizer `use_fast` param
+if gguf_file is not None:
+kwargs["gguf_file"] = gguf_file
+else:
+kwargs["use_fast"] = use_fast_tokenizer
if tokenizer:
if isinstance(tokenizer, str):
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-tokenizer,
-revision=revision,
-trust_remote_code=trust_remote_code,
-use_fast=use_fast_tokenizer,
+tokenizer, **kwargs
)
else:
assert isinstance(
@@ -705,10 +723,7 @@ class HFLM(TemplateLM):
# get the HF hub name via accessor on model
model_name = self.model.name_or_path
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-model_name,
-revision=revision,
-trust_remote_code=trust_remote_code,
-use_fast=use_fast_tokenizer,
+model_name, **kwargs
)
return None
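A hedged sketch of what the gguf_file plumbing enables on the transformers side; the repository and file names are placeholders and a transformers release with GGUF support is required.

```python
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

repo = "some-org/some-model-GGUF"   # placeholder repository
gguf = "some-model.Q4_K_M.gguf"     # placeholder quantized file

config = AutoConfig.from_pretrained(repo, gguf_file=gguf)
tokenizer = AutoTokenizer.from_pretrained(repo, gguf_file=gguf)  # use_fast is not passed for GGUF
model = AutoModelForCausalLM.from_pretrained(repo, gguf_file=gguf)

# Through the harness this would be reached via
# --model hf --model_args pretrained=<repo>,gguf_file=<file>
```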
@@ -818,6 +833,12 @@ class HFLM(TemplateLM):
**add_special_tokens,
)
if left_truncate_len:
original_lengths = encoding["input_ids"].size(1)
if original_lengths > left_truncate_len:
eval_logger.warn(
f"Left truncation applied. Original sequence length was {original_lengths}, "
f"truncating to last {left_truncate_len} tokens. Some content will be lost.",
)
encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:] encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
encoding["attention_mask"] = encoding["attention_mask"][ encoding["attention_mask"] = encoding["attention_mask"][
:, -left_truncate_len: :, -left_truncate_len:
@@ -886,16 +907,16 @@ class HFLM(TemplateLM):
self, logits: torch.Tensor, contlen: int = None, inplen: int = None
) -> torch.Tensor:
if self.backend == "causal":
-assert (
-contlen and inplen
-), "Must pass input len and cont. len to select scored logits for causal LM"
+assert contlen and inplen, (
+"Must pass input len and cont. len to select scored logits for causal LM"
+)
# discard right-padding.
# also discard the input/context tokens. we'll only score continuations.
logits = logits[inplen - contlen : inplen]
elif self.backend == "seq2seq":
-assert (
-contlen and not inplen
-), "Selecting scored logits for Seq2SeqLM requires only cont. len"
+assert contlen and not inplen, (
+"Selecting scored logits for Seq2SeqLM requires only cont. len"
+)
# only discard right-padding.
# the logits input to this fn only contain decoder-side tokens.
logits = logits[:contlen]
@@ -905,8 +926,6 @@ class HFLM(TemplateLM):
def loglikelihood_rolling(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[float]:
-loglikelihoods = []
adaptive_batch_size = None
if self.batch_size == "auto":
# using rolling window with maximum context
@@ -915,10 +934,17 @@
print(f"Determined Largest batch size: {batch_size}")
adaptive_batch_size = batch_size
-for (string,) in tqdm(
-[req.args for req in requests], disable=(disable_tqdm or (self.rank != 0))
+# First, collect all windows from all requests
+all_windows = []  # List of (request_idx, window) tuples
+request_window_counts = []  # Track number of windows per request
+for req_idx, (string,) in enumerate(
+tqdm(
+[req.args for req in requests],
+disable=(disable_tqdm or (self.rank != 0)),
+)
):
-rolling_token_windows = list(
+rolling_token_windows: List[Tuple[List[int], List[int]]] = list(
map(
utils.make_disjoint_window,
utils.get_rolling_token_windows(
@@ -931,37 +957,55 @@
)
# TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
-rolling_token_windows = [(None,) + x for x in rolling_token_windows]
-pad_amnt = 0
-if self.world_size > 1:
-# We pad out the external document-level iterator so the inner iterator doesn't hang
-mytensor = torch.tensor(len(rolling_token_windows), device=self.device)
-gathered = (
-self.accelerator.gather(mytensor).cpu().detach().numpy().tolist()
-)
-pad_amnt = max(gathered) - gathered[self.rank]
-if pad_amnt > 0:
-rolling_token_windows += pad_amnt * [rolling_token_windows[0]]
-string_nll = self._loglikelihood_tokens(
-requests=rolling_token_windows,
-disable_tqdm=True,
-override_bs=adaptive_batch_size,
-)
-if (self.world_size > 1) and (pad_amnt > 0):
-string_nll = [x[0] for x in string_nll[:-pad_amnt]]
-else:
-# discard is_greedy
-string_nll = [x[0] for x in string_nll]
-string_nll = sum(string_nll)
-loglikelihoods.append(string_nll)
-# cache this loglikelihood_rolling request
-self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
+windows = [(None,) + x for x in rolling_token_windows]
+# Store windows with their request index
+all_windows.extend((req_idx, window) for window in windows)
+request_window_counts.append(len(windows))
+# Handle distributed case padding
+pad_amnt = 0
+if self.world_size > 1:
+mytensor = torch.tensor(len(all_windows), device=self.device)
+gathered = self.accelerator.gather(mytensor).cpu().detach().numpy().tolist()
+pad_amnt = max(gathered) - gathered[self.rank]
+if pad_amnt > 0:
+all_windows += pad_amnt * [all_windows[0]]
+all_nlls = []
+batch_size = adaptive_batch_size or self.batch_size
+for i in range(0, len(all_windows), batch_size):
+batch = all_windows[i : i + batch_size]
+# Extract just the windows for processing, keeping track of request indices
+batch_indices, batch_windows = zip(*batch)
+batch_nlls = self._loglikelihood_tokens(
+requests=batch_windows,
+disable_tqdm=False,
+override_bs=len(batch_windows),
+)
+# Store results with their request indices
+all_nlls.extend(zip(batch_indices, batch_nlls))
+# Remove padding if necessary
+if (self.world_size > 1) and (pad_amnt > 0):
+all_nlls = all_nlls[:-pad_amnt]
+# Reconstruct per-request loglikelihoods
+loglikelihoods = []
+current_idx = 0
+for window_count in request_window_counts:
+# Get all nlls for this request
+request_nlls = all_nlls[current_idx : current_idx + window_count]
+# Sum up the nlls for this request (discarding is_greedy)
+request_total = sum(nll[0] for _, nll in request_nlls)
+loglikelihoods.append(request_total)
+current_idx += window_count
+string = requests[len(loglikelihoods) - 1].args[0]
+self.cache_hook.add_partial(
+"loglikelihood_rolling", (string,), request_total
+)
return loglikelihoods
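A simplified sketch of the regrouping step introduced above: window scores are computed over one flat list and then summed back per request using the per-request window counts (the real code also carries a request index alongside each score).

```python
all_nlls = [(-1.2, True), (-0.7, False), (-2.0, True)]  # (logprob, is_greedy) per window
request_window_counts = [2, 1]                          # request 0 had 2 windows, request 1 had 1

loglikelihoods, current_idx = [], 0
for window_count in request_window_counts:
    window_nlls = all_nlls[current_idx : current_idx + window_count]
    loglikelihoods.append(sum(nll for nll, _ in window_nlls))  # discard is_greedy
    current_idx += window_count

print(loglikelihoods)  # [-1.9, -2.0] (up to float rounding)
```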
@@ -1073,6 +1117,13 @@ class HFLM(TemplateLM):
# when too long to fit in context, truncate from the left
if self.backend == "causal":
total_length = len(context_enc) + len(continuation_enc)
if total_length > self.max_length + 1:
eval_logger.warn(
f"Combined length of context ({len(context_enc)}) and continuation ({len(continuation_enc)}) "
f"exceeds model's maximum length ({self.max_length}). "
f"Truncating {total_length - self.max_length + 1} tokens from the left."
)
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
dtype=torch.long,
@@ -1280,6 +1331,9 @@ class HFLM(TemplateLM):
if self.backend == "causal":
# max len for inputs = max length, minus room to generate the max new tokens
max_ctx_len = self.max_length - max_gen_toks
assert max_ctx_len > 0, (
f"Invalid configuration: requested max tokens to generate ({max_gen_toks}) must be less than model's maximum sequence length ({self.max_length})."
)
elif self.backend == "seq2seq": elif self.backend == "seq2seq":
# max len for inputs = encoder's whole max_length # max len for inputs = encoder's whole max_length
max_ctx_len = self.max_length max_ctx_len = self.max_length
...@@ -1330,13 +1384,18 @@ class HFLM(TemplateLM): ...@@ -1330,13 +1384,18 @@ class HFLM(TemplateLM):
return res return res
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: def apply_chat_template(
self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
) -> str:
""" """
Method to apply a chat template to a list of chat history between user and model. Method to apply a chat template to a list of chat history between user and model.
""" """
try: try:
chat_templated = self.tokenizer.apply_chat_template( chat_templated = self.tokenizer.apply_chat_template(
chat_history, tokenize=False, add_generation_prompt=True chat_history,
tokenize=False,
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
) )
except jinja2.exceptions.TemplateError: except jinja2.exceptions.TemplateError:
eval_logger.warning( eval_logger.warning(
...@@ -1344,7 +1403,10 @@ class HFLM(TemplateLM): ...@@ -1344,7 +1403,10 @@ class HFLM(TemplateLM):
) )
chat_history = [msg for msg in chat_history if msg["role"] != "system"] chat_history = [msg for msg in chat_history if msg["role"] != "system"]
chat_templated = self.tokenizer.apply_chat_template( chat_templated = self.tokenizer.apply_chat_template(
chat_history, tokenize=False, add_generation_prompt=True chat_history,
tokenize=False,
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
) )
return chat_templated return chat_templated
......
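A hedged sketch of what the add_generation_prompt / continue_final_message toggle does at the tokenizer level; the model name is a placeholder and continue_final_message requires a transformers version that supports it.

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("some-org/some-chat-model")  # placeholder

chat = [
    {"role": "user", "content": "What is 2+2?"},
    {"role": "assistant", "content": "The answer is"},
]

# add_generation_prompt=True: close the last turn and open a fresh assistant turn.
prompt_new_turn = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

# add_generation_prompt=False: keep the final (partial) assistant message open so the
# model continues it, which is what continue_final_message=True requests.
prompt_continue = tok.apply_chat_template(
    chat, tokenize=False, add_generation_prompt=False, continue_final_message=True
)
```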
@@ -187,11 +187,11 @@ class NeMoLM(LM):
**kwargs,
):
try:
+from lightning.pytorch.trainer.trainer import Trainer
from nemo.collections.nlp.modules.common.text_generation_utils import (
generate,
)
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
-from pytorch_lightning.trainer.trainer import Trainer
self.generate = generate
except ModuleNotFoundError as exception:
...
@@ -206,7 +206,7 @@ class NEURON_HF(TemplateLM):
"Only float16/bfloat16/float32 are supported."
)
-print(f"{'='*20} \n exporting model to neuron")
+print(f"{'=' * 20} \n exporting model to neuron")
self.model = CustomNeuronModelForCausalLM.from_pretrained(
pretrained,
revision=revision,
@@ -220,19 +220,17 @@ class NEURON_HF(TemplateLM):
)
neuron_config = self.model.config.neuron
print(
-f"SUCCESS: neuron model exported with config {neuron_config}. \n {'='*20}"
+f"SUCCESS: neuron model exported with config {neuron_config}. \n {'=' * 20}"
)
else:
-print(
-f"{'='*20} \n loading neuron model with config" f" {neuron_config}..."
-)
+print(f"{'=' * 20} \n loading neuron model with config {neuron_config}...")
self.model = CustomNeuronModelForCausalLM.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
low_cpu_mem_usage=low_cpu_mem_usage,
)
-print(f"SUCCESS: neuron model loaded. \n {'='*20}")
+print(f"SUCCESS: neuron model loaded. \n {'=' * 20}")
self.truncation = truncation
@@ -353,9 +351,9 @@ class NEURON_HF(TemplateLM):
)
def _select_cont_toks(self, logits, contlen=None, inplen=None):
-assert (
-contlen and inplen
-), "Must pass input len and cont. len to select scored logits for causal LM"
+assert contlen and inplen, (
+"Must pass input len and cont. len to select scored logits for causal LM"
+)
# discard right-padding.
# also discard the input/context tokens. we'll only score continuations.
logits = logits[inplen - contlen : inplen]
...
import os
from functools import cached_property
+from operator import itemgetter
from typing import Any, Dict, List, Optional, Tuple, Union
from lm_eval.api.registry import register_model
@@ -68,7 +69,9 @@ class LocalCompletionsAPI(TemplateAPI):
if not isinstance(outputs, list):
outputs = [outputs]
for out in outputs:
-for choice, ctxlen in zip(out["choices"], ctxlens):
+for choice, ctxlen in zip(
+sorted(out["choices"], key=itemgetter("index")), ctxlens
+):
assert ctxlen > 0, "Context length must be greater than 0"
logprobs = sum(choice["logprobs"]["token_logprobs"][ctxlen:-1])
tokens_logprobs = choice["logprobs"]["token_logprobs"][ctxlen:-1]
@@ -87,8 +90,10 @@ class LocalCompletionsAPI(TemplateAPI):
if not isinstance(outputs, list):
outputs = [outputs]
for out in outputs:
+tmp = [None] * len(out["choices"])
for choices in out["choices"]:
-res.append(choices["text"])
+tmp[choices["index"]] = choices["text"]
+res = res + tmp
return res
@property
@@ -129,9 +134,9 @@ class LocalChatCompletion(LocalCompletionsAPI):
eos=None,
**kwargs,
) -> dict:
-assert (
-type(messages) is not str
-), "chat-completions require the --apply_chat_template flag."
+assert type(messages) is not str, (
+"chat-completions require the --apply_chat_template flag."
+)
gen_kwargs.pop("do_sample", False)
if "max_tokens" in gen_kwargs:
max_tokens = gen_kwargs.pop("max_tokens")
@@ -157,8 +162,10 @@ class LocalChatCompletion(LocalCompletionsAPI):
if not isinstance(outputs, list):
outputs = [outputs]
for out in outputs:
+tmp = [None] * len(out["choices"])
for choices in out["choices"]:
-res.append(choices["message"]["content"])
+tmp[choices["index"]] = choices["message"]["content"]
+res = res + tmp
return res
def tok_encode(
@@ -201,13 +208,12 @@ class OpenAICompletionsAPI(LocalCompletionsAPI):
return key
def loglikelihood(self, requests, **kwargs):
-assert (
-self.model
-in [
-"babbage-002",
-"davinci-002",
-]
-), f"Prompt loglikelihoods are only supported by OpenAI's API for {['babbage-002', 'davinci-002']}."
+assert self.model in [
+"babbage-002",
+"davinci-002",
+], (
+f"Prompt loglikelihoods are only supported by OpenAI's API for {['babbage-002', 'davinci-002']}."
+)
return super().loglikelihood(requests, **kwargs)
def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
@@ -258,9 +264,9 @@ class OpenAIChatCompletion(LocalChatCompletion):
eos="<|endoftext|>",
**kwargs,
) -> dict:
-assert (
-type(messages) is not str
-), "chat-completions require the --apply_chat_template flag."
+assert type(messages) is not str, (
+"chat-completions require the --apply_chat_template flag."
+)
gen_kwargs.pop("do_sample", False)
if "max_tokens" in gen_kwargs:
max_tokens = gen_kwargs.pop("max_tokens")
...
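A small illustration of why choices are now reordered by their "index" field before being collected: an OpenAI-compatible server may return multiple completions out of order.

```python
from operator import itemgetter

out = {"choices": [{"index": 1, "text": " B"}, {"index": 0, "text": " A"}]}

ordered = [c["text"] for c in sorted(out["choices"], key=itemgetter("index"))]
print(ordered)  # [' A', ' B'] -- back in the order the prompts were sent
```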
from importlib.util import find_spec
from lm_eval import utils
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
from lm_eval.models.utils import get_dtype
eval_logger = utils.eval_logger
@register_model("ipex")
class IPEXLM(HFLM):
"""
using the HuggingFace transformers + optimum-intel ipex backend, can run on intel cpu and intel gpu
"""
def __init__(
self,
**kwargs,
) -> None:
if "backend" in kwargs:
# currently only supports causal models
assert kwargs["backend"] == "causal", (
"Currently, only IPEXModelForCausalLM is supported."
)
super().__init__(
backend=kwargs.pop("backend", "causal"),
**kwargs,
)
def _create_model(
self,
pretrained: str,
revision="main",
dtype="auto",
trust_remote_code=False,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
# (accelerate naive PP (device_map) options)
parallelize=False,
gpus=None,
max_memory_per_gpu=None,
max_cpu_memory=None,
offload_folder="./offload",
# PEFT, delta weights and quantization options
peft=None,
delta=None,
autogptq=False,
gptqmodel=False,
**kwargs,
) -> None:
if not find_spec("optimum"):
raise ModuleNotFoundError(
"package `optimum` is not installed. Please install it via `pip install optimum[ipex]`"
)
else:
from optimum.intel import IPEXModelForCausalLM
model_kwargs = kwargs if kwargs else {}
model_kwargs.update(
self._get_accelerate_args(
parallelize=parallelize,
device_map=kwargs.get("device_map", None),
max_memory_per_gpu=max_memory_per_gpu,
max_cpu_memory=max_cpu_memory,
offload_folder=offload_folder,
gpus=gpus,
)
)
self._model = IPEXModelForCausalLM.from_pretrained(
pretrained,
revision=revision,
torch_dtype=get_dtype(dtype),
trust_remote_code=trust_remote_code,
**model_kwargs,
)
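A hedged usage sketch for the new ipex model type; the pretrained model, dtype and task below are placeholders, and optimum[ipex] must be installed.

```python
import lm_eval

results = lm_eval.simple_evaluate(
    model="ipex",
    model_args="pretrained=gpt2,dtype=bfloat16",
    tasks=["lambada_openai"],
)
```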
@@ -29,9 +29,9 @@ class OptimumLM(HFLM):
) -> None:
if "backend" in kwargs:
# optimum currently only supports causal models
-assert (
-kwargs["backend"] == "causal"
-), "Currently, only OVModelForCausalLM is supported."
+assert kwargs["backend"] == "causal", (
+"Currently, only OVModelForCausalLM is supported."
+)
self.openvino_device = device
@@ -71,6 +71,11 @@ class OptimumLM(HFLM):
else:
model_kwargs["ov_config"] = {}
model_kwargs["ov_config"].setdefault("CACHE_DIR", "")
if "pipeline_parallel" in model_kwargs:
if model_kwargs["pipeline_parallel"]:
model_kwargs["ov_config"]["MODEL_DISTRIBUTION_POLICY"] = (
"PIPELINE_PARALLEL"
)
model_file = Path(pretrained) / "openvino_model.xml"
if model_file.exists():
export = False
...
@@ -155,9 +155,9 @@ def pad_and_concat(
length in the batch. Used for batching inputs and continuations in
seq2seq models.
"""
-assert (
-padding_side == "left" or padding_side == "right"
-), f"Unrecognized padding type: '{padding_side}' not 'left' or 'right'"
+assert padding_side == "left" or padding_side == "right", (
+f"Unrecognized padding type: '{padding_side}' not 'left' or 'right'"
+)
for i, tensor in enumerate(tensors):
if len(tensor.shape) == 2:
...
@@ -76,9 +76,9 @@ class VLLM(TemplateLM):
)
assert "cuda" in device or device is None, "vLLM only supports CUDA"
-assert (
-max_length is None or max_model_len is None
-), "Either max_length or max_model_len may be provided, but not both"
+assert max_length is None or max_model_len is None, (
+"Either max_length or max_model_len may be provided, but not both"
+)
self._max_length = max_model_len if max_model_len is not None else max_length
self.tensor_parallel_size = int(tensor_parallel_size)
@@ -102,7 +102,7 @@ class VLLM(TemplateLM):
self.batch_size = (
"auto"
if isinstance(batch_size, str) and "auto" in batch_size
-else batch_size
+else int(batch_size)
)
if self.data_parallel_size <= 1:
self.model = LLM(**self.model_args)
@@ -142,9 +142,9 @@ class VLLM(TemplateLM):
self._max_gen_toks = max_gen_toks
if lora_local_path is not None:
-assert parse_version(version("vllm")) > parse_version(
-"0.3.0"
-), "lora adapters only compatible with vllm > v0.3.0."
+assert parse_version(version("vllm")) > parse_version("0.3.0"), (
+"lora adapters only compatible with vllm > v0.3.0."
+)
self.lora_request = LoRARequest("finetuned", 1, lora_local_path)
else:
self.lora_request = None
@@ -184,14 +184,21 @@ class VLLM(TemplateLM):
def max_gen_toks(self):
return self._max_gen_toks
-def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
+def apply_chat_template(
+self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
+) -> str:
"""
Method to apply a chat template to a list of chat history between user and model.
"""
-return self.tokenizer.apply_chat_template(
-chat_history, tokenize=False, add_generation_prompt=True
+chat_templated = self.tokenizer.apply_chat_template(
+chat_history,
+tokenize=False,
+add_generation_prompt=add_generation_prompt,
+continue_final_message=not add_generation_prompt,
)
+return chat_templated
@property
def tokenizer_name(self) -> str:
return self.tokenizer.name_or_path.replace("/", "__")
@@ -281,10 +288,21 @@
def loglikelihood_rolling(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[float]:
-loglikelihoods = []
-for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
-rolling_token_windows = list(
+adaptive_batch_size = None
+if self.batch_size == "auto":
+adaptive_batch_size = len(requests)
+# First, collect all windows from all requests
+all_windows = []  # List of (request_idx, window) tuples
+request_window_counts = []  # Track number of windows per request
+for req_idx, (string,) in enumerate(
+tqdm(
+[req.args for req in requests],
+disable=(disable_tqdm or (self.rank != 0)),
+)
+):
+rolling_token_windows: List[Tuple[List[int], List[int]]] = list(
map(
make_disjoint_window,
get_rolling_token_windows(
@@ -297,20 +315,42 @@
)
)
-rolling_token_windows = [(None,) + x for x in rolling_token_windows]
-string_nll = self._loglikelihood_tokens(
-rolling_token_windows,
-)
-# discard is_greedy
-string_nll = [x[0] for x in string_nll]
-string_nll = sum(string_nll)
-loglikelihoods.append(string_nll)
-# cache this loglikelihood_rolling request
-self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
+# TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
+windows = [(None,) + x for x in rolling_token_windows]
+# Store windows with their request index
+all_windows.extend((req_idx, window) for window in windows)
+request_window_counts.append(len(windows))
+all_nlls = []
+batch_size = adaptive_batch_size or int(self.batch_size)
+for i in range(0, len(all_windows), batch_size):
+batch = all_windows[i : i + batch_size]
+# Extract just the windows for processing, keeping track of request indices
+batch_indices, batch_windows = zip(*batch)
+batch_nlls = self._loglikelihood_tokens(
+requests=batch_windows,
+disable_tqdm=False,
+)
+# Store results with their request indices
+all_nlls.extend(zip(batch_indices, batch_nlls))
+# Reconstruct per-request loglikelihoods
+loglikelihoods = []
+current_idx = 0
+for window_count in request_window_counts:
+# Get all nlls for this request
+request_nlls = all_nlls[current_idx : current_idx + window_count]
+# Sum up the nlls for this request (discarding is_greedy)
+request_total = sum(nll[0] for _, nll in request_nlls)
+loglikelihoods.append(request_total)
+current_idx += window_count
+string = requests[len(loglikelihoods) - 1].args[0]
+self.cache_hook.add_partial(
+"loglikelihood_rolling", (string,), request_total
+)
return loglikelihoods
...
@@ -144,7 +144,9 @@ class VLLM_VLM(VLLM):
         )
         return outputs

-    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
+    def apply_chat_template(
+        self, chat_history: List[Dict[str, str]], add_generation_prompt=True
+    ) -> str:
         self.chat_applied = True
         if not self.interleave:
             for content in chat_history:
@@ -194,7 +196,9 @@ class VLLM_VLM(VLLM):
             )

         return self.processor.apply_chat_template(
-            chat_history, add_generation_prompt=True
+            chat_history,
+            add_generation_prompt=add_generation_prompt,
+            continue_final_message=not add_generation_prompt,
         )
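With this change the caller can pass `add_generation_prompt=False` to continue the final (already-started) message instead of opening a fresh assistant turn. A rough sketch of the intended call pattern, assuming a processor whose `apply_chat_template` supports both keywords (as recent `transformers` versions do); `processor` and the messages here are hypothetical:

```python
def render_chat(processor, chat_history, add_generation_prompt=True):
    # Either start a new assistant turn (generation prompt), or ask the template
    # to continue the last message in chat_history (assistant-prefill style).
    return processor.apply_chat_template(
        chat_history,
        add_generation_prompt=add_generation_prompt,
        continue_final_message=not add_generation_prompt,
    )

# render_chat(processor, [{"role": "user", "content": "Hi"}])
#   -> prompt ending with an empty assistant turn, ready for generation
# render_chat(processor, [{"role": "user", "content": "Hi"},
#                         {"role": "assistant", "content": "The answer is"}],
#             add_generation_prompt=False)
#   -> prompt ending mid-message with "The answer is"
```

Tying `continue_final_message` to `not add_generation_prompt` keeps the two modes mutually exclusive, which is what chat templates expect.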
     def generate_until(
@@ -267,7 +271,9 @@ class VLLM_VLM(VLLM):
                 left_truncate_len=max_ctx_len,
             )

-            cont = self._model_generate(inputs, stop=until, generate=True, **kwargs)
+            cont = self._model_generate(
+                inputs, stop=until, generate=True, max_tokens=max_gen_toks, **kwargs
+            )

             for output, context in zip(cont, contexts):
                 generated_text = output.outputs[0].text
...
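The `generate_until` hunk now forwards `max_tokens=max_gen_toks` into `_model_generate`, so the per-request generation cap reaches the engine instead of falling back to a default. A rough sketch of how such keyword arguments typically end up in vLLM sampling parameters; this is illustrative only, and the real `_model_generate` also handles tokenized prompts, batching, and other engine options:

```python
from vllm import LLM, SamplingParams  # assumes vllm is installed


def model_generate(llm: LLM, prompts, stop=None, max_tokens=256, **kwargs):
    # Stop sequences and the generation cap are carried by SamplingParams.
    params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
    return llm.generate(prompts, params)


# outputs = model_generate(llm, ["Question: ...\nAnswer:"], stop=["\n\n"], max_tokens=64)
# text = outputs[0].outputs[0].text
```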
@@ -5,130 +5,139 @@
For more information, including a full list of task names and their precise meanings or sources, follow the links provided to the individual README.md files for each subfolder.

| Task Family | Description | Language(s) |
|-------------|-------------|-------------|
| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
| [arabic_leaderboard_complete](arabic_leaderboard_complete/README.md) | A full version of the tasks in the Open Arabic LLM Leaderboard, focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
| [arabic_leaderboard_light](arabic_leaderboard_light/README.md) | A light version of the tasks in the Open Arabic LLM Leaderboard (i.e., 10% samples of the test set in the original benchmarks), focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
| [AraDICE](aradice/README.md) | A collection of multiple tasks carefully designed to evaluate dialectal and cultural capabilities in large language models (LLMs). | Arabic |
| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English |
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
| [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English |
| [basque_bench](basque_bench/README.md) | Collection of tasks in Basque encompassing various evaluation areas. | Basque |
| [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque |
| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
| [catalan_bench](catalan_bench/README.md) | Collection of tasks in Catalan encompassing various evaluation areas. | Catalan |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
| csatqa | Tasks related to SAT and other standardized testing questions for academic assessment. | Korean |
| [drop](drop/README.md) | Tasks requiring numerical reasoning, reading comprehension, and question answering. | English |
| [eq_bench](eq_bench/README.md) | Tasks focused on equality and ethics in question answering and decision-making. | English |
| [eus_exams](eus_exams/README.md) | Tasks based on various professional and academic exams in the Basque language. | Basque |
| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque |
| [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque |
| [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque |
| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
| [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English |
| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French |
| [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. | Galician |
| [global_mmlu](global_mmlu/README.md) | Collection of culturally sensitive and culturally agnostic MMLU tasks in 15 languages with human translations or post-edits. | Multiple (15 languages) |
| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
| [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English |
| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
| [haerae](haerae/README.md) | Tasks focused on assessing detailed factual and historical knowledge. | Korean |
| [headqa](headqa/README.md) | A high-level education-based question answering dataset to test specialized knowledge. | Spanish, English |
| [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English |
| [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English |
| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
| [histoires_morales](histoires_morales/README.md) | A dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | French (Some MT) |
| [hrm8k](hrm8k/README.md) | A challenging bilingual math reasoning benchmark for Korean and English. | Korean (Some MT), English (Some MT) |
| [humaneval](humaneval/README.md) | Code generation tasks that measure functional correctness for synthesizing programs from docstrings. | Python |
| [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English |
| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English |
| [japanese_leaderboard](japanese_leaderboard/README.md) | Japanese language understanding tasks to benchmark model performance on various linguistic aspects. | Japanese |
| [kbl](kbl/README.md) | Korean Benchmark for Legal Language Understanding. | Korean |
| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean |
| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean |
| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
| [lambada](lambada/README.md) | Tasks designed to predict the endings of text passages, testing language prediction skills. | English |
| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English |
| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). These tasks are static and will not change over time. | English |
| [lingoly](lingoly/README.md) | Challenging logical reasoning benchmark in low-resource languages with controls for memorization. | English, Multilingual |
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
| [mbpp](mbpp/README.md) | A benchmark designed to measure the ability to synthesize short Python programs from natural language descriptions. | Python |
| [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English |
| [med_concepts_qa](med_concepts_qa/README.md) | Benchmark for evaluating LLMs on their abilities to interpret medical codes and distinguish between medical concepts. | English |
| [metabench](metabench/README.md) | Distilled versions of six popular benchmarks which are highly predictive of overall benchmark performance and of a single general ability latent trait. | English |
| medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English |
| medqa | Multiple choice question answering based on the United States Medical License Exams. | |
| [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu |
| [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English |
| [mlqa](mlqa/README.md) | MultiLingual Question Answering benchmark dataset for evaluating cross-lingual question answering performance. | English, Arabic, German, Spanish, Hindi, Vietnamese, Simplified Chinese |
| [mmlu](mmlu/README.md) | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
| [mmlu_pro](mmlu_pro/README.md) | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English |
| [mmlu-pro-plus](mmlu-pro-plus/README.md) | A new test set for evaluating shortcut learning and higher-order reasoning of LLMs. | English |
| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English |
| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
| [moral_stories](moral_stories/README.md) | A crowd-sourced dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | English |
| [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English |
| [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English |
| [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
| [okapi/hellaswag_multilingual](okapi/hellaswag_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (30 languages) **Machine Translated.** |
| okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) **Machine Translated.** |
| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
| [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English |
| [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish |
| [portuguese_bench](portuguese_bench/README.md) | Collection of tasks in European Portuguese encompassing various evaluation areas. | Portuguese |
| [prost](prost/README.md) | Tasks requiring understanding of professional standards and ethics in various domains. | English |
| [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English |
| [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English |
| [qasper](qasper/README.md) | Question Answering dataset based on academic papers, testing in-depth scientific knowledge. | English |
| [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English |
| realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. | |
| [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English |
| [score](score/README.md) | Systematic consistency and robustness evaluation for LLMs on 3 datasets (MMLU-Pro, AGIEval, and MATH). | English |
| [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English |
| [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English |
| [spanish_bench](spanish_bench/README.md) | Collection of tasks in Spanish encompassing various evaluation areas. | Spanish |
| [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. | English |
| [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English |
| [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English |
| [super_glue](super_glue/README.md) | A suite of challenging tasks designed to test a range of language understanding skills. | English |
| [swag](swag/README.md) | Situations With Adversarial Generations, predicting the next event in videos. | English |
| [swde](swde/README.md) | Information extraction tasks from semi-structured web pages. | English |
| [tinyBenchmarks](tinyBenchmarks/README.md) | Evaluation of large language models with fewer examples using tiny versions of popular benchmarks. | English |
| [tmmluplus](tmmluplus/README.md) | An extended set of tasks under the TMMLU framework for broader academic assessments. | Traditional Chinese |
| [toxigen](toxigen/README.md) | Tasks designed to evaluate language models on their propensity to generate toxic content. | English |
| [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese |
| [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English |
| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English |
| [turkishmmlu](turkishmmlu/README.md) | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams. | Turkish |
| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English |
| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English |
| [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English |
| [wikitext](wikitext/README.md) | Tasks based on text from Wikipedia articles to assess language modeling and generation. | English |
| [winogrande](winogrande/README.md) | A large-scale dataset for coreference resolution, inspired by the Winograd Schema Challenge. | English |
| [wmdp](wmdp/README.md) | A benchmark with the objective of minimizing performance, based on potentially-sensitive multiple-choice knowledge questions. | English |
| [wmt2016](wmt2016/README.md) | Tasks from the WMT 2016 shared task, focusing on translation between multiple languages. | English, Czech, German, Finnish, Russian, Romanian, Turkish |
| [wsc273](wsc273/README.md) | The Winograd Schema Challenge, a test of commonsense reasoning and coreference resolution. | English |
| [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese |
| [xnli](xnli/README.md) | Cross-Lingual Natural Language Inference to test understanding across different languages. | Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese |
| [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque |
| [xquad](xquad/README.md) | Cross-lingual Question Answering Dataset in multiple languages. | Arabic, German, Greek, English, Spanish, Hindi, Romanian, Russian, Thai, Turkish, Vietnamese, Chinese |
| [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese |
| [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese |