Commit 173b2bc3 authored by Baber

Merge branch 'main' into humaneval

# Conflicts:
#	lm_eval/api/task.py
parents 74344829 bb098f13
import copy
import json
import os
from functools import lru_cache
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast

from tqdm import tqdm

from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.api_models import JsonChatStr
from lm_eval.utils import eval_logger, simple_parse_args_string


class LogLikelihoodResult(NamedTuple):
    log_likelihood: float
    is_greedy: bool


def _verify_credentials(creds: Any) -> None:
    """
    Verifies that all required keys are present in the credentials dictionary.

    Args:
        creds (Any): A dictionary containing the credentials.

    Raises:
        ValueError: If any of the necessary credentials are missing, with guidance on which
            environment variables need to be set.
    """
    required_keys = ["apikey", "url", "project_id"]
    env_var_mapping = {
        "apikey": "WATSONX_API_KEY",
        "url": "WATSONX_URL",
        "project_id": "WATSONX_PROJECT_ID",
    }
    missing_keys = [key for key in required_keys if key not in creds or not creds[key]]
    if missing_keys:
        missing_env_vars = [env_var_mapping[key] for key in missing_keys]
        raise ValueError(
            f"Missing required credentials: {', '.join(missing_keys)}. "
            f"Please set the following environment variables: {', '.join(missing_env_vars)}"
        )


@lru_cache(maxsize=None)
def get_watsonx_credentials() -> Dict[str, str]:
    """
    Retrieves Watsonx API credentials from environment variables.

    Returns:
        Dict[str, str]: A dictionary containing the credentials necessary for authentication,
            including keys such as `apikey`, `url`, and `project_id`.

    Raises:
        ValueError: If the credentials format is invalid or any of the necessary credentials are missing.
    """
    credentials = {
        "apikey": os.getenv("WATSONX_API_KEY", None),
        "url": os.getenv("WATSONX_URL", None),
        "project_id": os.getenv("WATSONX_PROJECT_ID", None),
    }
    _verify_credentials(credentials)
    return credentials
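

# Example (for illustration only; values are placeholders, not from this module):
# the three environment variables read above would typically be exported before
# launching an evaluation, e.g.
#   export WATSONX_API_KEY=<your API key>
#   export WATSONX_URL=https://us-south.ml.cloud.ibm.com
#   export WATSONX_PROJECT_ID=<your project id>
# where the URL should point at the watsonx endpoint for your region.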
@register_model("watsonx_llm")
class WatsonxLLM(LM):
"""
Implementation of LM model interface for evaluating Watsonx model with the lm_eval framework.
See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md for reference.
"""
@classmethod
def create_from_arg_string(
cls: Type["WatsonxLLM"],
arg_string: str,
additional_config: Optional[Dict] = None,
) -> "WatsonxLLM":
"""
Allow the user to specify model parameters (TextGenerationParameters) in CLI arguments.
"""
try:
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
except ImportError:
raise ImportError(
"Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
)
args = simple_parse_args_string(arg_string)
args.update(additional_config)
model_id = args.pop("model_id", None)
if model_id is None:
raise ValueError("'model_id' is required, please pass it in 'model_args'")
if not args.get("do_sample", None):
args["temperature"] = None
args["top_p"] = None
args["top_k"] = None
args["seed"] = None
generate_params = {
GenParams.DECODING_METHOD: (
"greedy" if not args.get("do_sample", None) else "sample"
),
GenParams.LENGTH_PENALTY: args.get("length_penalty", None),
GenParams.TEMPERATURE: args.get("temperature", None),
GenParams.TOP_P: args.get("top_p", None),
GenParams.TOP_K: args.get("top_k", None),
GenParams.RANDOM_SEED: args.get("seed", None),
GenParams.REPETITION_PENALTY: args.get("repetition_penalty", None),
GenParams.MIN_NEW_TOKENS: args.get("min_new_tokens", None),
GenParams.MAX_NEW_TOKENS: args.get("max_new_tokens", 256),
GenParams.STOP_SEQUENCES: args.get("stop_sequences", None),
GenParams.TIME_LIMIT: args.get("time_limit", None),
GenParams.TRUNCATE_INPUT_TOKENS: args.get("truncate_input_tokens", None),
GenParams.RETURN_OPTIONS: {
"generated_tokens": True,
"input_tokens": True,
"token_logprobs": True,
"token_ranks": True,
},
}
generate_params = {k: v for k, v in generate_params.items() if v is not None}
return cls(
watsonx_credentials=get_watsonx_credentials(),
model_id=model_id,
generate_params=generate_params,
)
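
    # Illustrative example (hypothetical values, not from this file): a CLI call such as
    #   lm_eval --model watsonx_llm \
    #       --model_args model_id=ibm/granite-13b-instruct-v2,do_sample=True,temperature=0.7,max_new_tokens=128
    # is parsed by `create_from_arg_string`, which pops `model_id` and maps the remaining
    # comma-separated keys onto the GenParams dictionary built above.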

    def __init__(
        self,
        watsonx_credentials: Dict,
        model_id,
        generate_params: Optional[Dict[Any, Any]] = None,
    ) -> None:
        try:
            from ibm_watsonx_ai import APIClient
            from ibm_watsonx_ai.foundation_models import ModelInference
        except ImportError:
            raise ImportError(
                "Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
            )

        super().__init__()
        client = APIClient(watsonx_credentials)
        project_id = watsonx_credentials.get("project_id", None)
        deployment_id = watsonx_credentials.get("deployment_id", None)
        client.set.default_project(project_id)
        self.generate_params = generate_params
        self.model = ModelInference(
            model_id=model_id,
            deployment_id=deployment_id,
            api_client=client,
            project_id=project_id,
        )
        self._model_id = model_id

    @staticmethod
    def _has_stop_token(response_tokens: List[str], context_tokens: List[str]) -> bool:
        """
        Determines whether a stop token has been generated in the `response_tokens` compared to the
        `context_tokens`. If the tokens do not match as expected, the function raises a RuntimeError,
        indicating a possible misalignment between the tokens generated by the tokenizer and the model.

        Args:
            response_tokens (List[str]): The list of tokens generated as a response by the model.
            context_tokens (List[str]): The list of tokens representing the input context.

        Returns:
            bool: True if the `response_tokens` likely contain a stop token that terminates the sequence,
                otherwise raises an exception.

        Raises:
            RuntimeError: If there is an unexpected mismatch between the `response_tokens` and the `context_tokens`.
        """
        context_length = len(context_tokens)
        if response_tokens[: context_length - 1] == context_tokens[:-1]:
            return (
                response_tokens[-1] != context_tokens[-1]
            )  # only the last token differs, probably a stop sequence (</s>)
        raise RuntimeError(
            f"There is an unexpected difference between tokenizer and model tokens:\n"
            f"context_tokens={context_tokens}\n"
            f"response_tokens={response_tokens[:context_length]}"
        )

    def _check_model_logprobs_support(self):
        """
        Verifies that the model supports returning log probabilities for input tokens.

        This function sends a prompt to the model and checks whether the model's response
        includes log probabilities for the input tokens. If log probabilities are not present,
        it raises a `RuntimeError`, indicating that the model is not supported.

        Raises:
            RuntimeError: If the model does not return log probabilities for input tokens.
        """
        tokens = self.model.generate_text(
            prompt=["The best ice cream flavor is:"],
            params=self.generate_params,
            raw_response=True,
        )[0]["results"][0]
        if all(token.get("logprob", None) is None for token in tokens["input_tokens"]):
            raise RuntimeError(
                f"Model {self._model_id} is not supported: does not return logprobs for input tokens"
            )

    def _get_log_likelihood(
        self,
        input_tokens: List[Dict[str, float]],
        context_tokens: List[Dict[str, float]],
    ) -> LogLikelihoodResult:
        """
        Calculates the log likelihood of the generated tokens compared to the context tokens.

        Args:
            input_tokens (List[Dict[str, float]]): A list of token dictionaries, each containing
                token information like `text` and `logprob`.
            context_tokens (List[Dict[str, float]]): A list of token dictionaries representing
                the input context.

        Returns:
            LogLikelihoodResult: An object containing the calculated log likelihood and a boolean
                flag indicating if the tokens were generated greedily.
        """
        response_tokens = [token["text"] for token in input_tokens]
        context_length = len(context_tokens)

        if self._has_stop_token(response_tokens, context_tokens):
            context_length -= 1

        return LogLikelihoodResult(
            log_likelihood=sum(
                token.get("logprob", 0) for token in input_tokens[context_length:]
            ),
            is_greedy=all(
                token["rank"] == 1 for token in input_tokens[context_length:]
            ),
        )

    def generate_until(self, requests: List[Instance]) -> List[str]:
        """
        Generates text responses for a list of requests, with progress tracking and caching.

        Args:
            requests (List[Instance]): A list of instances, each containing a text input to be processed.

        Returns:
            List[str]: A list of generated responses.
        """
        requests = [request.args for request in requests]
        results = []
        for request in tqdm(
            requests,
            desc="Running generate_until function ...",
        ):
            context, continuation = request
            try:
                if isinstance(context, JsonChatStr):
                    context = json.loads(context.prompt)
                    response = self.model.chat(context, self.generate_params)
                    response = response["choices"][0]["message"]["content"]
                else:
                    response = self.model.generate_text(context, self.generate_params)
            except Exception as exp:
                eval_logger.error("Error while generating text.")
                raise exp
            results.append(response)
            self.cache_hook.add_partial(
                "generate_until", (context, continuation), response
            )
        return results

    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
        """
        Args:
            requests: Each request contains Instance.args : Tuple[str, str] containing:
                1. an input string to the LM and
                2. a target string on which the loglikelihood of the LM producing this target,
                   conditioned on the input, will be returned.

        Returns:
            Tuple (loglikelihood, is_greedy) for each request, according to the input order:
                loglikelihood: probability of generating the target string conditioned on the input
                is_greedy: True if and only if the target string would be generated by greedy sampling from the LM
        """
        try:
            from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
        except ImportError:
            raise ImportError(
                "Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
            )

        self._check_model_logprobs_support()
        generate_params = copy.copy(self.generate_params)
        generate_params[GenParams.MAX_NEW_TOKENS] = 1

        requests = [request.args for request in requests]
        results: List[LogLikelihoodResult] = []
        # Note: we do not batch requests here because loglikelihood values are currently
        # non-deterministic when a batch of requests is sent.
        for request in tqdm(
            requests,
            desc="Running loglikelihood function ...",
        ):
            context, continuation = request
            try:
                tokenized_context = self.model.tokenize(
                    prompt=context, return_tokens=True
                )["result"]["tokens"]
            except Exception as exp:
                eval_logger.error("Error while tokenizing the model input.")
                raise exp
            input_prompt = context + continuation
            try:
                response = self.model.generate_text(
                    prompt=input_prompt, params=generate_params, raw_response=True
                )
            except Exception as exp:
                eval_logger.error("Error while generating text.")
                raise exp
            log_likelihood_response = self._get_log_likelihood(
                response["results"][0]["input_tokens"], tokenized_context
            )
            results.append(log_likelihood_response)
            self.cache_hook.add_partial(
                "loglikelihood",
                (context, continuation),
                (
                    log_likelihood_response.log_likelihood,
                    log_likelihood_response.is_greedy,
                ),
            )
        return cast(List[Tuple[float, bool]], results)

    def loglikelihood_rolling(self, requests) -> List[Tuple[float, bool]]:
        """
        Used to evaluate perplexity on a data distribution.

        Args:
            requests: Each request contains Instance.args : Tuple[str] containing an input string to the model
                whose entire loglikelihood, conditioned on purely the EOT token, will be calculated.

        Returns:
            Tuple (loglikelihood,) for each request according to the input order:
                loglikelihood: solely the probability of producing each piece of text given no starting input.
        """
        try:
            from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
        except ImportError:
            raise ImportError(
                "Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
            )

        self._check_model_logprobs_support()
        generate_params = copy.deepcopy(self.generate_params)
        generate_params[GenParams.MAX_NEW_TOKENS] = 1

        requests = [request.args for request in requests]
        results: List[LogLikelihoodResult] = []
        # Note: we do not batch requests here because loglikelihood values are currently
        # non-deterministic when a batch of requests is sent.
        for request in tqdm(
            requests,
            desc="Running loglikelihood_rolling function ...",
        ):
            context, continuation = request
            try:
                response = self.model.generate_text(
                    prompt=context, params=generate_params, raw_response=True
                )
            except Exception as exp:
                eval_logger.error("Error while generating text.")
                raise exp
            log_likelihood_response = self._get_log_likelihood(
                response["results"][0]["input_tokens"], []
            )
            results.append(log_likelihood_response)
            self.cache_hook.add_partial(
                "loglikelihood_rolling",
                (context, continuation),
                log_likelihood_response.log_likelihood,
            )
        return cast(List[Tuple[float, bool]], results)

    @property
    def tokenizer_name(self) -> str:
        return ""

    def apply_chat_template(
        self, chat_history: List[Dict[str, str]]
    ) -> List[Dict[str, str]]:
        # A hack similar to the one in api_models, to allow encoding the chat history for the cache
        return JsonChatStr(json.dumps(chat_history))
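

# A minimal usage sketch, not part of the module above: it assumes the three
# WATSONX_* environment variables are already exported and uses lm_eval's
# `simple_evaluate` entrypoint. The model id, task, and limit are illustrative
# placeholders rather than recommended settings.
if __name__ == "__main__":
    import lm_eval

    results = lm_eval.simple_evaluate(
        model="watsonx_llm",
        model_args="model_id=ibm/granite-13b-instruct-v2,max_new_tokens=256",
        tasks=["hellaswag"],
        limit=10,
    )
    print(results["results"])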
@@ -12,6 +12,8 @@ class MambaLMWrapper(HFLM):
     def __init__(
         self,
         pretrained="state-spaces/mamba-130m",
+        # To use the HF compatible variant
+        is_hf: bool = False,
         **kwargs,
     ) -> None:
         """
@@ -52,7 +54,7 @@ class MambaLMWrapper(HFLM):
         if "backend" in kwargs:
             # mamba currently only supports causal models
             assert kwargs["backend"] == "causal"
+        self.is_hf = is_hf or (True if pretrained.endswith("hf") else False)
         super().__init__(
             pretrained=pretrained,
             # set appropriate defaults for tokenizer, max length, etc
@@ -67,15 +69,18 @@ class MambaLMWrapper(HFLM):
         pretrained: str,
         **kwargs,
     ) -> None:
-        try:
-            from mamba_ssm.utils.hf import load_config_hf  # noqa: F811
-        except ModuleNotFoundError:
-            raise Exception(
-                "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
-please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
-            )
-
-        self._config = load_config_hf(pretrained)
+        if self.is_hf:
+            super()._get_config(pretrained, **kwargs)
+        else:
+            try:
+                from mamba_ssm.utils.hf import load_config_hf  # noqa: F811
+            except ModuleNotFoundError as exception:
+                raise type(exception)(
+                    "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
+please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
+                )
+
+            self._config = load_config_hf(pretrained)

     def _create_model(
         self,
@@ -86,24 +91,32 @@ please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba
         # Mamba does not support arbitrary HF from_pretrained() args
         **kwargs,
     ) -> None:
-        try:
-            from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel  # noqa: F811
-        except ModuleNotFoundError:
-            raise Exception(
-                "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
-please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
-            )
-
-        self._model = MambaLMHeadModel.from_pretrained(
-            pretrained,
-            device=self._device,
-            dtype=torch.float16
-            if dtype == "auto"
-            else lm_eval.models.utils.get_dtype(dtype),
-        )
+        if self.is_hf:
+            super()._create_model(pretrained, dtype=dtype, **kwargs)
+        else:
+            try:
+                from mamba_ssm.models.mixer_seq_simple import (
+                    MambaLMHeadModel,  # noqa: F811
+                )
+            except ModuleNotFoundError as exception:
+                raise type(exception)(
+                    "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
+please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
+                )
+
+            self._model = MambaLMHeadModel.from_pretrained(
+                pretrained,
+                device=self._device,
+                dtype=torch.float16
+                if dtype == "auto"
+                else lm_eval.models.utils.get_dtype(dtype),
+            )

     def _model_generate(self, context, max_length, stop, **generation_kwargs):
-        for key in ("do_sample", "attention_mask"):
+        remove_arg = (
+            ["attention_mask"] if self.is_hf else ["do_sample", "attention_mask"]
+        )
+        for key in remove_arg:
             if key in generation_kwargs:
                 generation_kwargs.pop(key)
@@ -116,11 +129,37 @@ please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba
         # self.tokenizer, stop, 1, context.shape[0]
         # )
-        return self.model.generate(
-            input_ids=context,
-            max_length=max_length,
-            # stopping_criteria=stopping_criteria,
-            # pad_token_id=self.tokenizer.pad_token_id,
-            # use_cache=True,
-            **generation_kwargs,
-        )
+        if not self.is_hf:
+            return self.model.generate(
+                input_ids=context,
+                max_length=max_length,
+                # stopping_criteria=stopping_criteria,
+                # pad_token_id=self.tokenizer.pad_token_id,
+                # use_cache=True,
+                **generation_kwargs,
+            )
+        else:
+            stopping_criteria = lm_eval.models.utils.stop_sequences_criteria(
+                self.tokenizer,
+                stop,
+                context.shape[1],
+                context.shape[0],
+            )
+
+            generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
+            do_sample = generation_kwargs.get("do_sample", None)
+
+            # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
+            if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
+                generation_kwargs["do_sample"] = do_sample = False
+
+            if do_sample is False and generation_kwargs.get("temperature") == 0.0:
+                generation_kwargs.pop("temperature")
+
+            return self.model.generate(
+                input_ids=context,
+                max_length=max_length,
+                stopping_criteria=stopping_criteria,
+                pad_token_id=self.tokenizer.pad_token_id,
+                use_cache=True,
+                **generation_kwargs,
+            )
@@ -39,8 +39,8 @@ def _patch_pretrained_cfg(
 ):
     try:
         import omegaconf
-    except ModuleNotFoundError:
-        raise Exception(
+    except ModuleNotFoundError as exception:
+        raise type(exception)(
             "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
             "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
             "or installing nemo following https://github.com/NVIDIA/NeMo.",
@@ -79,8 +79,8 @@ def load_model(
             MegatronGPTModel,
         )
         from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector
-    except ModuleNotFoundError:
-        raise Exception(
+    except ModuleNotFoundError as exception:
+        raise type(exception)(
             "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
             "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
             "or installing nemo following https://github.com/NVIDIA/NeMo.",
@@ -140,8 +140,8 @@ def load_model(
 def setup_distributed_environment(trainer):
     try:
         from nemo.utils.app_state import AppState
-    except ModuleNotFoundError:
-        raise Exception(
+    except ModuleNotFoundError as exception:
+        raise type(exception)(
             "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
             "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
             "or installing nemo following https://github.com/NVIDIA/NeMo.",
@@ -187,15 +187,15 @@ class NeMoLM(LM):
         **kwargs,
     ):
         try:
+            from lightning.pytorch.trainer.trainer import Trainer
             from nemo.collections.nlp.modules.common.text_generation_utils import (
                 generate,
             )
             from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
-            from pytorch_lightning.trainer.trainer import Trainer

             self.generate = generate
-        except ModuleNotFoundError:
-            raise Exception(
+        except ModuleNotFoundError as exception:
+            raise type(exception)(
                 "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
                 "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
                 "or installing nemo following https://github.com/NVIDIA/NeMo.",
@@ -386,6 +386,9 @@ class NeMoLM(LM):
             string_nll = sum(string_nll)
             loglikelihoods.append(string_nll)

+            # cache this loglikelihood_rolling request
+            self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
+
         return loglikelihoods

     def _loglikelihood_tokens(self, requests, disable_tqdm=False):
@@ -468,6 +471,9 @@ class NeMoLM(LM):
                 answer = (logprob, is_greedy)

                 if cache_key is not None:
+                    # special case: loglikelihood_rolling produces a number of loglikelihood requests
+                    # all with cache key None. instead do add_partial on the per-example level
+                    # in the loglikelihood_rolling() function for those.
                     self.cache_hook.add_partial("loglikelihood", cache_key, answer)

                 res.append(answer)
...
@@ -38,8 +38,8 @@ class SparseMLLM(HFLM):
     ) -> None:
         try:
             from sparseml.transformers import SparseAutoModelForCausalLM
-        except ModuleNotFoundError:
-            raise Exception(
+        except ModuleNotFoundError as exception:
+            raise type(exception)(
                 "Package `sparseml` is not installed. "
                 "Please install it via `pip install sparseml[transformers]`"
             )
@@ -88,8 +88,8 @@ class SparseMLLM(HFLM):
     def _get_config(self, pretrained: str, **kwargs) -> None:
         try:
             from sparseml.transformers import SparseAutoConfig
-        except ModuleNotFoundError:
-            raise Exception(
+        except ModuleNotFoundError as exception:
+            raise type(exception)(
                 "Package `sparseml` is not installed. "
                 "Please install it via `pip install sparseml[transformers]`"
             )
@@ -112,8 +112,8 @@ class SparseMLLM(HFLM):
     ) -> None:
         try:
             from sparseml.transformers import SparseAutoTokenizer
-        except ModuleNotFoundError:
-            raise Exception(
+        except ModuleNotFoundError as exception:
+            raise type(exception)(
                 "Package `sparseml` is not installed. "
                 "Please install it via `pip install sparseml[transformers]`"
             )
@@ -171,8 +171,8 @@ class DeepSparseLM(LM):
         try:
             import deepsparse
-        except ModuleNotFoundError:
-            raise Exception(
+        except ModuleNotFoundError as exception:
+            raise type(exception)(
                 "Package `deepsparse` is not installed. "
                 "Please install it via `pip install deepsparse[transformers]`"
             )
@@ -321,6 +321,9 @@ class DeepSparseLM(LM):
                 res.append(answer)

                 if cache_key is not None:
+                    # special case: loglikelihood_rolling produces a number of loglikelihood requests
+                    # all with cache key None. instead do add_partial on the per-example level
+                    # in the loglikelihood_rolling() function for those.
                     self.cache_hook.add_partial("loglikelihood", cache_key, answer)

         return re_ord.get_original(res)
...
 import copy
-import json
 import logging
-import subprocess
 from collections import defaultdict
 from typing import List, Optional, Union
@@ -33,54 +31,6 @@ except ImportError:
 logger = logging.getLogger(__name__)


-def get_nc_count() -> Union[int, None]:
-    """Returns the number of neuron cores on the current instance."""
-    try:
-        cmd = "neuron-ls --json-output"
-        result = subprocess.run(cmd, shell=True, capture_output=True)
-        print(f"inferring nc_count from `neuron-ls` {result.stdout}")
-        json_output = json.loads(result.stdout)
-        count = sum([x["nc_count"] for x in json_output])
-        print(f"nc_count={count}")
-        return count
-    except Exception:
-        return None
-
-
-def wrap_constant_batch_size(func):
-    def _decorator(self, input_ids):
-        """input_ids a 2D array with batch_size on dim=0
-
-        makes sure the func runs with self.batch_size
-        """
-        # access a from TestSample
-        batch_size = input_ids.shape[0]
-
-        if batch_size < self.batch_size:
-            # handle the event of input_ids.shape[0] != batch_size
-            # Neuron cores expect constant batch_size
-            input_ids = torch.concat(
-                (
-                    input_ids,
-                    # add missing_batch_size dummy
-                    torch.zeros(
-                        [self.batch_size - batch_size, *input_ids.size()[1:]],
-                        dtype=input_ids.dtype,
-                        device=input_ids.device,
-                    ),
-                ),
-                dim=0,
-            )
-        elif batch_size > self.batch_size:
-            raise ValueError(
-                f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
-            )
-        # return the forward pass that requires constant batch size
-        return func(self, input_ids)[:batch_size]
-
-    return _decorator
-
-
 class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
     """NeuronModelForCausalLM with `stopping_criteria` in `generate`"""
@@ -146,7 +96,7 @@ class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
             raise ValueError(
                 f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
             )
-        elif batch_size < self.batch_size:
+        elif batch_size < self.batch_size and not self.continuous_batching:
             logger.warning(
                 "Inputs will be padded to match the model static batch size. This will increase latency."
             )
@@ -158,8 +108,6 @@ class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
             if attention_mask is not None:
                 padding = torch.zeros(padding_shape, dtype=torch.int64)
                 padded_attention_mask = torch.cat([attention_mask, padding])
-        # Drop the current generation context and clear the Key/Value cache
-        self.reset_generation()

         output_ids = self.generate_tokens(
             padded_input_ids,
@@ -179,8 +127,6 @@ class NEURON_HF(TemplateLM):
     Tested with neuron 2.17.0
     """

-    _DEFAULT_MAX_LENGTH = 2048
-
     def __init__(
         self,
         pretrained: Optional[str] = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
@@ -198,12 +144,12 @@ class NEURON_HF(TemplateLM):
         add_bos_token: Optional[bool] = False,
     ) -> None:
         if not NEURON_AVAILABLE:
-            raise Exception(
+            raise ImportError(
                 "Tried to load neuron model, but neuron is not installed ",
                 "please install neuron via pip install transformers-neuron ",
                 "also make sure you are running on an AWS inf2 instance",
             )
-        if version.parse(optimum_neuron_version) != version.parse("0.0.17"):
+        if version.parse(optimum_neuron_version) != version.parse("0.0.24"):
             logger.warning(
                 '`optimum-neuron` model requires `pip install "optimum[neuronx]>=0.0.17" '
                 "preferably using the Hugging Face Neuron Deep Learning AMI (Ubuntu 22.04) "
@@ -217,34 +163,16 @@ class NEURON_HF(TemplateLM):
         self.batch_size_per_gpu = int(batch_size)
         batch_size = int(batch_size)
-        if tp_degree is None:
-            # execute `neuron-ls --json-output | jq '.[0].nc_count'``
-            # to get the number of neuron cores on your instance
-            tp_degree = get_nc_count()
-
-        assert isinstance(tp_degree, int), (
-            f"model_args must include tp_degree. tp_degree must be set to an integer,"
-            f" but is tp_degree=`{tp_degree}` with type=`{type(tp_degree)}`."
-            "Set it to number of neuron cores on your instance."
-            " For inf2.xlarge and inf2.8xlarge, set it to `2`."
-            " For inf2.24xlarge, set it to `12`."
-            " For inf2.48xlarge, set it to `24`."
-        )
-
-        # TODO: update this to be less of a hack once subfolder is fixed in HF
-        revision = revision + ("/" + subfolder if subfolder is not None else "")
-
         self._config = transformers.AutoConfig.from_pretrained(
             pretrained,
             revision=revision,
             trust_remote_code=trust_remote_code,
         )
-        torch_dtype = lm_eval.models.utils.get_dtype(dtype)
-
-        assert torch_dtype in [
-            torch.float16,
-            torch.bfloat16,
-        ], "Only float16 and bfloat16 are supported"
+
+        revision = str(revision)  # cast to string if not already one
+        # TODO: update this to be less of a hack once subfolder is fixed in HF
+        revision = revision + ("/" + subfolder if subfolder is not None else "")

         self.tokenizer = transformers.AutoTokenizer.from_pretrained(
             pretrained if tokenizer is None else tokenizer,
@@ -253,36 +181,58 @@ class NEURON_HF(TemplateLM):
             use_fast=use_fast_tokenizer,
         )

-        # Neuron specific code
-        if torch_dtype == torch.float16:
-            self.amp_dtype = "f16"
-        elif torch_dtype == torch.bfloat16:
-            self.amp_dtype = "bf16"
-        elif torch_dtype == torch.float32:
-            self.amp_dtype = "f32"
-        else:
-            raise NotImplementedError("Only float16 and bfloat16 are implemented.")
-
-        compiler_args = {"num_cores": tp_degree, "auto_cast_type": self.amp_dtype}
-        input_shapes = {
-            "batch_size": batch_size,
-            "sequence_length": self._DEFAULT_MAX_LENGTH,
-        }
-
-        print(
-            f"{'='*20} \n loading model to neuron with"
-            f" {compiler_args}, {input_shapes}..."
-        )
-        self.model = CustomNeuronModelForCausalLM.from_pretrained(
-            pretrained,
-            revision=revision,
-            trust_remote_code=trust_remote_code,
-            low_cpu_mem_usage=low_cpu_mem_usage,
-            export=True,
-            **compiler_args,
-            **input_shapes,
-        )
-        print(f"SUCCESS: neuron model compiled. \n {'='*20}")
+        neuron_config = getattr(self._config, "neuron", None)
+        if neuron_config is None:
+            # Check export parameters
+            if tp_degree is not None:
+                assert isinstance(tp_degree, int), (
+                    f"tp_degree must be set to an integer,"
+                    f" but is tp_degree=`{tp_degree}` with type=`{type(tp_degree)}`."
+                    "Set it to a number lower than the number of neuron cores on your instance."
+                    " For inf2.xlarge and inf2.8xlarge, set it to `2`."
+                    " For inf2.24xlarge, set it <= `12`."
+                    " For inf2.48xlarge, set it <= `24`."
+                )
+            torch_dtype = lm_eval.models.utils.get_dtype(dtype)
+
+            if torch_dtype == torch.float16:
+                self.amp_dtype = "f16"
+            elif torch_dtype == torch.bfloat16:
+                self.amp_dtype = "bf16"
+            elif torch_dtype == torch.float32:
+                self.amp_dtype = "f32"
+            else:
+                raise NotImplementedError(
+                    "Only float16/bfloat16/float32 are supported."
+                )
+
+            print(f"{'='*20} \n exporting model to neuron")
+            self.model = CustomNeuronModelForCausalLM.from_pretrained(
+                pretrained,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+                low_cpu_mem_usage=low_cpu_mem_usage,
+                export=True,
+                batch_size=batch_size,
+                num_cores=tp_degree,
+                auto_cast_type=self.amp_dtype,
+                sequence_length=max_length,
+            )
+            neuron_config = self.model.config.neuron
+            print(
+                f"SUCCESS: neuron model exported with config {neuron_config}. \n {'='*20}"
+            )
+        else:
+            print(
+                f"{'='*20} \n loading neuron model with config" f" {neuron_config}..."
+            )
+            self.model = CustomNeuronModelForCausalLM.from_pretrained(
+                pretrained,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+                low_cpu_mem_usage=low_cpu_mem_usage,
+            )
+            print(f"SUCCESS: neuron model loaded. \n {'='*20}")

         self.truncation = truncation
@@ -290,8 +240,6 @@ class NEURON_HF(TemplateLM):
         self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

         self.add_bos_token = add_bos_token
-
-        self._max_length = max_length

         self.batch_schedule = 1
         self.batch_sizes = {}
@@ -312,17 +260,7 @@ class NEURON_HF(TemplateLM):
     @property
     def max_length(self):
-        if self._max_length:  # if max length manually set, return it
-            return self._max_length
-        seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
-        for attr in seqlen_config_attrs:
-            if hasattr(self.model.config, attr):
-                return getattr(self.model.config, attr)
-        if hasattr(self.tokenizer, "model_max_length"):
-            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
-                return self._DEFAULT_MAX_LENGTH
-            return self.tokenizer.model_max_length
-        return self._DEFAULT_MAX_LENGTH
+        return self.model.max_length

     @property
     def max_gen_toks(self) -> int:
@@ -390,34 +328,6 @@ class NEURON_HF(TemplateLM):
     def tok_decode(self, tokens):
         return self.tokenizer.decode(tokens)

-    @wrap_constant_batch_size
-    def _model_call(self, input_ids: torch.Tensor):
-        """
-        get logits for the entire sequence
-
-        :param input_ids: torch.Tensor
-            A torch tensor of shape [batch, sequence_cont]
-            the size of sequence may vary from call to call
-        :return
-            A torch tensor of shape [batch, sequence, vocab] with the
-            logits returned from the model's decoder-lm head
-        """
-        _, sequence_length = input_ids.shape
-
-        with torch.inference_mode():
-            cache_ids = torch.arange(0, sequence_length, dtype=torch.int32).split(1)
-            input_ids_split = input_ids.split(1, dim=1)
-
-            return torch.concat(
-                [
-                    self.model.forward(
-                        input_ids=input_id, cache_ids=cache_id, return_dict=False
-                    )[0]
-                    for input_id, cache_id in zip(input_ids_split, cache_ids)
-                ],
-                dim=1,
-            )
-
     def _model_generate(self, context, max_length, stop, **generation_kwargs):
         # we require users to pass do_sample=True explicitly
         # for non-greedy gen. This should be reevaluated when considering beam search.
@@ -501,7 +411,8 @@ class NEURON_HF(TemplateLM):
             string_nll = sum(string_nll)
             loglikelihoods.append(string_nll)

+            # cache this loglikelihood_rolling request
+            self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
         return loglikelihoods

     def _loglikelihood_tokens(
@@ -578,15 +489,41 @@ class NEURON_HF(TemplateLM):
                 cont_toks_list.append(continuation_enc)
                 inplens.append(inplen)

-            # create encoder attn mask and batched conts, if seq2seq
-            call_kwargs = {}
+            # Add dummy inputs up to the model static batch size
+            if len(inps) < self.batch_size:
+                inps = inps + [
+                    torch.zeros_like(inps[0]),
+                ] * (self.batch_size - len(inps))
+            masks = [torch.ones_like(inp) for inp in inps]
             batched_inps = lm_eval.models.utils.pad_and_concat(
                 padding_len_inp, inps, padding_side="right"
             )  # [batch, padding_len_inp]
-
-            multi_logits = F.log_softmax(
-                self._model_call(batched_inps, **call_kwargs), dim=-1
-            )  # [batch, padding_length (inp or cont), vocab]
+            batched_masks = lm_eval.models.utils.pad_and_concat(
+                padding_len_inp, masks, padding_side="right"
+            )
+            if self.model.model.neuron_config.output_all_logits:
+                inputs = self.model.prepare_inputs_for_prefill(
+                    batched_inps, batched_masks
+                )
+                multi_logits = F.log_softmax(
+                    self.model.forward(**inputs).logits, dim=-1
+                )  # [batch, padding_length (inp or cont), vocab]
+            else:
+                # The model will only return the logits for the last input token, so we need
+                # to iterate over inputs to accumulate logits.
+                # To speed things up we use the KV cache as we would do when generating.
+                inputs = self.model.prepare_inputs_for_prefill(
+                    batched_inps[:, :1], batched_masks[:, :1]
+                )
+                outputs = [self.model.forward(**inputs).logits]
+                for i in range(1, padding_len_inp):
+                    inputs = self.model.prepare_inputs_for_decode(
+                        batched_inps[:, : i + 1], batched_masks[:, : i + 1]
+                    )
+                    outputs.append(self.model.forward(**inputs).logits)
+                multi_logits = F.log_softmax(torch.concat(outputs, dim=1), dim=-1)

             for (cache_key, _, _), logits, inplen, cont_toks in zip(
                 chunk, multi_logits, inplens, cont_toks_list
@@ -619,7 +556,11 @@ class NEURON_HF(TemplateLM):
                 res.append(answer)

-                self.cache_hook.add_partial("loglikelihood", cache_key, answer)
+                if cache_key is not None:
+                    # special case: loglikelihood_rolling produces a number of loglikelihood requests
+                    # all with cache key None. instead do add_partial on the per-example level
+                    # in the loglikelihood_rolling() function for those.
+                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)

         return re_ord.get_original(res)
...
import copy
import os import os
from collections import defaultdict from functools import cached_property
from importlib.util import find_spec from operator import itemgetter
from typing import List, Literal, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple, Union
from tqdm import tqdm
import lm_eval.models.utils
from lm_eval import utils
from lm_eval.api.model import LM, TemplateLM
from lm_eval.api.registry import register_model from lm_eval.api.registry import register_model
from lm_eval.models.utils import retry_on_specific_exceptions from lm_eval.models.api_models import TemplateAPI
from lm_eval.models.utils import handle_stop_sequences
from lm_eval.utils import eval_logger from lm_eval.utils import eval_logger
def get_result(response) -> Tuple[float, bool]: @register_model("local-completions")
"""Process results from OpenAI API response. class LocalCompletionsAPI(TemplateAPI):
def __init__(
:param response: dict self,
OpenAI API Response base_url=None,
:return: tokenizer_backend="huggingface",
continuation_logprobs: np.array **kwargs,
Log probabilities of continuation tokens ):
is_greedy: bool super().__init__(
whether argmax matches given continuation exactly base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
"""
is_greedy = True
logprobs = response.logprobs.token_logprobs
continuation_logprobs = sum(logprobs)
for i in range(len(response.logprobs.token_logprobs)):
token = response.logprobs.token_logprobs[i]
top_tokens = response.logprobs.top_logprobs[i]
top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
if top_token != token:
is_greedy = False
break
return continuation_logprobs, is_greedy
def oa_completion(client, chat: bool = False, **kwargs):
"""Query OpenAI API for completion.
Retry with back-off until they respond
"""
if not find_spec("openai") or not find_spec("tiktoken"):
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. "
"Please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`"
) )
else:
import openai
def _exception_callback(e: Exception, sleep_time: float) -> None:
import traceback
traceback.print_exc()
@retry_on_specific_exceptions(
on_exceptions=[openai.OpenAIError],
max_retries=None, # retry forever, consider changing
on_exception_callback=_exception_callback,
)
def completion():
if chat:
return client.chat.completions.create(**kwargs)
else:
return client.completions.create(**kwargs)
return completion() def _create_payload(
@register_model("openai-completions", "local-completions")
class OpenaiCompletionsLM(TemplateLM):
_DEFAULT_MAX_LENGTH = 2048
def __init__(
self, self,
model: str, messages: Union[List[List[int]], List[dict], List[str], str],
base_url: str = None, generate=False,
tokenizer: Optional[str] = None, gen_kwargs: Optional[dict] = None,
tokenizer_backend: Literal["tiktoken", "huggingface"] = "tiktoken",
truncate: bool = False,
max_gen_toks: int = 256,
batch_size: int = 1,
seed: int = 1234, seed: int = 1234,
max_length: Optional[int] = None, eos=None,
) -> None: **kwargs,
""" ) -> dict:
if generate:
:param engine: str gen_kwargs.pop("do_sample", False)
OpenAI API engine (e.g. gpt-3.5-turbo-instruct) if "max_tokens" in gen_kwargs:
:param truncate: bool max_tokens = gen_kwargs.pop("max_tokens")
Truncate input if too long (if False and input is too long, throw error) else:
""" max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
super().__init__() temperature = gen_kwargs.pop("temperature", 0)
self.seed = seed stop = handle_stop_sequences(gen_kwargs.pop("until", None), eos)
try: return {
import openai # noqa: E401 "prompt": messages,
import tiktoken "model": self.model,
except ModuleNotFoundError: "max_tokens": max_tokens,
raise Exception( "temperature": temperature,
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \ "stop": stop,
please install these via `pip install lm-eval[openai]` or `pip install -e .\"[openai]\"`", "seed": seed,
) **gen_kwargs,
self.model = model }
self.base_url = base_url
self.tokenizer_backend = tokenizer_backend
self.truncate = truncate
self._batch_size = int(batch_size)
self._max_gen_toks = max_gen_toks
self._max_length = max_length
# if we have a local model, use HF tokenizer over tiktoken
if self.tokenizer_backend == "huggingface":
import transformers # noqa: E401
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
tokenizer if tokenizer else self.model
)
self.vocab_size = self.tokenizer.vocab
self.end_of_text_token_id = self.tokenizer.eos_token
elif self.tokenizer_backend == "tiktoken":
if self.base_url:
eval_logger.warning(
f"Passed `base_url={self.base_url}` but using Tiktoken tokenizer backend. "
"Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken."
)
self.tokenizer = tiktoken.encoding_for_model(self.model)
self.vocab_size = self.tokenizer.n_vocab
self.end_of_text_token_id = self.tokenizer.eot_token
else:
raise ValueError(
f"Expected tokenizer_backend to be one of ['tiktoken', 'huggingface'] but got {self.tokenizer_backend}"
)
# Read from environment variable OPENAI_API_KEY
# Set to EMPTY for local
openai.api_key = os.environ["OPENAI_API_KEY"]
if self.base_url:
self.client = openai.OpenAI(base_url=self.base_url)
else:
self.client = openai.OpenAI()
@property
def eot_token_id(self):
return self.end_of_text_token_id
@property
def max_length(self) -> int:
if self._max_length:
return self._max_length
else: else:
return self._DEFAULT_MAX_LENGTH return {
"model": self.model,
@property "prompt": messages,
def max_gen_toks(self) -> int: "temperature": 0,
return self._max_gen_toks "max_tokens": 1,
"logprobs": 1,
@property "seed": seed,
def batch_size(self) -> int: "echo": True,
return self._batch_size }
@property @staticmethod
def device(self): def parse_logprobs(
# Isn't used because we override _loglikelihood_tokens outputs: Union[Dict, List[Dict]],
raise NotImplementedError() tokens: List[List[int]] = None,
ctxlens: List[int] = None,
def tok_encode(self, string: str, **kwargs) -> List[int]: **kwargs,
return self.tokenizer.encode(string)
def tok_decode(self, tokens: List[int]) -> str:
return self.tokenizer.decode(tokens)
def _loglikelihood_tokens(
self, requests, disable_tqdm: bool = False
) -> List[Tuple[float, bool]]: ) -> List[Tuple[float, bool]]:
res = [] res = []
if not isinstance(outputs, list):
def _collate(x): outputs = [outputs]
# this doesn't efficiently handle last-token differences yet, but those are kinda annoying because for out in outputs:
# it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations for choice, ctxlen in zip(
# we care about, and so we need some kind of backup for when it isn't sorted(out["choices"], key=itemgetter("index")), ctxlens
toks = x[1] + x[2]
return -len(toks), tuple(toks)
re_ord = utils.Reorderer(requests, _collate)
for chunk in tqdm(
list(lm_eval.models.utils.chunks(re_ord.get_reordered(), self.batch_size)),
disable=disable_tqdm,
):
inps = []
ctxlens = []
for cache_key, context_enc, continuation_enc in chunk:
# max_length+1 because the API takes up to 2049 tokens, including the first context token
inp = (context_enc + continuation_enc)[-(self.max_length + 1) :]
# TODO: the logic is much simpler if we just look at the length of continuation tokens
ctxlen = len(context_enc) - max(
0, len(context_enc) + len(continuation_enc) - (self.max_length + 1)
)
inps.append(inp)
ctxlens.append(ctxlen)
response = oa_completion(
client=self.client,
model=self.model,
prompt=inps,
max_tokens=0,
temperature=0.0,
logprobs=10,
seed=self.seed,
)
for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(
response.choices, ctxlens, chunk
): ):
answer = get_result(resp) assert ctxlen > 0, "Context length must be greater than 0"
logprobs = sum(choice["logprobs"]["token_logprobs"][ctxlen:-1])
res.append(answer) tokens_logprobs = choice["logprobs"]["token_logprobs"][ctxlen:-1]
top_logprobs = choice["logprobs"]["top_logprobs"][ctxlen:-1]
# partial caching is_greedy = True
if cache_key is not None: for tok, top in zip(tokens_logprobs, top_logprobs):
self.cache_hook.add_partial("loglikelihood", cache_key, answer) if tok != max(top.values()):
return re_ord.get_original(res) is_greedy = False
break
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: res.append((logprobs, is_greedy))
if not requests: return res
return []
@staticmethod
def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
res = [] res = []
requests = [req.args for req in requests] if not isinstance(outputs, list):
outputs = [outputs]
def _collate(x): for out in outputs:
toks = self.tok_encode(x[0]) tmp = [None] * len(out["choices"])
return len(toks), x[0] for choices in out["choices"]:
tmp[choices["index"]] = choices["text"]
res = res + tmp
return res
re_ord = utils.Reorderer(requests, _collate) @property
def api_key(self):
def sameuntil_chunks(xs, size): return os.environ.get("OPENAI_API_KEY", "")
ret = []
lastuntil = xs[0][1]
for x in xs:
if len(ret) >= size or x[1] != lastuntil:
yield ret, lastuntil
ret = []
lastuntil = x[1]
ret.append(x)
if ret:
yield ret, lastuntil
# todo: more intelligent batching for heterogeneous `until`
for chunk, request_args in tqdm(
list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)),
disable=disable_tqdm,
):
inps = []
self._max_gen_toks = request_args.get("max_gen_toks", self.max_gen_toks)
for context, _ in chunk:
context_enc = self.tok_encode(context)
inp = context_enc[-(self.max_length - self.max_gen_toks) :]
inps.append(inp)
until = request_args.get("until", ["<|endoftext|>"])
request_args["temperature"] = request_args.get("temperature", 0)
response = oa_completion( @register_model("local-chat-completions")
client=self.client, class LocalChatCompletion(LocalCompletionsAPI):
model=self.model, def __init__(
prompt=inps, self,
max_tokens=self.max_gen_toks, base_url=None,
stop=until, tokenizer_backend=None,
seed=self.seed, tokenized_requests=False,
**{ **kwargs,
k: v ):
for k, v in request_args.items() eval_logger.warning(
if k not in {"do_sample", "max_gen_toks", "until"} "chat-completions endpoint requires the `--apply_chat_template` flag."
}, )
super().__init__(
base_url=base_url,
tokenizer_backend=tokenizer_backend,
tokenized_requests=tokenized_requests,
**kwargs,
)
if self._batch_size > 1:
eval_logger.warning(
"Chat completions does not support batching. Defaulting to batch size 1."
) )
for resp, (context, args_) in zip(response.choices, chunk): self._batch_size = 1
s = getattr(resp, "text")
until_ = until
for term in until_:
if len(term) > 0:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial(
"generate_until", (context, {"until": until_}), s
)
res.append(s)
return re_ord.get_original(res)
def _model_call(self, inps):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id): def _create_payload(
# Isn't used because we override generate_until self,
raise NotImplementedError() messages: List[Dict],
generate=False,
gen_kwargs: dict = None,
seed=1234,
eos=None,
**kwargs,
) -> dict:
assert (
type(messages) is not str
), "chat-completions require the --apply_chat_template flag."
gen_kwargs.pop("do_sample", False)
if "max_tokens" in gen_kwargs:
max_tokens = gen_kwargs.pop("max_tokens")
else:
max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
temperature = gen_kwargs.pop("temperature", 0)
stop = handle_stop_sequences(gen_kwargs.pop("until", None), eos)
if not isinstance(stop, (list, tuple)):
stop = [stop]
return {
"messages": messages,
"model": self.model,
"max_tokens": max_tokens,
"temperature": temperature,
"stop": stop[:4],
"seed": seed,
**gen_kwargs,
}
@staticmethod
def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
res = []
if not isinstance(outputs, list):
outputs = [outputs]
for out in outputs:
tmp = [None] * len(out["choices"])
for choices in out["choices"]:
tmp[choices["index"]] = choices["message"]["content"]
res = res + tmp
return res
def tok_encode(
self,
string: Union[str, Any],
left_truncate_len=None,
add_special_tokens=None,
**kwargs,
) -> Union[List[str], List[int], Any]:
return string
def loglikelihood_rolling( def loglikelihood(self, requests, **kwargs):
self, requests, disable_tqdm: bool = False raise NotImplementedError(
) -> List[float]: "Loglikelihood is not supported for chat completions. Consider using the completions API instead."
loglikelihoods = [] )
for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length,
context_len=1,
),
)
)
# TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case @register_model(
rolling_token_windows = [(None,) + x for x in rolling_token_windows] "openai-completions",
)
class OpenAICompletionsAPI(LocalCompletionsAPI):
def __init__(
self,
base_url="https://api.openai.com/v1/completions",
tokenizer_backend="tiktoken",
**kwargs,
):
super().__init__(
base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
)
string_nll = self._loglikelihood_tokens( @cached_property
rolling_token_windows, def api_key(self):
disable_tqdm=True, """Override this property to return the API key for the API request."""
key = os.environ.get("OPENAI_API_KEY", None)
if key is None:
raise ValueError(
"API key not found. Please set the `OPENAI_API_KEY` environment variable."
) )
return key
# discard is_greedy def loglikelihood(self, requests, **kwargs):
string_nll = [x[0] for x in string_nll] assert (
self.model
in [
"babbage-002",
"davinci-002",
]
), f"Prompt loglikelihoods are only supported by OpenAI's API for {['babbage-002', 'davinci-002']}."
return super().loglikelihood(requests, **kwargs)
string_nll = sum(string_nll) def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
loglikelihoods.append(string_nll) return ""
return loglikelihoods
@register_model("openai-chat-completions", "local-chat-completions") @register_model("openai-chat-completions")
class OpenaiChatCompletionsLM(LM): class OpenAIChatCompletion(LocalChatCompletion):
def __init__( def __init__(
self, self,
model: str = "gpt-3.5-turbo", # GPT model or Local model using HuggingFace model paths base_url="https://api.openai.com/v1/chat/completions",
base_url: str = None, tokenizer_backend=None,
truncate: bool = False, tokenized_requests=False,
**kwargs, **kwargs,
) -> None: ):
""" if "o1" in kwargs.get("model", ""):
eval_logger.warning(
:param model: str "o1 models do not support `stop` and only support temperature=1"
Implements an OpenAI-style chat completion API for
accessing both OpenAI OR locally-hosted models using
HuggingFace Tokenizer
OpenAI API model (e.g. gpt-3.5-turbo)
using the **gen_kwargs passed on init
:param truncate: bool
Truncate input if too long (if False and input is too long, throw error)
"""
super().__init__()
try:
import openai # noqa: E401
except ModuleNotFoundError:
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
) )
self.model = model super().__init__(
self.base_url = base_url base_url=base_url,
self.truncate = truncate tokenizer_backend=tokenizer_backend,
tokenized_requests=tokenized_requests,
# Read from environment variable OPENAI_API_KEY **kwargs,
# Set to EMPTY for local )
if self.base_url:
self.client = openai.OpenAI(base_url=self.base_url)
else:
self.client = openai.OpenAI() # openai.AsyncOpenAI()
@property
def max_length(self) -> int:
# Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
return 2048
@property
def max_gen_toks(self) -> int:
return 256
@property
def batch_size(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
@property
def device(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
res = defaultdict(list)
re_ords = {}
        # we group requests by their generation_kwargs,
        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
        # in the same batch.
        grouper = lm_eval.models.utils.Grouper(requests, lambda x: str(x.args[1]))
        for key, reqs in grouper.get_grouped().items():
            # within each set of reqs for given kwargs, we reorder by token length, descending.
            re_ords[key] = utils.Reorderer(
                [req.args for req in reqs], lambda x: (-len(x[0]), x[0])
            )

        pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
        for key, re_ord in re_ords.items():
            # n needs to be 1 because messages in
            # chat completion are not batch but
# is regarded as a single conversation.
chunks = lm_eval.models.utils.chunks(re_ord.get_reordered(), n=1)
for chunk in chunks:
contexts, all_gen_kwargs = zip(*chunk)
inps = [{"role": "user", "content": context} for context in contexts]
gen_kwargs = all_gen_kwargs[0]
until = None
if isinstance(kwargs := copy.deepcopy(gen_kwargs), dict):
if "do_sample" in kwargs.keys():
kwargs.pop("do_sample")
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected repr(kwargs['until']) to be of type Union[str, list] but got {until}"
)
kwargs["stop"] = until
kwargs["max_tokens"] = kwargs.pop("max_gen_toks", self.max_gen_toks)
else:
raise ValueError(
f"Expected repr(kwargs) to be of type repr(dict) but got {kwargs}"
)
response = oa_completion(
client=self.client,
chat=True,
messages=inps,
model=self.model,
**kwargs,
)
for resp, (context, args_) in zip(response.choices, chunk):
s = resp.message.content
if until is not None:
for term in until:
if len(term) > 0:
s = s.split(term)[0]
res[key].append(s)
self.cache_hook.add_partial(
"generate_until", (context, {"until": until}), s
)
pbar.update(1)
# reorder this group of results back to original unsorted form
res[key] = re_ord.get_original(res[key])
pbar.close()
return grouper.get_original(res)
def loglikelihood(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.")
    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
        raise NotImplementedError("No support for logits.")


@register_model("openai-chat-completions")
class OpenAIChatCompletion(LocalChatCompletion):
    def __init__(
        self,
        base_url="https://api.openai.com/v1/chat/completions",
        tokenizer_backend=None,
        tokenized_requests=False,
        **kwargs,
    ):
        if "o1" in kwargs.get("model", ""):
            eval_logger.warning(
                "o1 models do not support `stop` and only support temperature=1"
            )
        super().__init__(
            base_url=base_url,
            tokenizer_backend=tokenizer_backend,
            tokenized_requests=tokenized_requests,
            **kwargs,
        )

    @cached_property
    def api_key(self):
        """Override this property to return the API key for the API request."""
        key = os.environ.get("OPENAI_API_KEY", None)
        if key is None:
            raise ValueError(
                "API key not found. Please set the `OPENAI_API_KEY` environment variable."
            )
        return key

    def loglikelihood(self, requests, **kwargs):
        raise NotImplementedError(
            "Loglikelihood (and therefore `multiple_choice`-type tasks) is not supported for chat completions as OpenAI does not provide prompt logprobs. See https://github.com/EleutherAI/lm-evaluation-harness/issues/942#issuecomment-1777836312 or https://github.com/EleutherAI/lm-evaluation-harness/issues/1196 for more background on this limitation."
        )

    def _create_payload(
        self,
messages: List[Dict],
generate=False,
gen_kwargs: dict = None,
seed=1234,
eos="<|endoftext|>",
**kwargs,
) -> dict:
assert (
type(messages) is not str
), "chat-completions require the --apply_chat_template flag."
gen_kwargs.pop("do_sample", False)
if "max_tokens" in gen_kwargs:
max_tokens = gen_kwargs.pop("max_tokens")
else:
max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
temperature = gen_kwargs.pop("temperature", 0)
stop = handle_stop_sequences(gen_kwargs.pop("until", ["<|endoftext|>"]), eos)
if not isinstance(stop, (list, tuple)):
stop = [stop]
output = {
"messages": messages,
"model": self.model,
"max_completion_tokens": max_tokens,
"temperature": temperature,
"stop": stop[:4],
"seed": seed,
**gen_kwargs,
}
if "o1" in self.model:
output.pop("stop")
output["temperature"] = 1
return output
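    # Illustrative payload produced by _create_payload (a sketch, not part of this
    # commit); the model name and gen_kwargs below are assumptions for the example,
    # e.g. model="gpt-4o-mini" and gen_kwargs={"until": ["\n\n"], "max_gen_toks": 64}:
    #
    #   {
    #       "messages": [{"role": "user", "content": "..."}],
    #       "model": "gpt-4o-mini",
    #       "max_completion_tokens": 64,
    #       "temperature": 0,
    #       "stop": ["\n\n", "<|endoftext|>"],
    #       "seed": 1234,
    #   }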
from importlib.util import find_spec
from lm_eval import utils
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
from lm_eval.models.utils import get_dtype
eval_logger = utils.eval_logger
@register_model("ipex")
class IPEXLM(HFLM):
"""
    Uses the HuggingFace transformers + optimum-intel IPEX backend; can run on Intel CPUs and Intel GPUs.
"""
def __init__(
self,
**kwargs,
) -> None:
if "backend" in kwargs:
# currently only supports causal models
assert (
kwargs["backend"] == "causal"
), "Currently, only IPEXModelForCausalLM is supported."
super().__init__(
backend=kwargs.pop("backend", "causal"),
**kwargs,
)
def _create_model(
self,
pretrained: str,
revision="main",
dtype="auto",
trust_remote_code=False,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
# (accelerate naive PP (device_map) options)
parallelize=False,
gpus=None,
max_memory_per_gpu=None,
max_cpu_memory=None,
offload_folder="./offload",
# PEFT, delta weights and quantization options
peft=None,
delta=None,
autogptq=False,
gptqmodel=False,
**kwargs,
) -> None:
if not find_spec("optimum"):
raise ModuleNotFoundError(
"package `optimum` is not installed. Please install it via `pip install optimum[ipex]`"
)
else:
from optimum.intel import IPEXModelForCausalLM
model_kwargs = kwargs if kwargs else {}
model_kwargs.update(
self._get_accelerate_args(
parallelize=parallelize,
device_map=kwargs.get("device_map", None),
max_memory_per_gpu=max_memory_per_gpu,
max_cpu_memory=max_cpu_memory,
offload_folder=offload_folder,
gpus=gpus,
)
)
self._model = IPEXModelForCausalLM.from_pretrained(
pretrained,
revision=revision,
torch_dtype=get_dtype(dtype),
trust_remote_code=trust_remote_code,
**model_kwargs,
)
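# Illustrative invocation (a sketch, not part of this commit; the pretrained model
# and task names are only examples) once `optimum[ipex]` is installed:
#
#   lm_eval --model ipex \
#       --model_args pretrained=meta-llama/Llama-3.1-8B,dtype=bfloat16 \
#       --tasks lambada_openai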
@@ -50,7 +50,7 @@ class OptimumLM(HFLM):
        **kwargs,
    ) -> None:
        if not find_spec("optimum"):
            raise ModuleNotFoundError(
                "package `optimum` is not installed. Please install it via `pip install optimum[openvino]`"
            )
        else:
@@ -71,6 +71,11 @@ class OptimumLM(HFLM):
        else:
            model_kwargs["ov_config"] = {}
        model_kwargs["ov_config"].setdefault("CACHE_DIR", "")
        if "pipeline_parallel" in model_kwargs:
            if model_kwargs["pipeline_parallel"]:
                model_kwargs["ov_config"]["MODEL_DISTRIBUTION_POLICY"] = (
                    "PIPELINE_PARALLEL"
                )
        model_file = Path(pretrained) / "openvino_model.xml"
        if model_file.exists():
            export = False
...
@@ -5,6 +5,7 @@ import itertools
import time
from functools import wraps
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
@@ -24,6 +25,11 @@ import transformers
from lm_eval.utils import eval_logger

if TYPE_CHECKING:
    from transformers import PreTrainedTokenizerBase
    from transformers.configuration_utils import PretrainedConfig


def chunks(iter, n: int = 0, fn=None):
    """
    Divides an iterable into chunks of specified size or based on a given function.
@@ -613,3 +619,111 @@ class Collator:
                if arr:
                    yield arr
def configure_pad_token(
tokenizer: "PreTrainedTokenizerBase",
model_config: Optional["PretrainedConfig"] = None,
) -> "PreTrainedTokenizerBase":
"""
This function checks if the (Hugging Face) tokenizer has a padding token and sets it if not present.
Some tokenizers require special handling.
Args:
tokenizer: The tokenizer for which the padding token is to be handled.
model_config: The configuration of the model. Default is None.
Returns:
The tokenizer after the padding token has been handled.
Raises:
AssertionError: If the tokenizer is of type RWKVWorldTokenizer or Rwkv5Tokenizer and the padding token id is not 0.
"""
if tokenizer.pad_token:
pass
elif tokenizer.unk_token:
tokenizer.pad_token_id = tokenizer.unk_token_id
elif tokenizer.eos_token:
tokenizer.pad_token_id = tokenizer.eos_token_id
else:
# handle special cases
if model_config and getattr(model_config, "model_type", None) == "qwen":
# Qwen's trust_remote_code tokenizer does not allow for adding special tokens
tokenizer.pad_token = "<|endoftext|>"
elif (
tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
or tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
):
# The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0)
# The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer
# ---
# Note that the world tokenizer class name, might change in the future for the final huggingface merge
# https://github.com/huggingface/transformers/pull/26963
assert tokenizer.pad_token_id == 0
else:
tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
return tokenizer
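# A minimal usage sketch (not part of this commit), assuming the `transformers`
# classes below; GPT-2 is only an example of a tokenizer that ships without a
# pad token:
#
#   from transformers import AutoConfig, AutoTokenizer
#
#   tok = AutoTokenizer.from_pretrained("gpt2")
#   cfg = AutoConfig.from_pretrained("gpt2")
#   tok = configure_pad_token(tok, model_config=cfg)
#   assert tok.pad_token_id is not None  # falls back to GPT-2's unk/eos token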
def replace_placeholders(
string: str, default_placeholder: str, image_token: str, max_images: int
):
"""
    A utility function used for local multimodal models. It locates all `default_placeholder`
    occurrences in the given input `string` and replaces the first `max_images` instances with
    `image_token`; any remaining placeholders are left untouched (or dropped when the
    placeholder and the image token are identical).
    This is used to replace <image> placeholder tags by model-specific image tokens like <|image_pad|>
    and to allow for only the first `max_images` images to be passed to a model if desired.
:param string: The original string containing placeholders.
:param default_placeholder: The placeholder text to be replaced.
:param image_token: The token to replace the placeholder with.
:param max_images: The maximum number of replacements to make.
:return: The string with placeholders replaced.
"""
count = 0
result = []
parts = string.split(default_placeholder)
for part in parts[:-1]: # Iterate through all but the last part
result.append(part)
if count < max_images:
result.append(image_token)
count += 1
elif default_placeholder != image_token:
result.append(default_placeholder)
# Add the last part of the string
result.append(parts[-1])
return "".join(result)
def flatten_image_list(images: List[List]):
"""
Takes in a list of lists of images, and returns a single list of all images in order.
Used for some multimodal models like Llava-1.5 which expects this flattened-list format for its image processor.
:param images: A list of lists of PIL images.
:return: a list of PIL images, via concatenating all the sub-lists in order.
"""
return [image for image_list in images for image in image_list]
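# e.g. (illustrative, not part of this commit):
#   flatten_image_list([[img_a, img_b], [img_c]]) -> [img_a, img_b, img_c]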
def handle_stop_sequences(
until: Union[str, List[str], None], eos: Optional[str]
) -> List[str]:
"""Ensures that the `until` parameter is a list of stop sequences and includes the EOS token."""
if isinstance(until, str):
until = [until]
elif until is None:
until = []
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
)
if eos is not None and eos not in until:
until.append(eos)
return until
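# Illustrative examples (not part of this commit):
#   handle_stop_sequences("\n\n", eos="<|endoftext|>")  -> ["\n\n", "<|endoftext|>"]
#   handle_stop_sequences(None, eos="</s>")             -> ["</s>"]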
import copy
from importlib.metadata import version
from importlib.util import find_spec
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union

from more_itertools import distribute
from packaging.version import parse as parse_version
@@ -10,7 +10,12 @@ from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
    Collator,
    configure_pad_token,
    handle_stop_sequences,
    undistribute,
)
from lm_eval.utils import (
    eval_logger,
    get_rolling_token_windows,
@@ -26,6 +31,8 @@ try:
except ModuleNotFoundError:
    pass

if TYPE_CHECKING:
    pass

eval_logger = eval_logger
@@ -63,7 +70,7 @@ class VLLM(TemplateLM):
        super().__init__()

        if not find_spec("vllm"):
            raise ModuleNotFoundError(
                "attempted to use 'vllm' LM type, but package `vllm` is not installed. "
                "Please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
            )
@@ -95,7 +102,7 @@ class VLLM(TemplateLM):
        self.batch_size = (
            "auto"
            if isinstance(batch_size, str) and "auto" in batch_size
            else int(batch_size)
        )
        if self.data_parallel_size <= 1:
            self.model = LLM(**self.model_args)
@@ -116,13 +123,14 @@
            tokenizer if tokenizer else pretrained,
            tokenizer_mode=tokenizer_mode,
            trust_remote_code=trust_remote_code,
            revision=tokenizer_revision,
        )
        self.tokenizer = configure_pad_token(self.tokenizer)
        self.add_bos_token = add_bos_token
        if "gemma" in pretrained.lower():
            self.add_bos_token = True
            eval_logger.info(
                "Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
            )

        self.custom_prefix_token_id = prefix_token_id
@@ -176,23 +184,40 @@ class VLLM(TemplateLM):
    def max_gen_toks(self):
        return self._max_gen_toks

    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
        """
        Method to apply a chat template to a list of chat history between user and model.
        """
        return self.tokenizer.apply_chat_template(
            chat_history, tokenize=False, add_generation_prompt=True
        )

    @property
    def tokenizer_name(self) -> str:
        return self.tokenizer.name_or_path.replace("/", "__")

    def tok_encode(
        self,
        string: Union[str, List[str]],
        left_truncate_len: int = None,
        add_special_tokens: bool = False,
        truncation: bool = False,
    ) -> Union[List[int], List[List[int]]]:
        """ """
        if not add_special_tokens:
            add_special_tokens = False or self.add_bos_token
        encoding: Union[List[List[int]], List[int]] = self.tokenizer(
            string,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            return_attention_mask=False,
        ).input_ids

        # left-truncate the encoded context to be at most `left_truncate_len` tokens long
        if left_truncate_len:
            if not isinstance(string, str):
                encoding = [enc[-left_truncate_len:] for enc in encoding]
            else:
                encoding = encoding[-left_truncate_len:]

        return encoding
@@ -209,7 +234,7 @@ class VLLM(TemplateLM):
            sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
        else:
            sampling_params = SamplingParams(
                temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
            )
        if self.data_parallel_size > 1:
            # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
@@ -219,17 +244,25 @@
            # but then tensor_parallel breaks
            @ray.remote
            def run_inference_one_model(
                model_args: dict,
                sampling_params,
                requests: List[List[int]],
                lora_request: LoRARequest,
            ):
                llm = LLM(**model_args)
                return llm.generate(
                    prompt_token_ids=requests,
                    sampling_params=sampling_params,
                    lora_request=lora_request,
                )

            # dispatch requests to all self.data_parallel_size workers, in interleaved fashion
            # interleaved important to balance context lengths across workers
            requests = [list(x) for x in distribute(self.data_parallel_size, requests)]
            inputs = (
                (self.model_args, sampling_params, req, self.lora_request)
                for req in requests
            )
            object_refs = [run_inference_one_model.remote(*x) for x in inputs]

            results = ray.get(object_refs)
            # Invoke ray.shutdown() to prevent hang-ups if subsequent calls required.
@@ -237,50 +270,81 @@
            # flatten results
            return undistribute(results)

        outputs = self.model.generate(
            prompt_token_ids=requests,
            sampling_params=sampling_params,
            use_tqdm=True if self.batch_size == "auto" else False,
            lora_request=self.lora_request,
        )
        return outputs

    def loglikelihood_rolling(
        self, requests: List[Instance], disable_tqdm: bool = False
    ) -> List[float]:
        adaptive_batch_size = None
        if self.batch_size == "auto":
            adaptive_batch_size = len(requests)

        # First, collect all windows from all requests
        all_windows = []  # List of (request_idx, window) tuples
        request_window_counts = []  # Track number of windows per request

        for req_idx, (string,) in enumerate(
            tqdm(
                [req.args for req in requests],
                disable=(disable_tqdm or (self.rank != 0)),
            )
        ):
            rolling_token_windows: List[Tuple[List[int], List[int]]] = list(
                map(
                    make_disjoint_window,
                    get_rolling_token_windows(
                        token_list=self.tok_encode(string),
                        prefix_token=self.prefix_token_id,
                        # max_seq_len - (1 for context)
                        max_seq_len=self.max_length - 1,
                        context_len=1,
                    ),
                )
            )

            # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
            windows = [(None,) + x for x in rolling_token_windows]

            # Store windows with their request index
            all_windows.extend((req_idx, window) for window in windows)
            request_window_counts.append(len(windows))

        all_nlls = []
        batch_size = adaptive_batch_size or int(self.batch_size)
        for i in range(0, len(all_windows), batch_size):
            batch = all_windows[i : i + batch_size]
            # Extract just the windows for processing, keeping track of request indices
            batch_indices, batch_windows = zip(*batch)

            batch_nlls = self._loglikelihood_tokens(
                requests=batch_windows,
                disable_tqdm=False,
            )
            # Store results with their request indices
            all_nlls.extend(zip(batch_indices, batch_nlls))

        # Reconstruct per-request loglikelihoods
        loglikelihoods = []
        current_idx = 0
        for window_count in request_window_counts:
            # Get all nlls for this request
            request_nlls = all_nlls[current_idx : current_idx + window_count]
            # Sum up the nlls for this request (discarding is_greedy)
            request_total = sum(nll[0] for _, nll in request_nlls)
            loglikelihoods.append(request_total)
            current_idx += window_count

            string = requests[len(loglikelihoods) - 1].args[0]
            self.cache_hook.add_partial(
                "loglikelihood_rolling", (string,), request_total
            )

        return loglikelihoods

    def generate_until(
@@ -290,7 +354,9 @@ class VLLM(TemplateLM):
        # batch tokenize contexts
        context, all_gen_kwargs = zip(*(req.args for req in requests))
        context_encoding: List[List[int]] = self.tok_encode(
            context, add_special_tokens=self.add_bos_token
        )
        requests = [
            ((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs)
        ]
@@ -318,6 +384,7 @@
            desc="Running generate_until requests",
        )
        # for each different set of kwargs, we execute all requests, by batch.
        eos = self.tokenizer.decode(self.eot_token_id)
        for chunk in chunks:
            context_and_encoding, all_gen_kwargs = zip(*chunk)
            context, context_encoding = zip(*context_and_encoding)
@@ -325,27 +392,14 @@
            # this is safe to assume because the `grouper` object ensures it.
            gen_kwargs = all_gen_kwargs[0]
            # unpack our keyword arguments.
            if isinstance(gen_kwargs, dict):
                kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
                # add EOS token to stop sequences
                until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
            else:
                raise ValueError(
                    f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                )
            if "max_gen_toks" in kwargs.keys():
                max_gen_toks = kwargs.pop("max_gen_toks")
            else:
@@ -425,8 +479,10 @@
                res.append(answer)

                if cache_key is not None:
                    # special case: loglikelihood_rolling produces a number of loglikelihood requests
                    # all with cache key None. instead do add_partial on the per-example level
                    # in the loglikelihood_rolling() function for those.
                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)
                pbar.update(1)

        pbar.close()
...
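# A minimal sketch (not part of this commit; the numbers are invented) of the
# per-request window bookkeeping used in loglikelihood_rolling above: nlls come
# back as one flat list and are re-split per request.
#
#   request_window_counts = [3, 2]                 # windows per request
#   all_nlls = [(0, (-1.0, True)), (0, (-2.0, True)), (0, (-0.5, True)),
#               (1, (-4.0, True)), (1, (-1.5, True))]
#   # request 0 total: -1.0 + -2.0 + -0.5 == -3.5
#   # request 1 total: -4.0 + -1.5 == -5.5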
import copy
from typing import Dict, List, Optional
import transformers
from more_itertools import distribute
from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
Collator,
handle_stop_sequences,
replace_placeholders,
undistribute,
)
from lm_eval.models.vllm_causallms import VLLM
from lm_eval.utils import eval_logger
try:
import ray
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest # noqa: F401
from vllm.transformers_utils.tokenizer import get_tokenizer # noqa: F401
except ModuleNotFoundError:
pass
DEFAULT_IMAGE_PLACEHOLDER = "<image>"
@register_model("vllm-vlm")
class VLLM_VLM(VLLM):
MULTIMODAL = True
def __init__(
self,
pretrained: str,
trust_remote_code: Optional[bool] = False,
revision: Optional[str] = None,
interleave: bool = True,
# TODO<baber>: handle max_images and limit_mm_per_prompt better
max_images: int = 999,
**kwargs,
):
if max_images != 999:
kwargs["limit_mm_per_prompt"] = {"image": max_images}
eval_logger.info(f"Setting limit_mm_per_prompt[image] to {max_images}")
super().__init__(
pretrained=pretrained,
trust_remote_code=trust_remote_code,
revision=revision,
**kwargs,
)
self.interleave = interleave
self.max_images = max_images
self.processor = transformers.AutoProcessor.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
)
self.chat_applied: bool = False
def tok_batch_multimodal_encode(
self,
strings: List[str], # note that input signature of this fn is different
images, # TODO: typehint on this
left_truncate_len: int = None,
truncation: bool = False,
):
images = [img[: self.max_images] for img in images]
# TODO<baber>: is the default placeholder always <image>?
if self.chat_applied is False:
strings = [
replace_placeholders(
string,
DEFAULT_IMAGE_PLACEHOLDER,
DEFAULT_IMAGE_PLACEHOLDER,
self.max_images,
)
for string in strings
]
outputs = []
for x, i in zip(strings, images):
inputs = {
"prompt": x,
"multi_modal_data": {"image": i},
}
outputs.append(inputs)
return outputs
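    # Illustrative shape of the returned payload (a sketch, not part of this commit),
    # assuming two prompts with one PIL image each:
    #
    #   [
    #       {"prompt": "<image> describe this image", "multi_modal_data": {"image": [img0]}},
    #       {"prompt": "<image> and this one", "multi_modal_data": {"image": [img1]}},
    #   ]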
def _model_generate(
self,
requests: List[List[dict]] = None,
generate: bool = False,
max_tokens: int = None,
stop: Optional[List[str]] = None,
**kwargs,
):
if generate:
kwargs = self.modify_gen_kwargs(kwargs)
sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
else:
sampling_params = SamplingParams(
temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
)
if self.data_parallel_size > 1:
# vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
# also seems to only work with decorator and not with ray.remote() fn
# see https://github.com/vllm-project/vllm/issues/973
# note: this has changed on 0.3.3, and it only works now if num_gpus are set.
# but then tensor_parallel breaks
@ray.remote
def run_inference_one_model(
model_args: dict, sampling_params, requests: List[List[dict]]
):
llm = LLM(**model_args)
return llm.generate(requests, sampling_params=sampling_params)
# dispatch requests to all self.data_parallel_size workers, in interleaved fashion
# interleaved important to balance context lengths across workers
requests = [list(x) for x in distribute(self.data_parallel_size, requests)]
inputs = ((self.model_args, sampling_params, req) for req in requests)
object_refs = [run_inference_one_model.remote(*x) for x in inputs]
results = ray.get(object_refs)
# Invoke ray.shutdown() to prevent hang-ups if subsequent calls required.
ray.shutdown()
# flatten results
return undistribute(results)
if self.lora_request is not None:
outputs = self.model.generate(
requests,
sampling_params=sampling_params,
use_tqdm=True if self.batch_size == "auto" else False,
lora_request=self.lora_request,
)
else:
outputs = self.model.generate(
requests,
sampling_params=sampling_params,
use_tqdm=True if self.batch_size == "auto" else False,
)
return outputs
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
self.chat_applied = True
if not self.interleave:
for content in chat_history:
c = []
text = content["content"]
# Count and remove image placeholders
image_count = min(
self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER)
)
text = text.replace(DEFAULT_IMAGE_PLACEHOLDER, "")
# Add image entries
for _ in range(image_count):
c.append({"type": "image", "image": None})
# Add single text entry at the end
c.append({"type": "text", "text": text})
content["content"] = c
else:
for content in chat_history:
c = []
text = content["content"]
expected_image_count = min(
self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER)
)
actual_image_count = 0
text_parts = text.split(DEFAULT_IMAGE_PLACEHOLDER)
for i, part in enumerate(text_parts):
# TODO: concatenate text parts (esp. if skipping images)?
if part: # Add non-empty text parts
c.append({"type": "text", "text": part})
if (
(i < len(text_parts) - 1) and i < self.max_images
): # Add image placeholder after each split except the last
c.append({"type": "image"})
actual_image_count += 1
content["content"] = c
if actual_image_count != expected_image_count:
raise ValueError(
f"Mismatch in image placeholder count. Expected: {expected_image_count}, Actual: {actual_image_count}"
)
return self.processor.apply_chat_template(
chat_history, add_generation_prompt=True
)
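    # Illustrative example of the interleaved conversion above (not part of this
    # commit): a user turn of "<image> What is shown? <image> And here?" with
    # max_images >= 2 becomes
    #
    #   [
    #       {"type": "image"},
    #       {"type": "text", "text": " What is shown? "},
    #       {"type": "image"},
    #       {"type": "text", "text": " And here?"},
    #   ]
    #
    # before the processor's chat template is applied.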
def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
# TODO: support text-only reqs
res = []
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = self.tok_encode(x[0])
return -len(toks), x[0]
pbar = tqdm(
total=len(requests),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running generate_until requests with text+image input",
)
# TODO: port auto-batch sizing into this.
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
re_ords = Collator(
[reg.args for reg in requests],
_collate,
group_by="gen_kwargs",
group_fn=lambda x: x[1],
)
chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
eos = self.tokenizer.decode(self.eot_token_id)
for chunk in chunks:
contexts, all_gen_kwargs, aux_arguments = zip(*chunk)
visuals = [arg["visual"] for arg in aux_arguments]
if not isinstance(contexts, list):
contexts = list(
contexts
) # for Qwen2-VL, processor is unhappy accepting a tuple of strings instead of a list.
# TODO: could we upstream this workaround to HF?
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
# add EOS token to stop sequences
until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
max_ctx_len = self.max_length - max_gen_toks
inputs = self.tok_batch_multimodal_encode(
contexts,
visuals,
left_truncate_len=max_ctx_len,
)
cont = self._model_generate(inputs, stop=until, generate=True, **kwargs)
for output, context in zip(cont, contexts):
generated_text = output.outputs[0].text
res.append(generated_text)
self.cache_hook.add_partial(
"generate_until", (context, gen_kwargs), generated_text
)
pbar.update(1)
# reorder this group of results back to original unsorted form
res = re_ords.get_original(res)
pbar.close()
return res
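# Illustrative invocation (a sketch, not part of this commit; the pretrained model
# and task names are only examples):
#
#   lm_eval --model vllm-vlm \
#       --model_args pretrained=Qwen/Qwen2-VL-2B-Instruct,max_images=2 \
#       --tasks mmmu_val \
#       --apply_chat_template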
@@ -29,8 +29,8 @@ def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None
    if category_name == "promptsource":
        try:
            from promptsource.templates import DatasetTemplates
        except ModuleNotFoundError as exception:
            raise type(exception)(
                "Tried to load a Promptsource template, but promptsource is not installed ",
                "please install promptsource via pip install lm-eval[promptsource] or pip install -e .[promptsource]",
            )
@@ -118,7 +118,7 @@ class PromptString:
        # TODO need a way to process doc_to_choice
        if "doc_to_choice" in self.prompt_string:
            raise NotImplementedError("Not yet implemented to accept doc_to_choice")

        text_string = utils.apply_template(doc_to_text, doc)
        target_string = utils.apply_template(doc_to_target, doc)
...
...@@ -11,19 +11,27 @@ ...@@ -11,19 +11,27 @@
| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic | | [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese | | [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English | | [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
| [arabic_leaderboard_complete](arabic_leaderboard_complete/README.md) | A full version of the tasks in the Open Arabic LLM Leaderboard, focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
| [arabic_leaderboard_light](arabic_leaderboard_light/README.md) | A light version of the tasks in the Open Arabic LLM Leaderboard (i.e., 10% samples of the test set in the original benchmarks), focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
| [AraDICE](aradice/README.md) | A collection of multiple tasks carefully designed to evaluate dialectal and cultural capabilities in large language models (LLMs). | Arabic |
| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English | | [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English |
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English | | [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English | | [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
| [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English | | [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English |
| [basque_bench](basque_bench/README.md) | Collection of tasks in Basque encompassing various evaluation areas. | Basque |
| [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque | | [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque |
| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German | | [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) | | [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | | | benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple | | [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English | | [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
| [catalan_bench](catalan_bench/README.md) | Collection of tasks in Catalan encompassing various evaluation areas. | Catalan |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese | | [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese | | [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby | | code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian | | [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English | | [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French | | [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
...@@ -37,6 +45,8 @@ ...@@ -37,6 +45,8 @@
| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English | | [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
| [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English | | [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English |
| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French| | [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French|
| [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. | Galician |
| [global_mmlu](global_mmlu/README.md) | Collection of culturally sensitive and culturally agnostic MMLU tasks in 15 languages with human translations or post-edits. | Multiple (15 languages) |
| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English | | [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
| [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English | | [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English |
| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English | | [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
...@@ -47,6 +57,9 @@ ...@@ -47,6 +57,9 @@
| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English | | [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
| [humaneval](humaneval/README.md) | Code generation task that measure functional correctness for synthesizing programs from docstrings. | Python | | [humaneval](humaneval/README.md) | Code generation task that measure functional correctness for synthesizing programs from docstrings. | Python |
| [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English | | [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English |
| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English |
| [japanese_leaderboard](japanese_leaderboard/README.md) | Japanese language understanding tasks to benchmark model performance on various linguistic aspects. | Japanese |
| [kbl](kbl/README.md) | Korean Benchmark for Legal Language Understanding. | Korean |
| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean | | [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean |
| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean | | [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean |
| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean | | [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
...@@ -54,28 +67,36 @@ ...@@ -54,28 +67,36 @@
| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English | | [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English |
| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian | | [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese | | [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). Those tasks are static and will not change through time | English |
| [lingoly](lingoly/README.md) | Challenging logical reasoning benchmark in low-resource languages with controls for memorization | English, Multilingual |
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese | | [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese | | [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English | | [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
| [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English | | [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English |
| [med_concepts_qa](med_concepts_qa/README.md) | Benchmark for evaluating LLMs on their abilities to interpret medical codes and distinguish between medical concept. | English |
| [metabench](metabench/README.md) | Distilled versions of six popular benchmarks which are highly predictive of overall benchmark performance and of a single general ability latent trait. | English |
| medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English | | medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English |
| medqa | Multiple choice question answering based on the United States Medical License Exams. | | | medqa | Multiple choice question answering based on the United States Medical License Exams. | |
| [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu | | [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu |
| [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English | | [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English |
| mmlu | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English | | [mmlu](mmlu/README.md) | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
| [mmlu_pro](mmlu_pro/README.md) | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English |
| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English |
| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | | | model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
| [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English | | [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English |
| [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English | | [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English |
| [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** | | [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
| [okapi/hellaswag_multilingual](okapi/hellaswag_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (30 languages) | | [okapi/hellaswag_multilingual](okapi/hellaswag_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (30 languages) **Machine Translated.** |
| okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) | | okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) **Machine Translated.** |
| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) | | [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English | | [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean | | [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English | | [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English | | [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
| [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English | | [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English |
| [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish | | [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish |
| [portuguese_bench](portuguese_bench/README.md) | Collection of tasks in European Portuguese encompassing various evaluation areas. | Portuguese |
| [prost](prost/README.md) | Tasks requiring understanding of professional standards and ethics in various domains. | English | | [prost](prost/README.md) | Tasks requiring understanding of professional standards and ethics in various domains. | English |
| [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English | | [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English |
| [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English | | [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English |
...@@ -83,8 +104,10 @@ ...@@ -83,8 +104,10 @@
| [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English | | [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English |
| realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. | | | realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. | |
| [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English | | [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English |
| [score](score/README.md) | Systematic consistency and robustness evaluation for LLMs on 3 datasets(MMLU-Pro, Agi Eval and MATH) | English |
| [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English | | [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English |
| [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English | | [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English |
| [spanish_bench](spanish_bench/README.md) | Collection of tasks in Spanish encompassing various evaluation areas. | Spanish |
| [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. | English |
| [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English |
| [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English |
@@ -97,6 +120,7 @@
| [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese |
| [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English |
| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English |
| [turkishmmlu](turkishmmlu/README.md) | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams. | Turkish |
| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English |
| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English |
| [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English |
@@ -106,7 +130,8 @@
| [wmt2016](wmt2016/README.md) | Tasks from the WMT 2016 shared task, focusing on translation between multiple languages. | English, Czech, German, Finnish, Russian, Romanian, Turkish |
| [wsc273](wsc273/README.md) | The Winograd Schema Challenge, a test of commonsense reasoning and coreference resolution. | English |
| [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese |
| [xnli](xnli/README.md) | Cross-Lingual Natural Language Inference to test understanding across different languages. | Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese |
| [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque |
| [xquad](xquad/README.md) | Cross-lingual Question Answering Dataset in multiple languages. | Arabic, German, Greek, English, Spanish, Hindi, Romanian, Russian, Thai, Turkish, Vietnamese, Chinese |
| [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese |
| [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese |
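
For reference, the names in this table can also be enumerated programmatically through the `TaskManager` registry defined below; a minimal sketch (assuming `lm_eval` is installed and the bundled `lm_eval/tasks` directory is used):

```python
from lm_eval.tasks import TaskManager

tm = TaskManager()            # indexes lm_eval/tasks by default
print(len(tm.all_tasks))      # all registered names: tasks, groups, and tags
print(tm.all_groups[:5])      # group names only
print(tm.all_tags[:5])        # tag names only
print(tm.list_all_tasks())    # the same information rendered as Markdown tables
```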
import collections
import inspect
import logging
import os
from functools import partial
from typing import Dict, List, Mapping, Optional, Union

from lm_eval import utils
from lm_eval.api.group import ConfigurableGroup, GroupConfig
from lm_eval.api.task import ConfigurableTask, Task
from lm_eval.evaluator_utils import get_subtask_list

GROUP_ONLY_KEYS = list(GroupConfig().to_dict().keys())


class TaskManager:
@@ -30,6 +36,20 @@ class TaskManager:
        )
        self._all_tasks = sorted(list(self._task_index.keys()))

        self._all_groups = sorted(
            [x for x in self._all_tasks if self._task_index[x]["type"] == "group"]
        )
        self._all_subtasks = sorted(
            [
                x
                for x in self._all_tasks
                if self._task_index[x]["type"] in ["task", "python_task"]
            ]
        )
        self._all_tags = sorted(
            [x for x in self._all_tasks if self._task_index[x]["type"] == "tag"]
        )

        self.task_group_map = collections.defaultdict(list)

    def initialize_tasks(
@@ -67,10 +87,88 @@ class TaskManager:
    def all_tasks(self):
        return self._all_tasks

    @property
    def all_groups(self):
        return self._all_groups

    @property
    def all_subtasks(self):
        return self._all_subtasks

    @property
    def all_tags(self):
        return self._all_tags

    @property
    def task_index(self):
        return self._task_index

    def list_all_tasks(
        self, list_groups=True, list_tags=True, list_subtasks=True
    ) -> str:
        from pytablewriter import MarkdownTableWriter

        def sanitize_path(path):
            # don't print the full path if we are within the lm_eval/tasks dir!
            # if we aren't though, provide the full path.
            if "lm_eval/tasks/" in path:
                return "lm_eval/tasks/" + path.split("lm_eval/tasks/")[-1]
            else:
                return path

        group_table = MarkdownTableWriter()
        group_table.headers = ["Group", "Config Location"]
        gt_values = []
        for g in self.all_groups:
            path = self.task_index[g]["yaml_path"]
            if path == -1:
                path = "---"
            else:
                path = sanitize_path(path)
            gt_values.append([g, path])
        group_table.value_matrix = gt_values

        tag_table = MarkdownTableWriter()
        tag_table.headers = ["Tag"]
        tag_table.value_matrix = [[t] for t in self.all_tags]

        subtask_table = MarkdownTableWriter()
        subtask_table.headers = ["Task", "Config Location", "Output Type"]
        st_values = []
        for t in self.all_subtasks:
            path = self.task_index[t]["yaml_path"]
            output_type = ""
            # read the yaml file to determine the output type
            if path != -1:
                config = utils.load_yaml_config(path, mode="simple")
                if "output_type" in config:
                    output_type = config["output_type"]
                elif (
                    "include" in config
                ):  # if no output type, check if there is an include with an output type
                    # resolve the include relative to the directory of the including yaml
                    include_path = "/".join(path.split("/")[:-1] + [config["include"]])
                    include_config = utils.load_yaml_config(include_path, mode="simple")
                    if "output_type" in include_config:
                        output_type = include_config["output_type"]
            if path == -1:
                path = "---"
            else:
                path = sanitize_path(path)
            st_values.append([t, path, output_type])
        subtask_table.value_matrix = st_values

        result = "\n"
        if list_groups:
            result += group_table.dumps() + "\n\n"
        if list_tags:
            result += tag_table.dumps() + "\n\n"
        if list_subtasks:
            result += subtask_table.dumps() + "\n\n"
        return result
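
    # Illustrative usage (not executed here; assumes the default bundled task directory):
    #   tm = TaskManager()
    #   print(tm.list_all_tasks(list_tags=False, list_subtasks=False))  # groups table only
    # Each flag simply toggles whether the corresponding Markdown table is appended.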

    def match_tasks(self, task_list):
        return utils.pattern_match(task_list, self.all_tasks)
@@ -80,7 +178,12 @@ class TaskManager:
        return False

    def _name_is_task(self, name) -> bool:
        if self._name_is_registered(name) and (self.task_index[name]["type"] == "task"):
            return True
        return False

    def _name_is_tag(self, name) -> bool:
        if self._name_is_registered(name) and (self.task_index[name]["type"] == "tag"):
            return True
        return False
@@ -141,89 +244,126 @@ class TaskManager:
                config["group_alias"] = None
        return config

    def _class_has_config_in_constructor(self, cls):
        constructor = getattr(cls, "__init__", None)
        return (
            "config" in inspect.signature(constructor).parameters
            if constructor
            else False
        )

    def _load_individual_task_or_group(
        self,
        name_or_config: Optional[Union[str, dict]] = None,
        parent_name: Optional[str] = None,
        update_config: Optional[dict] = None,
    ) -> Mapping:
        def _load_task(config, task):
            if "include" in config:
                config = {
                    **utils.load_yaml_config(
                        yaml_path=None,
                        yaml_config={"include": config.pop("include")},
                        mode="full",
                    ),
                    **config,
                }
            if self._config_is_python_task(config):
                if self._class_has_config_in_constructor(config["class"]):
                    task_object = config["class"](config=config)
                else:
                    task_object = config["class"]()
                if isinstance(task_object, ConfigurableTask):
                    # very scuffed: set task name here. TODO: fixme?
                    task_object.config.task = task
            else:
                task_object = ConfigurableTask(config=config)

            return {task: task_object}

        def _get_group_and_subtask_from_config(config):
            group_name = ConfigurableGroup(config=config)
            subtask_list = []
            for task in group_name.config["task"]:
                if isinstance(task, str) and self._name_is_tag(task):
                    subtask_list.extend(self._get_tasklist(task))
                else:
                    subtask_list.append(task)
            return group_name, subtask_list

        def _process_group_config(config, update_config=None):
            if update_config is not None:
                config = {**config, **update_config}
            _update_config = {
                k: v for k, v in config.items() if k not in GROUP_ONLY_KEYS
            }
            if not bool(_update_config):
                _update_config = None

            group_config = {k: v for k, v in config.items() if k in GROUP_ONLY_KEYS}
            return group_config, _update_config

        if isinstance(name_or_config, str):
            if update_config is not None:
                # Process name_or_config as a dict instead
                name_or_config = {"task": name_or_config, **update_config}
            elif self._name_is_task(name_or_config) or self._name_is_python_task(
                name_or_config
            ):
                task_config = self._get_config(name_or_config)
                return _load_task(task_config, task=name_or_config)
            else:
                subtask_list = self._get_tasklist(name_or_config)
                if subtask_list == -1:
                    group_config = self._get_config(name_or_config)
                    group_config, update_config = _process_group_config(group_config)
                    group_name, subtask_list = _get_group_and_subtask_from_config(
                        group_config
                    )
                else:
                    if self._name_is_tag(name_or_config):
                        fn = partial(
                            self._load_individual_task_or_group,
                            update_config=name_or_config
                            if isinstance(name_or_config, dict)
                            else None,
                        )
                        return dict(
                            collections.ChainMap(*map(fn, reversed(subtask_list)))
                        )
                    else:
                        group_name = ConfigurableGroup(
                            config={"group": name_or_config, "task": subtask_list}
                        )

        if isinstance(name_or_config, dict):
            if self._config_is_task(name_or_config):
                name = name_or_config.pop("task")
                if update_config is not None:
                    name_or_config = {**name_or_config, **update_config}
                # If the name is registered as a group
                if self._name_is_group(name):
                    group_config = self._get_config(name)
                    group_config, update_config = _process_group_config(
                        group_config, name_or_config
                    )
                    group_name, subtask_list = _get_group_and_subtask_from_config(
                        group_config
                    )
                elif self._name_is_tag(name):
                    subtask_list = self._get_tasklist(name)
                    fn = partial(
                        self._load_individual_task_or_group,
                        update_config=name_or_config,
                    )
                    return dict(collections.ChainMap(*map(fn, reversed(subtask_list))))
                else:
                    if self._name_is_registered(name):
                        base_task_config = self._get_config(name)

                        # Check if this is a duplicate.
                        if parent_name is not None:
                            num_duplicate = len(
                                list(
                                    filter(
@@ -242,34 +382,21 @@ class TaskManager:
                        }
                    else:
                        task_config = name_or_config
                    return _load_task(task_config, task=name)
            else:
                group_config, update_config = _process_group_config(name_or_config)
                group_name, subtask_list = _get_group_and_subtask_from_config(
                    group_config
                )

        fn = partial(
            self._load_individual_task_or_group,
            parent_name=group_name,
            update_config=update_config,
        )
        return {
            group_name: dict(collections.ChainMap(*map(fn, reversed(subtask_list))))
        }

    def load_task_or_group(self, task_list: Optional[Union[str, list]] = None) -> dict:
        """Loads a dictionary of task objects from a list
@@ -293,10 +420,11 @@ class TaskManager:
    def _get_task_and_group(self, task_dir: str):
        """Creates a dictionary of tasks index with the following metadata,
        - `type`, that can be either `task`, `python_task`, `group`, or `tag`.
            `task` refers to regular task configs, `python_task` are special
            yaml files that only consist of `task` and `class` parameters.
            `group` are group configs. `tag` entries are labels that can be assigned
            to tasks to assist in sorting and calling tasks of certain themes.
        - `yaml_path`, path to the yaml file. If the entry is a `group` that
            was configured through a task config, the yaml_path will be -1
            and all subtasks will be listed in `task` (see below)
@@ -312,6 +440,32 @@ class TaskManager:
        :return
            Dictionary of task names as key and task metadata
        """

        def _populate_tags_and_groups(config, task, tasks_and_groups, print_info):
            # TODO: remove group in next release
            if "tag" in config:
                attr_list = config["tag"]
                if isinstance(attr_list, str):
                    attr_list = [attr_list]

                for tag in attr_list:
                    if tag not in tasks_and_groups:
                        tasks_and_groups[tag] = {
                            "type": "tag",
                            "task": [task],
                            "yaml_path": -1,
                        }
                    elif tasks_and_groups[tag]["type"] != "tag":
                        self.logger.info(
                            f"The tag '{tag}' is already registered as a group, this tag will not be registered. "
                            "This may affect tasks you want to call."
                        )
                        break
                    else:
                        tasks_and_groups[tag]["task"].append(task)

        # TODO: remove group in next release
        print_info = True
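        # Illustrative effect (hypothetical tag name): a task yaml containing `tag: some_tag`
        # registers tasks_and_groups["some_tag"] = {"type": "tag", "task": [<task>], "yaml_path": -1}
        # on first sight, and appends to that entry's "task" list for later tasks with the same tag.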

        ignore_dirs = [
            "__pycache__",
            ".ipynb_checkpoints",
@@ -325,10 +479,14 @@ class TaskManager:
                    config = utils.load_yaml_config(yaml_path, mode="simple")
                    if self._config_is_python_task(config):
                        # This is a python class config
                        task = config["task"]
                        tasks_and_groups[task] = {
                            "type": "python_task",
                            "yaml_path": yaml_path,
                        }
                        _populate_tags_and_groups(
                            config, task, tasks_and_groups, print_info
                        )
                    elif self._config_is_group(config):
                        # This is a group config
                        tasks_and_groups[config["group"]] = {
@@ -357,21 +515,9 @@ class TaskManager:
                            "type": "task",
                            "yaml_path": yaml_path,
                        }
                        _populate_tags_and_groups(
                            config, task, tasks_and_groups, print_info
                        )
                    else:
                        self.logger.debug(f"File {f} in {root} could not be loaded")
@@ -400,6 +546,33 @@ def get_task_name_from_object(task_object):
    )


def _check_duplicates(task_dict: dict) -> List[str]:
    """helper function solely used in validating get_task_dict output.
    Takes the output of lm_eval.evaluator_utils.get_subtask_list and
    returns a list of all leaf subtasks contained within, and errors if any such leaf subtasks are
    "oversubscribed" to several disjoint groups.
    """
    subtask_names = []
    for key, value in task_dict.items():
        subtask_names.extend(value)

    duplicate_tasks = {
        task_name for task_name in subtask_names if subtask_names.count(task_name) > 1
    }

    # locate the potentially problematic groups that seem to 'compete' for constituent subtasks
    competing_groups = [
        group
        for group in task_dict.keys()
        if len(set(task_dict[group]).intersection(duplicate_tasks)) > 0
    ]

    if len(duplicate_tasks) > 0:
        raise ValueError(
            f"Found 1 or more tasks while trying to call get_task_dict() that were members of more than 1 called group: {list(duplicate_tasks)}. Offending groups: {competing_groups}. Please call groups which overlap their constituent tasks in separate evaluation runs."
        )
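
# Illustrative failure mode (hypothetical names): if two requested groups both contain
# "shared_subtask", get_subtask_list() yields something like
#   {"group_a": ["shared_subtask", "a1"], "group_b": ["shared_subtask"]}
# and _check_duplicates raises, asking for the overlapping groups to be evaluated separately.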


def get_task_dict(
    task_name_list: Union[str, List[Union[str, Dict, Task]]],
    task_manager: Optional[TaskManager] = None,
@@ -417,6 +590,7 @@ def get_task_dict(
    :return
        Dictionary of task objects
    """

    task_name_from_string_dict = {}
    task_name_from_config_dict = {}
    task_name_from_object_dict = {}
@@ -463,8 +637,16 @@ def get_task_dict(
        ):
            raise ValueError

    final_task_dict = {
        **task_name_from_string_dict,
        **task_name_from_config_dict,
        **task_name_from_object_dict,
    }

    # behavior can get odd if one tries to invoke several groups that "compete" for the same task.
    # (notably, because one could request several num_fewshot values at once in GroupConfig overrides for the subtask
    # and we'd be unsure which to use and report.)
    # we explicitly check and error in this case.
    _check_duplicates(get_subtask_list(final_task_dict))

    return final_task_dict
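
# Minimal end-to-end sketch (illustrative; "some_task" and "some_group" are placeholder names):
#
#   from lm_eval.tasks import TaskManager, get_task_dict
#
#   tm = TaskManager()
#   task_dict = get_task_dict(["some_task", "some_group"], task_manager=tm)
#   # keys are task names or ConfigurableGroup entries; values are Task objects
#   # (or nested dicts of them for groups), ready to be handed to the evaluator.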
@@ -14,7 +14,7 @@ Homepage: https://github.com/isen-zhang/ACLUE
```bibtex
@inproceedings{zhang-li-2023-large,
    title = "Can Large Language Model Comprehend {A}ncient {C}hinese? A Preliminary Test on {ACLUE}",
    author = "Zhang, Yixuan and Li, Haonan",
    booktitle = "Proceedings of the Ancient Language Processing Workshop",
    month = sep,
@@ -26,7 +26,7 @@ Homepage: https://github.com/isen-zhang/ACLUE
}
```

### Groups, Tags, and Tasks

#### Groups
...
group: aclue
task:
  - aclue_ancient_chinese_culture
  - aclue_ancient_literature
  - aclue_ancient_medical
  - aclue_ancient_phonetics
  - aclue_basic_ancient_chinese
  - aclue_couplet_prediction
  - aclue_homographic_character_resolution
  - aclue_named_entity_recognition
  - aclue_poetry_appreciate
  - aclue_poetry_context_prediction
  - aclue_poetry_quality_assessment
  - aclue_poetry_sentiment_analysis
  - aclue_polysemy_resolution
  - aclue_reading_comprehension
  - aclue_sentence_segmentation
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: true
  - metric: acc_norm
    aggregation: mean
    weight_by_size: true
metadata:
  version: 1.0
dataset_path: tyouisen/aclue
test_split: test
fewshot_split: dev
@@ -16,4 +15,4 @@ metric_list:
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
@@ -24,11 +24,11 @@ Homepage for Arabic EXAMS: [EXAMS Arabic Homepage](https://github.com/FreedomInt
### Citation

### Groups, Tags, and Tasks

#### Groups

- `aexams`: Arabic EXAMS dataset, including IslamicStudies, Biology, Science, Physics, Social subjects.

#### Tasks
...
group: aexams
task:
  - aexams_Biology
  - aexams_IslamicStudies
  - aexams_Physics
  - aexams_Science
  - aexams_Social
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: true
  - metric: acc_norm
    aggregation: mean
    weight_by_size: true
metadata:
  version: 1.0
dataset_path: Hennara/aexams
test_split: test
fewshot_split: dev
@@ -16,4 +15,4 @@ metric_list:
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0