Merge branch 'main' into humaneval

# Conflicts:
#   lm_eval/api/task.py

Merge branch 'main' into humaneval
# Conflicts:
#   lm_eval/api/task.py
173b2bc3 · Baber · 74344829 · bb098f13 · 173b2bc3 · 173b2bc3
Commit 173b2bc3 authored Jan 10, 2025 by Baber
20 changed files
--- a/lm_eval/api/model.py
+++ b/lm_eval/api/model.py
@@ -3,7 +3,7 @@ import hashlib
 import json
 import logging
 import os
-from typing import Dict, List, Optional, Tuple, Type, TypeVar
+from typing import Dict, List, Optional, Tuple, Type, TypeVar, Union
 import transformers
 from sqlitedict import SqliteDict
@@ -55,7 +55,7 @@ class LM(abc.ABC):
        pass
    @abc.abstractmethod
-    def loglikelihood_rolling(self, requests) -> List[Tuple[float]]:
+    def loglikelihood_rolling(self, requests) -> List[float]:
        """Compute full log-likelihood of a string, with no truncation, for perplexity computation
        - We will use the full max context length of the model.
        - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
@@ -101,14 +101,13 @@ class LM(abc.ABC):
        """Generate greedily until a stopping sequence
        :param requests: list[Instance]
-            A list of Instance objects with property `args` which returns a tuple (context, until).
+            A list of Instance objects with property `args` which returns a tuple (context, gen_kwargs).
            context: str
                Context string
-            until: [str]
+            gen_kwargs: dict
-                The string sequences to generate until. These string sequences
+                A dictionary of keyword arguments to pass to the generation function e.g. top_k, until, etc.
-                may each span across multiple tokens, or may be part of one token.
        :return: list[str]
-            A list of strings continuation
+            A list of model generated continuations.
            continuation: str
                The generated continuation.
        """
@@ -193,15 +192,13 @@ class LM(abc.ABC):
            "To use this model with chat templates, please implement the 'tokenizer_name' property."
        )
-    @property
+    def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
-    def chat_template(self) -> str:
+        """Returns the chat template structure for user/assistant messages if a template is provided.
-        """Must be defined for LM subclasses that implement Chat Templating.
+        This method is intended to be overridden in a subclass to define a specific chat template format.
-        Should return the structure of the chat template applied to user/assistant messages.
+        For models that do not support chat templates, this method returns None by default.
-        This is used only to save in the experiment results for reproducibility.
        """
-        raise NotImplementedError(
-            "To use this model with chat templates, please implement the 'chat_template' property."
+        return ""
-        )
    def set_cache_hook(self, cache_hook) -> None:
        self.cache_hook = cache_hook
@@ -246,9 +243,10 @@ class CachingLM:
        # add hook to lm
        lm.set_cache_hook(self.get_cache_hook())
-    def __getattr__(self, attr):
+    def __getattr__(self, attr: str):
        lm_attr = getattr(self.lm, attr)
-        if not callable(lm_attr):
+        if attr not in ["loglikelihood", "loglikelihood_rolling", "generate_until"]:
+            eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM")
            return lm_attr
        def fn(requests):
@@ -283,8 +281,11 @@ class CachingLM:
            eval_logger.info(
                f"Cached requests: {len(requests) - len(remaining_reqs)}, Requests remaining: {len(remaining_reqs)}"
            )
-            # actually run the LM on the requests that do not have cached results
+            if remaining_reqs:
-            rem_res = getattr(self.lm, attr)(remaining_reqs)
+                # actually run the LM on the requests that do not have cached results
+                rem_res = getattr(self.lm, attr)(remaining_reqs)
+            else:
+                rem_res = []
            # stick the new ones back into the list and also cache any of the new ones
            resptr = 0
@@ -313,6 +314,8 @@ class TemplateLM(LM):
    and boilerplate often included in other LM subclasses.
    """
+    tokenizer = None
    @property
    @abc.abstractmethod
    def eot_token_id(self):
@@ -324,14 +327,19 @@ class TemplateLM(LM):
        return self.eot_token_id
    @abc.abstractmethod
-    def tok_encode(self, string: str, **kwargs):
+    def tok_encode(self, string: str, **kwargs) -> List[int]:
+        """
+        Tokenize a string using the model's tokenizer and return a list of token IDs.
+        """
        pass
    @abc.abstractmethod
-    def _loglikelihood_tokens(self, requests, **kwargs):
+    def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
        pass
-    def _encode_pair(self, context, continuation):
+    def _encode_pair(
+        self, context: str, continuation: str
+    ) -> Tuple[List[int], List[int]]:
        n_spaces = len(context) - len(context.rstrip())
        if n_spaces > 0:
            continuation = context[-n_spaces:] + continuation
@@ -372,9 +380,110 @@ class TemplateLM(LM):
    @abc.abstractmethod
    def loglikelihood_rolling(
        self, requests, disable_tqdm: bool = False
-    ) -> List[Tuple[float, bool]]:
+    ) -> List[float]:
        pass
    @abc.abstractmethod
    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
        pass
+    def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
+        """
+        Set and get the appropriate chat template for the model.
+        This method sets the tokenizer's chat_template and returns the template string for reproducibility.
+        The template selection logic is adapted from the Transformers library's `apply_chat_template`
+        method in the Tokenizer class. The original implementation can be found at:
+        https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1687
+        This method ensures that the right template is chosen based on the following:
+        0. If the model has no 'tokenizer' attribute: assumes that there is only a single possible chat template, handled on the model provider side internally. Returns the empty string.
+        1. If the model's tokenizer has multiple templates:
+            a. Use the specified template if it exists in the dictionary.
+            b. Use the default template from the list if no specific template is provided.
+            c. Raise an error if no default template exists and no specific template is provided.
+        2. If the model's tokenizer has a single template or no template:
+            a. Use the tokenizer's chat template if available.
+            b. Fall back to the default chat template if no tokenizer chat template exists.
+        Args:
+            chat_template (Union[bool, str]): Specifies the chat template to use.
+                - If False or None, no template is applied.
+                - If True, the default or only available template is used.
+                - If a string, the template with the matching name is used.
+        Returns:
+            Optional[str]: The selected chat template, or None if no template is applied.
+        """
+        if self.tokenizer is None:
+            return ""
+        if chat_template is False or chat_template is None:
+            eval_logger.warning(
+                "model.chat_template was called with the chat_template set to False or None. "
+                "Therefore no chat template will be applied. Make sure this is an intended behavior."
+            )
+            return None
+        # Convert boolean chat_template to None to ensure compatibility with the adapted logic
+        if isinstance(chat_template, bool):
+            chat_template = None
+        using_default_template = False
+        # First, handle the cases when the model has a dict of multiple templates
+        try:
+            template = (
+                self.tokenizer.chat_template or self.tokenizer.default_chat_template
+            )
+        except AttributeError:
+            return None
+        if isinstance(template, dict):
+            using_default_dict = self.tokenizer.chat_template is None
+            if chat_template is not None:
+                if chat_template in template:
+                    selected_template = template[chat_template]
+                    if using_default_dict:
+                        using_default_template = True
+                else:
+                    raise ValueError(
+                        f"The specified chat template '{chat_template}' is not available. "
+                        f"Available template names are {sorted(template.keys())}."
+                    )
+            else:
+                # If user didn't pass a chat template, use the default template from the dict
+                if "default" in template:
+                    selected_template = template["default"]
+                    using_default_template = True
+                else:
+                    raise ValueError(
+                        "This model has multiple chat templates with no default specified! Please either pass a chat "
+                        "template or the name of the template you wish to use to the `chat_template` argument. Available "
+                        f"template names are {sorted(template.keys())}."
+                    )
+        # Cases when the model has a single template or no template
+        else:
+            # priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template
+            if isinstance(chat_template, str):
+                eval_logger.warning(
+                    "Chat template name provided, but the tokenizer's chat template is not a dictionary. "
+                    "Using the tokenizer's chat template or the default template instead."
+                )
+            if self.tokenizer.chat_template is not None:
+                selected_template = self.tokenizer.chat_template
+            else:
+                selected_template = self.tokenizer.default_chat_template
+                using_default_template = True
+        if using_default_template:
+            eval_logger.warning(
+                "No chat template is set for this tokenizer, falling back to a default class-level template. This is "
+                "very error-prone, because models are often trained with templates different from the class default! "
+                "Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
+                "point any code depending on them will stop working. We recommend setting a valid chat template before "
+                "then to ensure that this model continues working without issues."
+            )
+        return selected_template
--- a/lm_eval/api/registry.py
+++ b/lm_eval/api/registry.py
 import logging
-from typing import Callable, Dict
+from typing import Callable, Dict, Union
 import evaluate as hf_evaluate
@@ -185,8 +185,12 @@ def register_filter(name):
    return decorate
-def get_filter(filter_name: str) -> type:
+def get_filter(filter_name: Union[str, Callable]) -> Callable:
    try:
        return FILTER_REGISTRY[filter_name]
-    except KeyError:
+    except KeyError as e:
-        eval_logger.warning(f"filter `{filter_name}` is not registered!")
+        if callable(filter_name):
+            return filter_name
+        else:
+            eval_logger.warning(f"filter `{filter_name}` is not registered!")
+            raise e
--- a/lm_eval/api/samplers.py
+++ b/lm_eval/api/samplers.py
+from functools import partial
 import datasets
@@ -15,9 +17,38 @@ class ContextSampler:
        self.target_delimiter = self.config.target_delimiter
        self.fewshot_delimiter = self.config.fewshot_delimiter
-        self.doc_to_text = self.task.doc_to_text
+        if (
-        self.doc_to_target = self.task.doc_to_target
+            self.config.fewshot_config is not None
-        self.doc_to_choice = self.task.doc_to_choice
+            and self.config.fewshot_config.get("doc_to_text", None) is not None
+        ):
+            self.doc_to_text = partial(
+                self.task.doc_to_text,
+                doc_to_text=self.config.fewshot_config.get("doc_to_text", None),
+            )
+        else:
+            self.doc_to_text = self.task.doc_to_text
+        if (
+            self.config.fewshot_config is not None
+            and self.config.fewshot_config.get("doc_to_target", None) is not None
+        ):
+            self.doc_to_target = partial(
+                self.task.doc_to_target,
+                doc_to_target=self.config.fewshot_config.get("doc_to_target", None),
+            )
+        else:
+            self.doc_to_target = self.task.doc_to_target
+        if (
+            self.config.fewshot_config is not None
+            and self.config.fewshot_config.get("doc_to_choice", None) is not None
+        ):
+            self.doc_to_choice = partial(
+                self.task.doc_to_choice,
+                doc_to_choice=self.config.fewshot_config.get("doc_to_choice", None),
+            )
+        else:
+            self.doc_to_choice = self.task.doc_to_choice
        self.docs = docs  # HF dataset split, provided by task._fewshot_docs()
        if fewshot_indices:  # subset few-shot docs from
@@ -51,15 +82,17 @@ class ContextSampler:
                if self.config.doc_to_choice is None or isinstance(doc_content, str)
                else self.doc_to_choice(doc)[doc_content]
            )
-            labeled_examples += self.target_delimiter
-            labeled_examples += (
+            if doc_target != "":
-                str(doc_target[0])
+                labeled_examples += self.target_delimiter
-                if isinstance(doc_target, list)
+                labeled_examples += (
-                else doc_target
+                    str(doc_target[0])
-                if self.config.doc_to_choice is None or isinstance(doc_target, str)
+                    if isinstance(doc_target, list)
-                else str(self.doc_to_choice(doc)[doc_target])
+                    else doc_target
-            )
+                    if self.config.doc_to_choice is None or isinstance(doc_target, str)
-            labeled_examples += self.fewshot_delimiter
+                    else str(self.doc_to_choice(doc)[doc_target])
+                )
+                labeled_examples += self.fewshot_delimiter
        return labeled_examples

--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -56,8 +56,7 @@ class TaskConfig(dict):
    # task naming/registry
    task: Optional[str] = None
    task_alias: Optional[str] = None
-    group: Optional[Union[str, list]] = None
+    tag: Optional[Union[str, list]] = None
-    group_alias: Optional[Union[str, list]] = None
    # HF dataset options.
    # which dataset to use,
    # and what splits for what purpose
@@ -68,13 +67,14 @@ class TaskConfig(dict):
    validation_split: Optional[str] = None
    test_split: Optional[str] = None
    fewshot_split: Optional[str] = (
-        None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
+        None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaluating (?)
    )
    # formatting / prompting options.
    # see docs/advanced_task_guide.md for more info
    process_docs: Optional[Callable] = None
    doc_to_text: Optional[Union[Callable, str]] = None
    doc_to_target: Optional[Union[Callable, str]] = None
+    doc_to_image: Union[Callable, str] = None
    doc_to_choice: Optional[Union[Callable, str, dict, list]] = None
    process_results: Optional[Union[Callable, str]] = None
    use_prompt: Optional[str] = None
@@ -365,18 +365,23 @@ class Task(abc.ABC):
    def doc_to_target(self, doc):
        pass
+    # not an abstractmethod because not every language-only task has to implement this
+    def doc_to_image(self, doc):
+        raise NotImplementedError
    def build_all_requests(
        self,
        *,
-        limit=None,
+        limit: Union[int, None] = None,
-        rank=None,
+        rank: int = 0,
-        world_size=None,
+        world_size: int = 1,
-        cache_requests=False,
+        cache_requests: bool = False,
-        rewrite_requests_cache=False,
+        rewrite_requests_cache: bool = False,
-        system_instruction=None,
+        system_instruction: Optional[str] = None,
-        apply_chat_template=False,
+        apply_chat_template: bool = False,
-        fewshot_as_multiturn=False,
+        fewshot_as_multiturn: bool = False,
-        lm=None,
+        chat_template: Optional[Callable] = None,
+        tokenizer_name: str = "",
    ) -> None:
        """Build a set of Instances for a task, and store them in task.instances"""
@@ -391,9 +396,9 @@ class Task(abc.ABC):
            if system_instruction is not None
            else ""
        )
-        cache_key += f"-tokenizer{lm.tokenizer_name}" if apply_chat_template else ""
+        cache_key += f"-tokenizer{tokenizer_name}"
-        cached_instances = load_from_cache(file_name=cache_key)
+        cached_instances = load_from_cache(file_name=cache_key, cache=cache_requests)
        if cache_requests and cached_instances and not rewrite_requests_cache:
            cached_instances = cached_instances[:limit]
@@ -436,7 +441,7 @@ class Task(abc.ABC):
                system_instruction,
                apply_chat_template,
                fewshot_as_multiturn,
-                lm,
+                chat_template,
            )
            # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
@@ -444,6 +449,7 @@ class Task(abc.ABC):
                doc=doc,
                ctx=fewshot_ctx,
                metadata=(self.config["task"], doc_id, self.config.repeats),
+                apply_chat_template=apply_chat_template,
            )
            if not isinstance(inst, list):
@@ -722,6 +728,10 @@ class ConfigurableTask(Task):
                )
            self.OUTPUT_TYPE = self.config.output_type
+        if self.config.doc_to_image is not None:
+            # mark the task as requiring multimodality.
+            self.MULTIMODAL = True
        if self.config.dataset_path is not None:
            self.DATASET_PATH = self.config.dataset_path
@@ -979,7 +989,7 @@ class ConfigurableTask(Task):
        else:
            if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0):
                eval_logger.warning(
-                    f"Task '{self.config.task}': "
+                    f"[Task: {self.config.task}] "
                    "num_fewshot > 0 but fewshot_split is None. "
                    "using preconfigured rule."
                )
@@ -1014,7 +1024,7 @@ class ConfigurableTask(Task):
        system_instruction: Optional[str] = None,
        apply_chat_template: bool = False,
        fewshot_as_multiturn: bool = False,
-        lm=None,
+        chat_template: Optional[Callable] = None,
    ) -> str:
        """Returns a fewshot context string that is made up of a prepended description
        (if provided), the `num_fewshot` number of examples, and an appended prompt example.
@@ -1029,8 +1039,8 @@ class ConfigurableTask(Task):
            Whether to apply the chat template to the fewshot context.
        :param fewshot_as_multiturn: bool
            Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
-        :param lm:
+        :param chat_template:
-            Language model with definition of the tokenizer/function to use for applying the chat template.
+            callable (from lm.apply_chat_template) that takes in a list[Dict] chat transcript and renders it into a string.
        :returns: str
            The fewshot context.
        """
@@ -1077,7 +1087,7 @@ class ConfigurableTask(Task):
        example = self.doc_to_text(doc)
        if apply_chat_template:
            if self.multiple_input:
-                return lm.apply_chat_template(labeled_examples)
+                return chat_template(labeled_examples)
            if isinstance(example, str):
                self.append_target_question(
                    labeled_examples, example, fewshot_as_multiturn
@@ -1089,7 +1099,7 @@ class ConfigurableTask(Task):
                for ex in example:
                    chat = deepcopy(labeled_examples)
                    self.append_target_question(chat, ex, fewshot_as_multiturn)
-                    labeled_examples_list.append(lm.apply_chat_template(chat))
+                    labeled_examples_list.append(chat_template(chat))
                return labeled_examples_list
            # if example is an integer, append the choice or convert to string
            elif isinstance(example, int):
@@ -1103,7 +1113,7 @@ class ConfigurableTask(Task):
                        labeled_examples, str(example), fewshot_as_multiturn
                    )
                # return lm.apply_chat_template(labeled_examples)
-            return lm.apply_chat_template(labeled_examples)
+            return chat_template(labeled_examples)
        else:
            if self.multiple_input:
                return labeled_examples
@@ -1158,9 +1168,11 @@ class ConfigurableTask(Task):
        """
        return doc
-    def doc_to_text(self, doc):
+    def doc_to_text(self, doc, doc_to_text=None):
        if self.prompt is not None:
            doc_to_text = self.prompt
+        elif doc_to_text is not None:
+            doc_to_text = doc_to_text
        else:
            doc_to_text = self.config.doc_to_text
@@ -1192,9 +1204,11 @@ class ConfigurableTask(Task):
            print(type(doc_to_text))
            raise TypeError
-    def doc_to_target(self, doc: Mapping) -> Union[int, str, list]:
+    def doc_to_target(self, doc: Mapping, doc_to_target=None) -> Union[int, str, list]:
        if self.prompt is not None:
            doc_to_target = self.prompt
+        elif doc_to_target is not None:
+            doc_to_target = doc_to_target
        else:
            doc_to_target = self.config.doc_to_target
@@ -1236,9 +1250,11 @@ class ConfigurableTask(Task):
        else:
            raise TypeError
-    def doc_to_choice(self, doc: Any) -> List[str]:
+    def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]:
        if self.prompt is not None:
            doc_to_choice = self.prompt
+        elif doc_to_choice is not None:
+            doc_to_choice = doc_to_choice
        elif self.config.doc_to_choice is None:
            eval_logger.error("doc_to_choice was called but not set in config")
        else:
@@ -1260,9 +1276,36 @@ class ConfigurableTask(Task):
        else:
            raise TypeError
+    def doc_to_image(self, doc: Any, doc_to_image=None) -> Union[int, str, list]:
+        if doc_to_image is not None:
+            doc_to_image = doc_to_image
+        elif self.config.doc_to_image is not None:
+            doc_to_image = self.config.doc_to_image
+        else:
+            return None
+        if isinstance(doc_to_image, list):
+            image_feature = [
+                self.doc_to_image(doc, feature) for feature in doc_to_image
+            ]
+            return [feature for feature in image_feature if feature is not None]
+        elif isinstance(doc_to_image, str):
+            if doc_to_image in self.features:
+                return doc[doc_to_image]
+            else:
+                return ast.literal_eval(utils.apply_template(doc_to_image, doc))
+        elif callable(doc_to_image):
+            return doc_to_image(doc)
+        else:
+            return None
    def construct_requests(
        self, doc: dict, ctx: str, **kwargs
    ) -> Union[List[Instance], Instance]:
+        apply_chat_template = kwargs.pop("apply_chat_template", False)
+        aux_arguments = None
        if self.OUTPUT_TYPE == "loglikelihood":
            arguments = (ctx, self.doc_to_target(doc))
        elif self.OUTPUT_TYPE == "loglikelihood_rolling":
@@ -1270,6 +1313,8 @@ class ConfigurableTask(Task):
        elif self.OUTPUT_TYPE == "multiple_choice":
            choices = self.doc_to_choice(doc)
            target_delimiter = self.config.target_delimiter
+            if apply_chat_template:
+                target_delimiter = ""
            if self.multiple_input:
                # If there are multiple inputs, choices are placed in the ctx
                cont = self.doc_to_target(doc)
@@ -1280,6 +1325,37 @@ class ConfigurableTask(Task):
                # Otherwise they are placed in the continuation
                arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]
+            # TODO: we should raise a warning telling users this will at most ~2x runtime.
+            if "acc_mutual_info" in self._metric_fn_list.keys():
+                # if we are calculating multiple choice accuracy
+                # using mutual information instead of raw loglikelihood as metric, need unconditional lls.
+                # here mutual info refers to calculating
+                # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
+                # in other words normalizing by subtracting the unconditional logprob of each choice.
+                aux_arguments = [("", f"{choice}") for choice in choices]
+                arguments.extend(aux_arguments)
+        elif self.OUTPUT_TYPE == "generate_until":
+            arguments = (ctx, deepcopy(self.config.generation_kwargs))
+        multimodal_arg = {}
+        if (
+            self.config.doc_to_image
+        ):  # TODO: ensure that non-multimodal tasks aren't getting visual args
+            multimodal_arg = {
+                **multimodal_arg,
+                **{"visual": self.doc_to_image(doc)},
+            }
+        if bool(multimodal_arg):
+            if isinstance(arguments, list):
+                arguments = [arg + (multimodal_arg,) for arg in arguments]
+            else:
+                arguments = arguments + (multimodal_arg,)
+        if self.OUTPUT_TYPE == "multiple_choice":
            request_list = [
                Instance(
                    request_type="loglikelihood",
@@ -1290,33 +1366,15 @@ class ConfigurableTask(Task):
                )
                for i, arg in enumerate(arguments)
            ]
-            # TODO: we should raise a warning telling users this will at most ~2x runtime.
-            if "acc_mutual_info" in self._metric_fn_list.keys():
-                # if we are calculating multiple choice accuracy
-                # using mutual information instead of raw loglikelihood as metric, need unconditional lls.
-                # here mutual info refers to calculating
-                # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
-                # in other words normalizing by subtracting the unconditional logprob of each choice.
-                request_list.extend(
-                    [
-                        Instance(
-                            request_type="loglikelihood",
-                            doc=doc,
-                            arguments=("", "{}".format(choice)),
-                            idx=i,
-                            **kwargs,
-                        )
-                        for i, choice in enumerate(choices)
-                    ]
-                )
            return request_list
-        elif self.OUTPUT_TYPE == "generate_until":
-            arguments = (ctx, deepcopy(self.config.generation_kwargs))
        return Instance(
-            request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs
+            request_type=self.OUTPUT_TYPE,
+            doc=doc,
+            arguments=arguments,
+            idx=0,
+            **kwargs,
        )
    def process_results(self, doc, results):
@@ -1445,7 +1503,10 @@ class ConfigurableTask(Task):
            # we expect multiple_targets to be a list.
            elif self.multiple_target:
                gold = list(gold)
-            elif type(gold) != type(result) and not isinstance(result, List):
+            elif (
+                type(gold) is not type(result)
+                and "bypass" not in self._metric_fn_list.keys()
+            ):
                # cast gold to the same type as result
                gold = type(result)(gold)
@@ -1519,10 +1580,13 @@ class ConfigurableTask(Task):
    def get_config(self, key: str) -> Any:
        return getattr(self._config, key, None)
+    @property
+    def task_name(self) -> Any:
+        return getattr(self.config, "task", None)
    def __repr__(self):
        return (
            f"ConfigurableTask(task_name={getattr(self.config, 'task', None)},"
-            f"group_name={getattr(self.config, 'group', None)},"
            f"output_type={self.OUTPUT_TYPE},"
            f"num_fewshot={getattr(self.config, 'num_fewshot', None)},"
            f"num_samples={len(self.eval_docs)})"

--- a/lm_eval/caching/__init__.py
+++ b/lm_eval/caching/__init__.py
--- a/lm_eval/caching/cache.py
+++ b/lm_eval/caching/cache.py
@@ -21,7 +21,9 @@ HASH_PREFIX = hashlib.sha256(HASH_INPUT.encode("utf-8")).hexdigest()
 FILE_SUFFIX = f".{HASH_PREFIX}.pickle"
-def load_from_cache(file_name):
+def load_from_cache(file_name: str, cache: bool = False):
+    if not cache:
+        return
    try:
        path = f"{PATH}/{file_name}{FILE_SUFFIX}"

--- a/lm_eval/decontamination/archiver.py
+++ b/lm_eval/decontamination/archiver.py
@@ -110,12 +110,15 @@ class TextReader:
    def read_tqdm(self, update_frequency: int = 10000):
        current_file_position = 0
        line_counter = 0
-        with open(self.file_path, "r", encoding="utf-8") as fh, tqdm.tqdm(
+        with (
-            total=os.path.getsize(self.file_path),
+            open(self.file_path, "r", encoding="utf-8") as fh,
-            dynamic_ncols=True,
+            tqdm.tqdm(
-            unit="byte",
+                total=os.path.getsize(self.file_path),
-            unit_scale=1,
+                dynamic_ncols=True,
-        ) as progress:
+                unit="byte",
+                unit_scale=1,
+            ) as progress,
+        ):
            with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                for line in iter(mmap_obj.readline, b""):
                    line = line.decode("utf-8")

--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
--- a/lm_eval/evaluator_utils.py
+++ b/lm_eval/evaluator_utils.py
@@ -2,9 +2,15 @@ import collections
 import math
 import pathlib
 import sys
-from typing import Dict, List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union
-from lm_eval.api import metrics
+from lm_eval.api.group import ConfigurableGroup
+from lm_eval.api.metrics import (
+    aggregate_subtask_metrics,
+    pooled_sample_stderr,
+    stderr_for_metric,
+)
+from lm_eval.api.task import Task
 from lm_eval.utils import eval_logger, positional_deprecated
@@ -98,7 +104,7 @@ class TaskOutput:
            self.agg_metrics[metric_key] = agg_fn(items)
            self.sample_len = len(items)  # TODO: same sample size for each metric?
            if isinstance(bootstrap_iters, int):
-                stderr_fn = metrics.stderr_for_metric(
+                stderr_fn = stderr_for_metric(
                    metric=agg_fn,
                    bootstrap_iters=min(bootstrap_iters, 100)
                    if metric in ["bleu", "chrf", "ter"]
@@ -116,23 +122,71 @@ class TaskOutput:
        return (
            f"TaskOutput(task_name={self.task_name}, "
            f"group_name={self.group_name}, "
-            f"version={self.version},"
+            f"version={self.version}, "
-            f"n_shot={self.n_shot}"
+            f"n_shot={self.n_shot}, "
-            f"task_alias={self.task_alias}, group_alias={self.group_alias})"
+            f"task_alias={self.task_alias}, "
+            f"group_alias={self.group_alias})"
        )
-def get_task_list(task_dict: dict) -> Tuple[Dict[str, list], List[TaskOutput]]:
+def get_task_list(task_dict: dict) -> List[TaskOutput]:
-    task_hierarchy = collections.defaultdict(list)
+    outputs = []
-    outputs = list(TaskOutput.from_taskdict(x, y) for x, y in task_dict.items())
+    for task_name, task_obj in task_dict.items():
-    for task_output in outputs:
+        if isinstance(task_obj, dict):
-        if group_name := task_output.group_name:
+            _outputs = get_task_list(task_obj)
-            task_hierarchy[group_name].append(task_output.task_name)
+            outputs.extend(_outputs)
        else:
-            task_hierarchy[task_output.task_name] = []
+            task_output = TaskOutput.from_taskdict(task_name, task_obj)
-    # returns task_hierarchy tracking which groups contain which subtasks,
+            outputs.append(task_output)
-    # and a list of TaskOutput classes for each non-group subtask
-    return task_hierarchy, [x for x in outputs if x.task]
+    return outputs
+def get_subtask_list(task_dict, task_root=None, depth=0):
+    """
+    (Recursively) maps each group or top-level task in `task_dict` to the names listed directly beneath it.
+
+    @param task_dict: Dictionary whose keys are ConfigurableGroup objects or plain names and whose
+    values are either nested dictionaries of the same form or leaf objects (Task or ConfigurableGroup).
+    @param task_root: Name of the group whose contents are being walked; None in the top-level invocation.
+    @param depth: Nesting level of `task_dict` within the hierarchy; only the depth-0 call flattens its keys.
+    @return: At depth 0, a dictionary {name: [names of direct children]}, where a top-level leaf maps to
+    an empty list. In recursive calls the keys are (name, depth) tuples so that a parent can pick out
+    the entries recorded exactly one level below it.
+    """
+    subtask_list = {}
+    for group_obj, task_obj in task_dict.items():
+        if isinstance(group_obj, ConfigurableGroup):
+            group_name = group_obj.group_name
+        else:
+            group_name = group_obj
+
+        if isinstance(task_obj, dict):
+            _subtask_list = get_subtask_list(
+                task_obj, task_root=group_name, depth=depth + 1
+            )
+            if task_root:
+                # direct children are the keys recorded exactly one level deeper
+                subtask_list.setdefault((task_root, depth), []).extend(
+                    _task
+                    for (_task, _depth) in _subtask_list
+                    if (_depth - 1) == depth
+                )
+            subtask_list = {**subtask_list, **_subtask_list}
+            continue
+
+        if isinstance(task_obj, ConfigurableGroup):
+            group_or_task_name = task_obj.group_name
+        elif isinstance(task_obj, Task):
+            group_or_task_name = task_obj.task_name
+        else:
+            # Neither known leaf type: fall back to the key's name instead of
+            # hitting an unbound local or reusing the previous iteration's name.
+            group_or_task_name = group_name
+
+        if task_root is None:
+            subtask_list.setdefault((group_or_task_name, depth), [])
+        else:
+            subtask_list.setdefault((task_root, depth), []).append(
+                group_or_task_name
+            )
+
+    if depth == 0:
+        # drop the depth component of every key before handing the result to the caller
+        subtask_list = {
+            name: task_list for (name, _), task_list in subtask_list.items()
+        }
+    return subtask_list
 def print_writeout(task) -> None:
@@ -155,70 +209,95 @@ def get_sample_size(task, limit: Optional[int]) -> Union[int, None]:
 def prepare_print_tasks(
-    task_hierarchy: dict, results: dict, tab=0
+    task_dict: dict,
+    results: dict,
+    task_depth=0,
+    group_depth=0,
 ) -> Tuple[dict, dict]:
    """
-    @param task_hierarchy: Dictionary representing the group hierarchy of tasks. Each key is a group name and its
+    @param task_dict: Dictionary representing the group hierarchy of tasks. Each key is a group name and its
    value is a list of task names.
    @param results: Dictionary containing the results of each task. Each key is a
    group name and its value is a dictionary of task results.
-    @param tab: The indentation level for printing the task
+    @param task_depth: The indentation level for printing the task
+    hierarchy. Default is 0.
+    @param group_depth: The indentation level for printing the group
    hierarchy. Default is 0.
    @return: A tuple of two dictionaries: results_agg and groups_agg. results_agg contains
    aggregated results for each task, and groups_agg contains aggregated results for each group.
    Prepares the task hierarchy and aggregates the results for each task and group recursively for printing.
    """
-    results_agg = collections.defaultdict(dict)
-    groups_agg = collections.defaultdict(dict)
-    (group_name, task_list), *_ = task_hierarchy.items()
-    task_list = sorted(task_list)
-    results_agg[group_name] = results[group_name].copy()
-    # results_agg[group_name]["tab"] = tab
-    if "samples" in results_agg[group_name]:
-        results_agg[group_name].pop("samples")
-    tab_string = " " * tab + "- " if tab > 0 else ""
-    if "alias" in results_agg[group_name]:
+    def _sort_task_dict(task_dict):
-        results_agg[group_name]["alias"] = tab_string + results_agg[group_name]["alias"]
+        """
-    else:
+        Helper utility. Sorts the task dict at the current level of the hierarchy based on alphabetized task name.
-        results_agg[group_name]["alias"] = tab_string + group_name
+        Required so that we end up sorting within each sub-header correctly.
+        """
-    if len(task_list) > 0:
-        groups_agg[group_name] = results[group_name].copy()
+        return dict(
-        # groups_agg[group_name]["tab"] = tab
+            sorted(
-        if "samples" in groups_agg[group_name]:
+                task_dict.items(),
-            groups_agg[group_name].pop("samples")
+                key=lambda item: item[0].group_name
+                if isinstance(item[0], ConfigurableGroup)
-        if "alias" in groups_agg[group_name]:
+                else item[0],
-            groups_agg[group_name]["alias"] = (
-                tab_string + groups_agg[group_name]["alias"]
            )
-        else:
+        )
-            groups_agg[group_name]["alias"] = tab_string + group_name
-        for task_name in task_list:
+    task_agg = collections.defaultdict(dict)
-            if task_name in task_hierarchy:
+    group_agg = collections.defaultdict(dict)
-                _task_hierarchy = {
+    task_dict = _sort_task_dict(task_dict)
-                    **{task_name: task_hierarchy[task_name]},
+    for task_or_group_name, task_or_group_obj in task_dict.items():
-                    **task_hierarchy,
+        tab_string = " " * task_depth + "- " if task_depth > 0 else ""
-                }
+        if isinstance(task_or_group_name, ConfigurableGroup):
+            # string_name = task_or_group_name.group_name
+            name = task_or_group_name.group_name
+            from_configurable_group = True
+            task_or_group_obj = _sort_task_dict(task_or_group_obj)
+        elif isinstance(task_or_group_name, str):
+            name = task_or_group_name
+            if isinstance(task_or_group_obj, Task):
+                # string_name = task_or_group_obj.task_name
+                name = task_or_group_obj.task_name
+            from_configurable_group = False
+        task_agg[name] = results[name].copy()
+        if from_configurable_group:
+            if task_or_group_name.group_alias is not None:
+                alias = task_or_group_name.group_alias
            else:
-                _task_hierarchy = {
+                alias = task_or_group_name.group
-                    **{task_name: []},
+        else:
-                    **task_hierarchy,
+            if "alias" in task_agg[name]:
-                }
+                alias = task_agg[name]["alias"]
+            else:
-            _results_agg, _groups_agg = prepare_print_tasks(
+                alias = name
-                _task_hierarchy, results, tab + 1
+        task_agg[name]["alias"] = tab_string + alias
+        if "samples" in task_agg[name]:
+            task_agg[name].pop("samples")
+        if from_configurable_group and (" " not in results[name]):
+            group_tab_string = " " * group_depth + "- " if group_depth > 0 else ""
+            group_agg[name] = results[name].copy()
+            group_agg[name]["alias"] = group_tab_string + alias
+            if "samples" in group_agg[name]:
+                group_agg[name].pop("samples")
+        if isinstance(task_or_group_obj, dict):
+            task_depth += 1
+            group_depth += 1
+            _task_agg, _group_agg = prepare_print_tasks(
+                task_or_group_obj, results, task_depth, group_depth
            )
-            results_agg = {**results_agg, **_results_agg}
+            task_agg = {
-            groups_agg = {**groups_agg, **_groups_agg}
+                **task_agg,
+                **_task_agg,
-    return results_agg, groups_agg
+            }
+            group_agg = {**group_agg, **_group_agg}
+            task_depth -= 1
+            group_depth -= 1
+    return task_agg, group_agg
 def consolidate_results(
@@ -261,6 +340,8 @@ def consolidate_results(
    for task_output in eval_tasks:
        if "task_alias" in (task_config := task_output.task_config):
            results[task_output.task_name]["alias"] = task_config["task_alias"]
+        else:
+            results[task_output.task_name]["alias"] = task_output.task_name
        if group_alias := task_output.group_alias:
            if group_alias not in results and (group_name := task_output.group_name):
                results[group_name]["alias"] = group_alias
@@ -275,12 +356,153 @@ def consolidate_results(
                metric_key
            ]
            results[task_output.task_name]["samples"] = task_output.sample_len
-            results[task_output.task_name][
+            results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = (
-                f"{metric}_stderr,{filter_key}"
+                task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
-            ] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
+            )
    return results, samples, configs, versions, num_fewshot, higher_is_better
+def consolidate_group_results(
+    results,
+    versions,
+    task_dict,
+    task_root=None,
+    show_group_table=False,
+    task_aggregation_list=None,
+) -> Tuple[dict, dict, bool, dict]:
+    """
+    (Recursively) calculates groups' aggregated metrics and updates the results and versions dictionaries with this info.
+
+    @param results: Dictionary keyed by task name holding each task's "alias", "samples" and metric,filter_name entries.
+    Updated in place with one entry per group.
+    @param versions: Dictionary keyed by task name holding versions. Updated in place for groups whose config has "metadata".
+    @param task_dict: Dictionary whose keys are ConfigurableGroup objects or names and whose values are Task objects
+    or nested dictionaries of the same form.
+    @param task_root: Name of the group currently being descended into; None in the top-level invocation.
+    @param show_group_table: Value carried through the recursion; see the return description.
+    @param task_aggregation_list: Dictionary carried through the recursion; a fresh one is created when None.
+
+    @return: a tuple [results, versions, show_group_table, task_aggregation_list] with formats described below:
+
+    - results: A defaultdict with task names (and, after this function is called, group names of
+    groups that perform aggregation) as keys, and dictionaries with "alias" and metric,filter_name pairs as keys.
+    - versions: A defaultdict with task names (and, after this function is called, group names of
+    groups that perform aggregation) as keys, and float values representing the task or group's version if a version is specified. (defaulting to None).
+    - show_group_table: a boolean which is true if there exists a group that requires printing of its aggregated scores in a group table.
+    - task_aggregation_list: a dictionary listing the subtasks to average over to produce a given group's end metric.
+
+    The method then returns the updated results, versions, show_group_table, and task_aggregation_list as a tuple.
+    In the top-level invocation of this function, task_aggregation_list is ignored.
+
+    Raises ValueError when a metric config's "aggregation" is neither "mean" nor a callable.
+    """
+    if task_aggregation_list is None:
+        task_aggregation_list = {}
+
+    for group_or_task, group_or_task_info in task_dict.items():
+        # Convert to string
+        if isinstance(group_or_task, ConfigurableGroup):
+            group_config = group_or_task.config
+            group_or_task = group_or_task.group_name
+        else:
+            group_config = None
+
+        if isinstance(group_or_task_info, Task):
+            # leaf task: register it under the enclosing group, if any
+            if task_root:
+                task_aggregation_list.setdefault(task_root, []).append(
+                    group_or_task_info.task_name
+                )
+            continue
+
+        (
+            results,
+            versions,
+            show_group_table,
+            _task_aggregation_list,
+        ) = consolidate_group_results(
+            results,
+            versions,
+            group_or_task_info,
+            group_or_task,
+            show_group_table,
+            task_aggregation_list,
+        )
+        if task_root:
+            # propagate the subgroup's tasks up to the enclosing group
+            task_aggregation_list.setdefault(task_root, []).extend(
+                task_aggregation_list.get(group_or_task, [])
+            )
+
+        if (group_config is None) or (group_config["aggregate_metric_list"] is None):
+            # no aggregation requested: store a single " " placeholder entry for this group
+            results[group_or_task][" "] = " "
+            continue
+
+        agg_metric_list = group_config["aggregate_metric_list"]
+        show_group_table = show_group_table | bool(agg_metric_list)
+
+        task_list = _task_aggregation_list[group_or_task]
+
+        # sorted so that the order in which keys are written below does not depend on set iteration order
+        metric_list = sorted(
+            {
+                key
+                for task in task_list
+                for key in results[task].keys()
+                if "_stderr" not in key and key not in ["task", "alias", "samples"]
+            }
+        )
+        for metric in metric_list:
+            stderr = "_stderr,".join(metric.split(","))
+
+            # gather metrics, sizes, and stderrs from subtasks
+            metrics = [
+                results[task][metric]
+                for task in task_list
+                if metric in results[task]
+            ]  # TODO: copy?
+            stderrs = [
+                results[task][stderr]
+                for task in task_list
+                if stderr in results[task]
+            ]
+            sizes = [
+                results[task]["samples"]
+                for task in task_list
+                if metric in results[task]
+            ]
+
+            for metric_config in agg_metric_list:
+                for filter_name in metric_config["filter_list"]:
+                    if metric != ",".join([metric_config["metric"], filter_name]):
+                        continue
+
+                    # compute group's pooled metric and stderr
+                    if metric_config["aggregation"] == "mean":
+                        aggregate_fn = aggregate_subtask_metrics
+                    elif callable(metric_config["aggregation"]):
+                        aggregate_fn = metric_config["aggregation"]
+                    else:
+                        raise ValueError(
+                            f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{metric_config['aggregation']}' for group '{group_or_task}'"
+                        )
+
+                    results[group_or_task][metric] = aggregate_fn(
+                        metrics,
+                        sizes,
+                        metric_config["weight_by_size"],
+                    )
+                    # TODO: calculate groups' metrics using arbitrary agg fns
+                    if "N/A" in stderrs:
+                        results[group_or_task][stderr] = "N/A"
+                    else:
+                        # NOTE: this assumes we are using the mean to aggregate. There are warnings about this elsewhere
+                        results[group_or_task][stderr] = pooled_sample_stderr(
+                            stderrs, sizes
+                        )
+
+            # Written once per metric, so "samples" ends up as the total for the last metric processed
+            # and a group with no metrics gets neither "samples" nor a version.
+            results[group_or_task]["samples"] = sum(sizes)
+            group_metadata = group_config.get("metadata", None)
+            if group_metadata is not None:
+                versions[group_or_task] = group_metadata.get("version", None)
+    return results, versions, show_group_table, task_aggregation_list
 @positional_deprecated
 def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
    """

--- a/lm_eval/filters/extraction.py
+++ b/lm_eval/filters/extraction.py
--- a/lm_eval/loggers/evaluation_tracker.py
+++ b/lm_eval/loggers/evaluation_tracker.py
--- a/lm_eval/loggers/utils.py
+++ b/lm_eval/loggers/utils.py
--- a/lm_eval/loggers/wandb_logger.py
+++ b/lm_eval/loggers/wandb_logger.py
--- a/lm_eval/models/__init__.py
+++ b/lm_eval/models/__init__.py
 from . import (
    anthropic_llms,
+    api_models,
    dummy,
    gguf,
+    hf_vlms,
    huggingface,
+    ibm_watsonx_ai,
    mamba_lm,
    nemo_lm,
    neuralmagic,
    neuron_optimum,
    openai_completions,
+    optimum_ipex,
    optimum_lm,
    textsynth,
    vllm_causallms,
+    vllm_vlms,
 )

--- a/lm_eval/models/anthropic_llms.py
+++ b/lm_eval/models/anthropic_llms.py
--- a/lm_eval/models/api_models.py
+++ b/lm_eval/models/api_models.py
--- a/lm_eval/models/dummy.py
+++ b/lm_eval/models/dummy.py
@@ -26,9 +26,9 @@ class DummyLM(LM):
    def generate_until(self, requests, disable_tqdm: bool = False):
        res = []
-        for ctx, _ in tqdm(requests, disable=disable_tqdm):
+        for request in tqdm(requests, disable=disable_tqdm):
            res.append("lol")
-            assert ctx.strip() != ""
+            assert request.arguments[0].strip() != ""
        return res

--- a/lm_eval/models/gguf.py
+++ b/lm_eval/models/gguf.py
--- a/lm_eval/models/hf_vlms.py
+++ b/lm_eval/models/hf_vlms.py
--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py