Commit 948f120f authored by Baber

Merge branch 'main' into autobatchtest

# Conflicts:
#	lm_eval/models/huggingface.py
parents a5b1c7a8 bd80a6c0
@@ -56,8 +56,7 @@ class TaskConfig(dict):
     # task naming/registry
     task: Optional[str] = None
     task_alias: Optional[str] = None
-    group: Optional[Union[str, list]] = None
-    group_alias: Optional[Union[str, list]] = None
+    tag: Optional[Union[str, list]] = None
     # HF dataset options.
     # which dataset to use,
     # and what splits for what purpose
@@ -68,13 +67,14 @@ class TaskConfig(dict):
     validation_split: Optional[str] = None
     test_split: Optional[str] = None
     fewshot_split: Optional[str] = (
-        None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
+        None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaluating (?)
     )
     # formatting / prompting options.
     # see docs/advanced_task_guide.md for more info
     process_docs: Optional[Callable] = None
     doc_to_text: Optional[Union[Callable, str]] = None
     doc_to_target: Optional[Union[Callable, str]] = None
+    doc_to_image: Union[Callable, str] = None
     doc_to_choice: Optional[Union[Callable, str, dict, list]] = None
     process_results: Optional[Union[Callable, str]] = None
     use_prompt: Optional[str] = None
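A minimal sketch (not part of this commit) of how a multimodal task might populate the new doc_to_image field alongside the existing text fields; the task name, dataset, and column names below are hypothetical, and in practice these keys usually live in a task YAML rather than a Python dict.

# Hypothetical task configuration; keys mirror the TaskConfig fields above.
example_vqa_config = {
    "task": "my_vqa_task",               # hypothetical task name
    "dataset_path": "some-org/vqa-data", # hypothetical HF dataset
    "test_split": "validation",
    "doc_to_image": "image",             # column assumed to hold the image(s), or a callable
    "doc_to_text": "{{question}}",
    "doc_to_target": "answer",
    "output_type": "generate_until",
}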
@@ -365,6 +365,10 @@ class Task(abc.ABC):
     def doc_to_target(self, doc):
         pass

+    # not an abstractmethod because not every language-only task has to implement this
+    def doc_to_image(self, doc):
+        raise NotImplementedError
+
     def build_all_requests(
         self,
         *,
@@ -723,6 +727,10 @@ class ConfigurableTask(Task):
         )
         self.OUTPUT_TYPE = self.config.output_type

+        if self.config.doc_to_image is not None:
+            # mark the task as requiring multimodality.
+            self.MULTIMODAL = True
+
         if self.config.dataset_path is not None:
             self.DATASET_PATH = self.config.dataset_path
@@ -980,7 +988,7 @@ class ConfigurableTask(Task):
        else:
            if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0):
                eval_logger.warning(
-                    f"Task '{self.config.task}': "
+                    f"[Task: {self.config.task}] "
                    "num_fewshot > 0 but fewshot_split is None. "
                    "using preconfigured rule."
                )
@@ -1030,8 +1038,8 @@ class ConfigurableTask(Task):
            Whether to apply the chat template to the fewshot context.
        :param fewshot_as_multiturn: bool
            Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
-        :param chat_template: Callable
-            Chat template to be applied to the fewshot context.
+        :param chat_template:
+            callable (from lm.apply_chat_template) that takes in a list[Dict] chat transcript and renders it into a string.
        :returns: str
            The fewshot context.
        """
@@ -1159,9 +1167,11 @@ class ConfigurableTask(Task):
        """
        return doc

-    def doc_to_text(self, doc):
+    def doc_to_text(self, doc, doc_to_text=None):
        if self.prompt is not None:
            doc_to_text = self.prompt
+        elif doc_to_text is not None:
+            doc_to_text = doc_to_text
        else:
            doc_to_text = self.config.doc_to_text
...@@ -1193,9 +1203,11 @@ class ConfigurableTask(Task): ...@@ -1193,9 +1203,11 @@ class ConfigurableTask(Task):
print(type(doc_to_text)) print(type(doc_to_text))
raise TypeError raise TypeError
def doc_to_target(self, doc: Mapping) -> Union[int, str, list]: def doc_to_target(self, doc: Mapping, doc_to_target=None) -> Union[int, str, list]:
if self.prompt is not None: if self.prompt is not None:
doc_to_target = self.prompt doc_to_target = self.prompt
elif doc_to_target is not None:
doc_to_target = doc_to_target
else: else:
doc_to_target = self.config.doc_to_target doc_to_target = self.config.doc_to_target
@@ -1237,9 +1249,11 @@ class ConfigurableTask(Task):
        else:
            raise TypeError

-    def doc_to_choice(self, doc: Any) -> List[str]:
+    def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]:
        if self.prompt is not None:
            doc_to_choice = self.prompt
+        elif doc_to_choice is not None:
+            doc_to_choice = doc_to_choice
        elif self.config.doc_to_choice is None:
            eval_logger.error("doc_to_choice was called but not set in config")
        else:
@@ -1261,9 +1275,34 @@ class ConfigurableTask(Task):
        else:
            raise TypeError

+    def doc_to_image(self, doc: Any, doc_to_image=None) -> Union[int, str, list]:
+        if doc_to_image is not None:
+            doc_to_image = doc_to_image
+        elif self.config.doc_to_image is not None:
+            doc_to_image = self.config.doc_to_image
+        else:
+            return None
+
+        if isinstance(doc_to_image, list):
+            image_feature = [
+                self.doc_to_image(doc, feature) for feature in doc_to_image
+            ]
+            return [feature for feature in image_feature if feature is not None]
+        elif isinstance(doc_to_image, str):
+            if doc_to_image in self.features:
+                return doc[doc_to_image]
+            else:
+                return ast.literal_eval(utils.apply_template(doc_to_image, doc))
+        elif callable(doc_to_image):
+            return doc_to_image(doc)
+        else:
+            return None
+
    def construct_requests(
        self, doc: dict, ctx: str, **kwargs
    ) -> Union[List[Instance], Instance]:
+        aux_arguments = None
+
        if self.OUTPUT_TYPE == "loglikelihood":
            arguments = (ctx, self.doc_to_target(doc))
        elif self.OUTPUT_TYPE == "loglikelihood_rolling":
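A small self-contained sketch (not from the diff) of the resolution order the new doc_to_image helper implements for the common string-column case: an explicit override wins, then the task config, otherwise None. The document and column names here are hypothetical, and templated-string specs are left out for brevity.

# Hypothetical document as it might come from an HF dataset row.
doc = {"question": "What is shown?", "image": "<PIL.Image object>"}

def resolve_image(doc, override=None, config_value="image", features=("question", "image")):
    # mirrors the precedence in ConfigurableTask.doc_to_image: override > config > None
    spec = override if override is not None else config_value
    if spec is None:
        return None
    if callable(spec):        # a callable extracts the image(s) itself
        return spec(doc)
    if spec in features:      # a plain column name indexes the document
        return doc[spec]
    return None               # (templated strings are handled separately in the real code)

print(resolve_image(doc))     # -> "<PIL.Image object>"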
@@ -1281,6 +1320,37 @@ class ConfigurableTask(Task):
                # Otherwise they are placed in the continuation
                arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]

+            # TODO: we should raise a warning telling users this will at most ~2x runtime.
+            if "acc_mutual_info" in self._metric_fn_list.keys():
+                # if we are calculating multiple choice accuracy
+                # using mutual information instead of raw loglikelihood as metric, need unconditional lls.
+
+                # here mutual info refers to calculating
+                # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
+                # in other words normalizing by subtracting the unconditional logprob of each choice.
+                aux_arguments = [("", f"{choice}") for choice in choices]
+
+                arguments.extend(aux_arguments)
+
+        elif self.OUTPUT_TYPE == "generate_until":
+            arguments = (ctx, deepcopy(self.config.generation_kwargs))
+
+        multimodal_arg = {}
+        if (
+            self.config.doc_to_image
+        ):  # TODO: ensure that non-multimodal tasks aren't getting visual args
+            multimodal_arg = {
+                **multimodal_arg,
+                **{"visual": self.doc_to_image(doc)},
+            }
+
+        if bool(multimodal_arg):
+            if isinstance(arguments, list):
+                arguments = [arg + (multimodal_arg,) for arg in arguments]
+            else:
+                arguments = arguments + (multimodal_arg,)
+
+        if self.OUTPUT_TYPE == "multiple_choice":
            request_list = [
                Instance(
                    request_type="loglikelihood",
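For reference, a tiny self-contained sketch (not from the commit) of the acc_mutual_info normalization the comment above describes: each choice's unconditional log-probability (from the ("", choice) requests) is subtracted from its conditional log-probability before taking the argmax. The numbers are invented.

import numpy as np

# log P(choice | ctx) from the (ctx, choice) requests (made-up values)
lls_conditional = np.array([-4.2, -3.1, -5.0, -3.3])
# log P(choice) from the ("", choice) requests that acc_mutual_info adds
lls_unconditional = np.array([-2.0, -1.0, -4.5, -3.0])

# raw loglikelihood accuracy would pick choice index 1 here
acc_pick = int(np.argmax(lls_conditional))
# mutual-information accuracy normalizes by the unconditional logprob and picks index 3
acc_mutual_info_pick = int(np.argmax(lls_conditional - lls_unconditional))
print(acc_pick, acc_mutual_info_pick)  # 1 3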
@@ -1291,33 +1361,15 @@ class ConfigurableTask(Task):
                )
                for i, arg in enumerate(arguments)
            ]
-            # TODO: we should raise a warning telling users this will at most ~2x runtime.
-            if "acc_mutual_info" in self._metric_fn_list.keys():
-                # if we are calculating multiple choice accuracy
-                # using mutual information instead of raw loglikelihood as metric, need unconditional lls.
-
-                # here mutual info refers to calculating
-                # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
-                # in other words normalizing by subtracting the unconditional logprob of each choice.
-                request_list.extend(
-                    [
-                        Instance(
-                            request_type="loglikelihood",
-                            doc=doc,
-                            arguments=("", "{}".format(choice)),
-                            idx=i,
-                            **kwargs,
-                        )
-                        for i, choice in enumerate(choices)
-                    ]
-                )
            return request_list

-        elif self.OUTPUT_TYPE == "generate_until":
-            arguments = (ctx, deepcopy(self.config.generation_kwargs))
-
        return Instance(
-            request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs
+            request_type=self.OUTPUT_TYPE,
+            doc=doc,
+            arguments=arguments,
+            idx=0,
+            **kwargs,
        )

    def process_results(self, doc, results):
@@ -1446,7 +1498,7 @@ class ConfigurableTask(Task):
            # we expect multiple_targets to be a list.
            elif self.multiple_target:
                gold = list(gold)
-            elif type(gold) != type(result):
+            elif type(gold) is not type(result):
                # cast gold to the same type as result
                gold = type(result)(gold)
@@ -1520,10 +1572,13 @@ class ConfigurableTask(Task):
    def get_config(self, key: str) -> Any:
        return getattr(self._config, key, None)

+    @property
+    def task_name(self) -> Any:
+        return getattr(self.config, "task", None)
+
    def __repr__(self):
        return (
            f"ConfigurableTask(task_name={getattr(self.config, 'task', None)},"
-            f"group_name={getattr(self.config, 'group', None)},"
            f"output_type={self.OUTPUT_TYPE},"
            f"num_fewshot={getattr(self.config, 'num_fewshot', None)},"
            f"num_samples={len(self.eval_docs)})"
...
@@ -11,11 +11,14 @@ import torch

import lm_eval.api.metrics
import lm_eval.api.registry
+import lm_eval.api.task
import lm_eval.models
from lm_eval.caching.cache import delete_cache
from lm_eval.evaluator_utils import (
+    consolidate_group_results,
    consolidate_results,
    get_sample_size,
+    get_subtask_list,
    get_task_list,
    prepare_print_tasks,
    print_writeout,
@@ -23,7 +26,10 @@ from lm_eval.evaluator_utils import (
)
from lm_eval.loggers import EvaluationTracker
from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash
-from lm_eval.tasks import TaskManager, get_task_dict
+from lm_eval.tasks import (
+    TaskManager,
+    get_task_dict,
+)
from lm_eval.utils import (
    eval_logger,
    handle_non_serializable,
@@ -35,7 +41,7 @@ from lm_eval.utils import (

if TYPE_CHECKING:
    from lm_eval.api.model import LM
-    from lm_eval.tasks import Task
+    from lm_eval.api.task import Task


@positional_deprecated
@@ -44,7 +50,7 @@ def simple_evaluate(
    model_args: Optional[Union[str, dict]] = None,
    tasks: Optional[List[Union[str, dict, object]]] = None,
    num_fewshot: Optional[int] = None,
-    batch_size: Optional[int] = None,
+    batch_size: Optional[Union[int, str]] = None,
    max_batch_size: Optional[int] = None,
    device: Optional[str] = None,
    use_cache: Optional[str] = None,
@@ -58,7 +64,7 @@ def simple_evaluate(
    log_samples: bool = True,
    evaluation_tracker: Optional[EvaluationTracker] = None,
    system_instruction: Optional[str] = None,
-    apply_chat_template: bool = False,
+    apply_chat_template: Union[bool, str] = False,
    fewshot_as_multiturn: bool = False,
    gen_kwargs: Optional[str] = None,
    task_manager: Optional[TaskManager] = None,
@@ -106,8 +112,11 @@ def simple_evaluate(
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :param system_instruction: str
        System instruction to be applied to the prompt
-    :param apply_chat_template: bool
-        If True, apply chat template to the prompt
+    :param apply_chat_template: Union[bool, str]
+        Specifies whether to apply a chat template to the prompt.
+        - If set to True, the default chat template is applied.
+        - If set to a string, applies the specified chat template by name.
+        Defaults to False (no chat template applied).
    :param fewshot_as_multiturn: bool
        Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
    :param gen_kwargs: str
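A brief usage sketch (not from the diff) of the widened apply_chat_template argument; the model checkpoint and template name are placeholders, and the string form assumes the tokenizer actually defines a template under that name.

from lm_eval import simple_evaluate

# boolean form: use the tokenizer's default chat template
results = simple_evaluate(
    model="hf",
    model_args="pretrained=my-org/my-chat-model",  # placeholder checkpoint
    tasks=["hellaswag"],
    apply_chat_template=True,
)

# string form: select a named chat template (hypothetical name)
results = simple_evaluate(
    model="hf",
    model_args="pretrained=my-org/my-chat-model",
    tasks=["hellaswag"],
    apply_chat_template="tool_use",
)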
@@ -148,6 +157,9 @@ def simple_evaluate(
        seed_message.append(f"Setting torch manual seed to {torch_random_seed}")
        torch.manual_seed(torch_random_seed)

+    if fewshot_random_seed is not None:
+        seed_message.append(f"Setting fewshot manual seed to {fewshot_random_seed}")
+
    if seed_message:
        eval_logger.info(" | ".join(seed_message))
@@ -199,7 +211,9 @@ def simple_evaluate(
        )
    else:
        if not isinstance(model, lm_eval.api.model.LM):
-            raise TypeError
+            raise TypeError(
+                f"The value of `model` passed to simple_evaluate() was of type {type(model)}, but is required to be a subclass of lm_eval.api.model.LM . This may be because you are passing an initialized Hugging Face PreTrainedModel without having wrapped it in `lm_eval.models.huggingface.HFLM(pretrained=my_model)` first."
+            )
        eval_logger.info("Using pre-initialized model")
        lm = model
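A short sketch (not from the diff) of the wrapping the new error message asks for: handing an already-initialized Hugging Face model to simple_evaluate via HFLM. The checkpoint name is a placeholder.

from transformers import AutoModelForCausalLM, AutoTokenizer

from lm_eval import simple_evaluate
from lm_eval.models.huggingface import HFLM

# placeholder checkpoint name
my_model = AutoModelForCausalLM.from_pretrained("my-org/my-model")
my_tokenizer = AutoTokenizer.from_pretrained("my-org/my-model")

# wrap the raw PreTrainedModel so it satisfies the lm_eval.api.model.LM interface
lm = HFLM(pretrained=my_model, tokenizer=my_tokenizer, batch_size=8)

results = simple_evaluate(model=lm, tasks=["hellaswag"])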
@@ -219,48 +233,58 @@ def simple_evaluate(
        task_manager = TaskManager(verbosity)

    task_dict = get_task_dict(tasks, task_manager)
-    for task_name in task_dict.keys():
-        task_obj = task_dict[task_name]
-        if isinstance(task_obj, tuple):
-            _, task_obj = task_obj
-            if task_obj is None:
-                continue
-
-        if task_obj.get_config("output_type") == "generate_until":
-            if gen_kwargs is not None:
-                task_obj.set_config(
-                    key="generation_kwargs", value=gen_kwargs, update=True
-                )
-
-        if predict_only:
-            log_samples = True
-            eval_logger.info(
-                f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
-            )
-            # we have to change the class properties post-hoc. This is pretty hacky.
-            task_obj.override_metric(metric_name="bypass")
-
-        # override tasks' fewshot values to the provided num_fewshot arg value
-        # except if tasks have it set to 0 manually in their configs--then we should never overwrite that
-        if num_fewshot is not None:
-            if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
-                eval_logger.info(
-                    f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
-                )
-            else:
-                eval_logger.warning(
-                    f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
-                )
-                task_obj.set_config(key="num_fewshot", value=num_fewshot)
-        else:
-            # if num_fewshot not provided, and the task does not define a default one, default to 0
-            if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None:
-                task_obj.set_config(key="num_fewshot", value=0)
-        # fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file)
-        task_obj.set_fewshot_seed(seed=fewshot_random_seed)
-        eval_logger.info(
-            f"Setting fewshot random generator seed to {fewshot_random_seed}"
-        )
+
+    # helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups.
+    # (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed)
+    def _adjust_config(task_dict):
+        adjusted_task_dict = {}
+        for task_name, task_obj in task_dict.items():
+            if isinstance(task_obj, dict):
+                adjusted_task_dict = {
+                    **adjusted_task_dict,
+                    **{task_name: _adjust_config(task_obj)},
+                }
+            else:
+                if task_obj.get_config("output_type") == "generate_until":
+                    if gen_kwargs is not None:
+                        task_obj.set_config(
+                            key="generation_kwargs", value=gen_kwargs, update=True
+                        )
+
+                if predict_only:
+                    eval_logger.info(
+                        f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
+                    )
+                    # we have to change the class properties post-hoc. This is pretty hacky.
+                    task_obj.override_metric(metric_name="bypass")
+
+                # override tasks' fewshot values to the provided num_fewshot arg value
+                # except if tasks have it set to 0 manually in their configs--then we should never overwrite that
+                if num_fewshot is not None:
+                    if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
+                        eval_logger.info(
+                            f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
+                        )
+                    else:
+                        eval_logger.warning(
+                            f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
+                        )
+                        task_obj.set_config(key="num_fewshot", value=num_fewshot)
+                else:
+                    # if num_fewshot not provided, and the task does not define a default one, default to 0
+                    if (
+                        default_num_fewshot := task_obj.get_config("num_fewshot")
+                    ) is None:
+                        task_obj.set_config(key="num_fewshot", value=0)
+                # fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file)
+                task_obj.set_fewshot_seed(seed=fewshot_random_seed)
+
+                adjusted_task_dict[task_name] = task_obj
+
+        return adjusted_task_dict
+
+    task_dict = _adjust_config(task_dict)

    if check_integrity:
        run_task_tests(task_list=tasks)
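A toy illustration (not from the diff) of why _adjust_config recurses: get_task_dict returns groups as nested dicts whose leaves are task objects, so per-task overrides must be applied at the leaves and skip the group level. The group and task names are hypothetical.

# Shape of the structure _adjust_config walks (hypothetical names);
# dict values are groups, leaf values are ConfigurableTask instances.
task_dict = {
    "my_group": {                                  # group -> recurse into it
        "my_subtask_a": "<ConfigurableTask>",
        "my_subtask_b": "<ConfigurableTask>",
    },
    "standalone_task": "<ConfigurableTask>",       # leaf -> apply num_fewshot / gen_kwargs / seed here
}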
@@ -270,7 +294,7 @@ def simple_evaluate(
            model_source=model,
            model_args=model_args,
            system_instruction=system_instruction,
-            chat_template=lm.chat_template if apply_chat_template else None,
+            chat_template=lm.chat_template(apply_chat_template),
            fewshot_as_multiturn=fewshot_as_multiturn,
        )
@@ -282,7 +306,7 @@ def simple_evaluate(
        rewrite_requests_cache=rewrite_requests_cache,
        bootstrap_iters=bootstrap_iters,
        write_out=write_out,
-        log_samples=log_samples,
+        log_samples=True if predict_only else log_samples,
        system_instruction=system_instruction,
        apply_chat_template=apply_chat_template,
        fewshot_as_multiturn=fewshot_as_multiturn,
@@ -343,7 +367,7 @@ def evaluate(
    write_out: bool = False,
    log_samples: bool = True,
    system_instruction: Optional[str] = None,
-    apply_chat_template: bool = False,
+    apply_chat_template: Union[bool, str] = False,
    fewshot_as_multiturn: bool = False,
    verbosity: str = "INFO",
):
@@ -363,8 +387,11 @@ def evaluate(
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :param system_instruction: str
        System instruction to be applied to the prompt
-    :param apply_chat_template: bool
-        If True, apply chat template to the prompt
+    :param apply_chat_template: Union[bool, str]
+        Specifies whether to apply a chat template to the prompt.
+        - If set to True, the default chat template is applied.
+        - If set to a string, applies the specified chat template by name.
+        Defaults to False (no chat template applied).
    :param fewshot_as_multiturn: bool
        Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
    :return
@@ -380,16 +407,40 @@ def evaluate(
    padding_requests = defaultdict(int)

    # get lists of group hierarchy and each type of request
-    task_hierarchy, eval_tasks = get_task_list(task_dict)
+    eval_tasks = get_task_list(task_dict)
    if not log_samples:
        if not all(
            "bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys()
            for task_output in eval_tasks
        ):
            raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
+
+    # validation check: are we running multimodal task <-> non-multimodal model class, or vice-versa.
+    incompatible_tasks = []
    for task_output in eval_tasks:
        task: Task = task_output.task
-        limit = get_sample_size(task, limit)
+
+        if getattr(lm, "MULTIMODAL", False) != getattr(task, "MULTIMODAL", False):
+            incompatible_tasks.append(task_output.task_name)
+    if len(incompatible_tasks) > 0:
+        if not getattr(lm, "MULTIMODAL", False):
+            raise ValueError(
+                f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type."
+            )
+        else:
+            raise ValueError(
+                f"Attempted to run tasks: {incompatible_tasks} which are text-only, but used a model type which only currently supports multimodal tasks."
+            )
+    # end multimodality validation check
+
+    # Cache the limit arg.
+    limit_arg = limit
+    limits = []
+    for task_output in eval_tasks:
+        task: Task = task_output.task
+
+        limit = get_sample_size(task, limit_arg)
+        limits.append(limit)
        task.build_all_requests(
            limit=limit,
            rank=lm.rank,
@@ -397,7 +448,7 @@ def evaluate(
            cache_requests=cache_requests,
            rewrite_requests_cache=rewrite_requests_cache,
            system_instruction=system_instruction,
-            apply_chat_template=apply_chat_template,
+            apply_chat_template=bool(apply_chat_template),
            fewshot_as_multiturn=fewshot_as_multiturn,
            chat_template=getattr(lm, "apply_chat_template")
            if apply_chat_template
@@ -459,7 +510,7 @@ def evaluate(
        WORLD_SIZE = lm.world_size

    ### Postprocess outputs ###
    # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
-    for task_output in eval_tasks:
+    for task_output, limit in zip(eval_tasks, limits):
        task = task_output.task
        task.apply_filters()
@@ -557,106 +608,45 @@ def evaluate(
        ### Calculate group metrics ###
        if bool(results):
-            for group, task_list in reversed(task_hierarchy.items()):
-                if len(task_list) == 0:
-                    # task_hierarchy entries are either
-                    # `group_name: [subtask1, subtask2, ...]`
-                    # or `task_name: []`.
-                    # we only want to operate on groups here.
-                    continue
-
-                # collect all higher_is_better values for metrics
-                # in the group's subtasks.
-                # TODO: clean this up ; unify with the below metric_list loop?
-                _higher_is_better = {}
-                for task in task_list:
-                    for m, h in higher_is_better[task].items():
-                        if m not in _higher_is_better.keys():
-                            _higher_is_better[m] = h
-                        if (
-                            m in _higher_is_better
-                            and _higher_is_better[m] is not None
-                            and _higher_is_better[m] != h
-                        ):
-                            eval_logger.warning(
-                                f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
-                            )
-                            _higher_is_better[m] = None
-                higher_is_better[group] = _higher_is_better
-
-                # collect all metric keys used by a subtask in the group.
-                metric_list = list(
-                    {
-                        key
-                        for task in task_list
-                        for key in results[task].keys()
-                        if "_stderr" not in key and key not in ["alias", "samples"]
-                    }
-                )
-                for metric in metric_list:
-                    stderr = "_stderr,".join(metric.split(","))
-
-                    # gather metrics, sizes, and stderrs from subtasks
-                    metrics = [
-                        results[task][metric]
-                        for task in task_list
-                        if metric in results[task]
-                    ]  # TODO: copy?
-                    stderrs = [
-                        results[task][stderr]
-                        for task in task_list
-                        if stderr in results[task]
-                    ]
-                    sizes = [
-                        results[task]["samples"]
-                        for task in task_list
-                        if metric in results[task]
-                    ]
-
-                    # compute group's pooled metric and stderr
-                    results[group][metric] = (
-                        lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
-                    )
-                    # TODO: calculate grouped metric using aggregation fn
-                    if "N/A" in stderrs:
-                        results[group][stderr] = "N/A"
-                    else:
-                        results[group][stderr] = (
-                            lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
-                        )
-                        # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
-                        # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
-                        # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
-
-                    results[group]["samples"] = sum(sizes)
-
-        results_agg = defaultdict(dict)
-        groups_agg = defaultdict(dict)
-        all_tasks_list = list(task_hierarchy.keys())
-        while True:
-            add_tasks_list = list(k for k in results_agg.keys())
-            left_tasks_list = sorted(list(set(all_tasks_list) - set(add_tasks_list)))
-            if len(left_tasks_list) == 0:
-                break
-
-            _task_hierarchy = {
-                k: v for k, v in task_hierarchy.items() if k in left_tasks_list
-            }
-            _results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results)
-
-            results_agg = {**results_agg, **_results_agg}
-            groups_agg = {**groups_agg, **_groups_agg}
-
-        for group_name, task_list in task_hierarchy.items():
-            if task_list:
-                num_fewshot[group_name] = num_fewshot[
-                    task_list[0]
-                ]  # TODO: validate this
+            results, versions, show_group_table, *_ = consolidate_group_results(
+                results, versions, task_dict
+            )
+
+        results_agg, group_agg = prepare_print_tasks(task_dict, results)
+        subtask_list = get_subtask_list(task_dict)
+
+        # collect all higher_is_better values for metrics
+        # in the group's subtasks.
+        # TODO: clean this up ; unify with the below metric_list loop?
+        _higher_is_better = {}
+        for group, task_list in subtask_list.items():
+            if (
+                len(task_list) != 0
+            ):  # subtask list will list "task_name": [] for solo tasks
+                for task in task_list:
+                    for m, h in higher_is_better[task].items():
+                        if m not in _higher_is_better.keys():
+                            _higher_is_better[m] = h
+                        if (
+                            m in _higher_is_better
+                            and _higher_is_better[m] is not None
+                            and _higher_is_better[m] != h
+                        ):
+                            eval_logger.warning(
+                                f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
+                            )
+                            _higher_is_better[m] = None
+                higher_is_better[group] = _higher_is_better

        results_dict = {
            "results": dict(results_agg.items()),
-            **({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
-            "group_subtasks": dict(reversed(task_hierarchy.items())),
+            **(
+                {"groups": dict(group_agg.items())}
+                if (bool(group_agg) & show_group_table)
+                else {}
+            ),
+            "group_subtasks": dict(reversed(subtask_list.items())),
            "configs": dict(sorted(configs.items())),
            "versions": dict(sorted(versions.items())),
            "n-shot": dict(sorted(num_fewshot.items())),
@@ -669,7 +659,7 @@ def evaluate(
                    len(task_output.task.eval_docs),
                ),
            }
-            for task_output in eval_tasks
+            for task_output, limit in zip(eval_tasks, limits)
        },
    }
    if log_samples:
...
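For reference, a small self-contained sketch (not from the diff) of the size-weighted pooling that the removed block computed via lm_eval.api.metrics.aggregate_subtask_metrics, and which the new consolidate_group_results path is assumed to handle internally. The numbers are invented.

# per-subtask accuracy and sample counts (invented numbers)
metrics = [0.62, 0.48, 0.55]
sizes = [1000, 500, 250]

# size-weighted mean: sum_i (m_i * n_i) / sum_i n_i
group_score = sum(m * n for m, n in zip(metrics, sizes)) / sum(sizes)
print(round(group_score, 4))  # 0.57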
@@ -62,11 +62,8 @@ class WhitespaceFilter(Filter):
        def filter_set(inst):
            filtered_resp = []
            for resp in inst:
-                if resp.startswith(" "):
-                    resp = resp[1:]
+                resp = resp.lstrip()
                filtered_resp.append(resp)
            return filtered_resp

        filtered_resps = [filter_set(resp) for resp in resps]
...
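A tiny sketch (not from the diff) of the behavioral change in WhitespaceFilter: lstrip() removes all leading whitespace, including tabs and repeated spaces, whereas the old code stripped at most one leading space.

responses = [" answer", "   answer", "\tanswer"]

# old behavior: drop at most one leading space
old = [r[1:] if r.startswith(" ") else r for r in responses]
# new behavior: drop all leading whitespace, including tabs
new = [r.lstrip() for r in responses]

print(old)  # ['answer', '  answer', '\tanswer']
print(new)  # ['answer', 'answer', 'answer']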
import json
+import os
import re
import time
from collections import defaultdict
@@ -14,6 +15,7 @@ from huggingface_hub import (
    HfApi,
    hf_hub_url,
)
+from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status

from lm_eval.utils import (
    eval_logger,
@@ -112,12 +114,15 @@ class EvaluationTracker:
        output_path: str = None,
        hub_results_org: str = "",
        hub_repo_name: str = "",
+        details_repo_name: str = "",
+        results_repo_name: str = "",
        push_results_to_hub: bool = False,
        push_samples_to_hub: bool = False,
        public_repo: bool = False,
        token: str = "",
        leaderboard_url: str = "",
        point_of_contact: str = "",
+        gated: bool = False,
    ) -> None:
        """
        Creates all the necessary loggers for evaluation tracking.
@@ -126,12 +131,15 @@ class EvaluationTracker:
            output_path (str): Path to save the results. If not provided, the results won't be saved.
            hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token.
            hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
+            details_repo_name (str): The name of the Hugging Face repository to push the details to. If not provided, the results will be pushed to `lm-eval-results`.
+            result_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will not be pushed and will be found in the details_hub_repo.
            push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
            push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
            public_repo (bool): Whether to push the results to a public or private repository.
            token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
            leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card.
            point_of_contact (str): Contact information on the Hugging Face hub dataset card.
+            gated (bool): Whether to gate the repository.
        """
        self.general_config_tracker = GeneralConfigTracker()
@@ -142,6 +150,7 @@ class EvaluationTracker:
        self.leaderboard_url = leaderboard_url
        self.point_of_contact = point_of_contact
        self.api = HfApi(token=token) if token else None
+        self.gated_repo = gated

        if not self.api and (push_results_to_hub or push_samples_to_hub):
            raise ValueError(
@@ -159,9 +168,24 @@ class EvaluationTracker:
                    f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'."
                )

-            hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results"
-            self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}"
-            self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private"
+            if hub_repo_name == "":
+                details_repo_name = (
+                    details_repo_name if details_repo_name != "" else "lm-eval-results"
+                )
+                results_repo_name = (
+                    results_repo_name if results_repo_name != "" else details_repo_name
+                )
+            else:
+                details_repo_name = hub_repo_name
+                results_repo_name = hub_repo_name
+                eval_logger.warning(
+                    "hub_repo_name was specified. Both details and results will be pushed to the same repository. Using hub_repo_name is no longer recommended, details_repo_name and results_repo_name should be used instead."
+                )
+
+            self.details_repo = f"{hub_results_org}/{details_repo_name}"
+            self.details_repo_private = f"{hub_results_org}/{details_repo_name}-private"
+            self.results_repo = f"{hub_results_org}/{results_repo_name}"
+            self.results_repo_private = f"{hub_results_org}/{results_repo_name}-private"
    def save_results_aggregated(
        self,
@@ -211,9 +235,9 @@ class EvaluationTracker:
        if self.api and self.push_results_to_hub:
            repo_id = (
-                self.hub_results_repo
+                self.results_repo
                if self.public_repo
-                else self.hub_results_repo_private
+                else self.results_repo_private
            )
            self.api.create_repo(
                repo_id=repo_id,
@@ -221,10 +245,15 @@ class EvaluationTracker:
                private=not self.public_repo,
                exist_ok=True,
            )
-            self.api.upload_folder(
+            self.api.upload_file(
                repo_id=repo_id,
-                folder_path=str(path),
-                path_in_repo=self.general_config_tracker.model_name_sanitized,
+                path_or_fileobj=str(
+                    path.joinpath(f"results_{self.date_id}.json")
+                ),
+                path_in_repo=os.path.join(
+                    self.general_config_tracker.model_name,
+                    f"results_{self.date_id}.json",
+                ),
                repo_type="dataset",
                commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
            )
@@ -278,6 +307,7 @@ class EvaluationTracker:
                sample["resps"] = sanitize_list(sample["resps"])
                sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
                sample["arguments"] = arguments
+                sample["target"] = str(sample["target"])

                sample_dump = (
                    json.dumps(
@@ -288,14 +318,14 @@ class EvaluationTracker:
                    + "\n"
                )

-                with open(file_results_samples, "a") as f:
+                with open(file_results_samples, "a", encoding="utf-8") as f:
                    f.write(sample_dump)

        if self.api and self.push_samples_to_hub:
            repo_id = (
-                self.hub_results_repo
+                self.details_repo
                if self.public_repo
-                else self.hub_results_repo_private
+                else self.details_repo_private
            )
            self.api.create_repo(
                repo_id=repo_id,
@@ -303,6 +333,18 @@ class EvaluationTracker:
                private=not self.public_repo,
                exist_ok=True,
            )
+            try:
+                if self.gated_repo:
+                    headers = build_hf_headers()
+                    r = get_session().put(
+                        url=f"https://huggingface.co/api/datasets/{repo_id}/settings",
+                        headers=headers,
+                        json={"gated": "auto"},
+                    )
+                    hf_raise_for_status(r)
+            except Exception as e:
+                eval_logger.warning("Could not gate the repository")
+                eval_logger.info(repr(e))
            self.api.upload_folder(
                repo_id=repo_id,
                folder_path=str(path),
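A brief usage sketch (not part of the diff) of the new tracker arguments introduced in this file: separate details/results repositories plus gating. The organization, repository names, and token are placeholders.

from lm_eval.loggers import EvaluationTracker

tracker = EvaluationTracker(
    output_path="./eval_output",
    hub_results_org="my-org",              # placeholder organization
    details_repo_name="lm-eval-details",   # per-sample details go here
    results_repo_name="lm-eval-results",   # aggregated results go here
    push_results_to_hub=True,
    push_samples_to_hub=True,
    public_repo=False,
    gated=True,                            # request gated ("auto") access on the details dataset
    token="hf_...",                        # placeholder token
)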
@@ -327,9 +369,7 @@ class EvaluationTracker:
        """
        eval_logger.info("Recreating metadata card")
-        repo_id = (
-            self.hub_results_repo if self.public_repo else self.hub_results_repo_private
-        )
+        repo_id = self.details_repo if self.public_repo else self.details_repo_private

        files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        results_files = get_results_filenames(files_in_repo)
@@ -360,7 +400,10 @@ class EvaluationTracker:
                results_datetime,
            )
            latest_task_results_datetime[samples_key] = latest_datetime
-            latest_task_results_datetime[results_key] = latest_datetime
+            latest_task_results_datetime[results_key] = max(
+                latest_task_results_datetime[results_key],
+                latest_datetime,
+            )

        # Create metadata card
        card_metadata = MetadataConfigs()
@@ -377,14 +420,15 @@ class EvaluationTracker:
            sanitized_last_eval_date_results = re.sub(
                r"[^\w\.]", "_", latest_task_results_datetime[config_name]
            )
+            # Ensure that all results files are listed in the metadata card
+            current_results = card_metadata.get(config_name, {"data_files": []})
+            current_results["data_files"].append(
+                {"split": eval_date_sanitized, "path": [str(results_filename)]}
+            )
+            card_metadata[config_name] = current_results
+            # If the results file is the newest, update the "latest" field in the metadata card
            if eval_date_sanitized == sanitized_last_eval_date_results:
-                # Ensure that all results files are listed in the metadata card
-                current_results = card_metadata.get(config_name, {"data_files": []})
-                current_results["data_files"].append(
-                    {"split": eval_date_sanitized, "path": [str(results_filename)]}
-                )
-                card_metadata[config_name] = current_results
-                # If the results file is the newest, update the "latest" field in the metadata card
                card_metadata[config_name]["data_files"].append(
                    {"split": "latest", "path": [str(results_filename)]}
                )
@@ -403,65 +447,20 @@ class EvaluationTracker:
            sanitized_last_eval_date_results = re.sub(
                r"[^\w\.]", "_", latest_task_results_datetime[config_name]
            )
+            # Ensure that all sample results files are listed in the metadata card
+            current_details_for_task = card_metadata.get(
+                config_name, {"data_files": []}
+            )
+            current_details_for_task["data_files"].append(
+                {"split": eval_date_sanitized, "path": [str(results_filename)]}
+            )
+            card_metadata[config_name] = current_details_for_task
+            # If the samples results file is the newest, update the "latest" field in the metadata card
            if eval_date_sanitized == sanitized_last_eval_date_results:
-                # Ensure that all sample results files are listed in the metadata card
-                current_details_for_task = card_metadata.get(
-                    config_name, {"data_files": []}
-                )
-                current_details_for_task["data_files"].append(
-                    {"split": eval_date_sanitized, "path": [str(results_filename)]}
-                )
-                card_metadata[config_name] = current_details_for_task
-                # If the samples results file is the newest, update the "latest" field in the metadata card
                card_metadata[config_name]["data_files"].append(
                    {"split": "latest", "path": [str(results_filename)]}
                )

-            # Special case for MMLU with a single split covering it all
-            # We add another config with all MMLU splits results together for easy inspection
-            SPECIAL_TASKS = ["mmlu", "gpqa", "minerva_math"]
-            for special_task in SPECIAL_TASKS:
-                if special_task in config_name:
-                    special_task = f"{model_name}__{special_task}"
-                    former_entry = card_metadata.get(special_task, {"data_files": []})
-
-                    former_split = [
-                        (i, entry)
-                        for i, entry in enumerate(former_entry["data_files"])
-                        if entry.get("split", None) == eval_date_sanitized
-                    ]
-
-                    if len(former_split) == 0:
-                        former_entry["data_files"].append(
-                            {
-                                "split": eval_date_sanitized,
-                                "path": [str(results_filename)],
-                            }
-                        )
-                    else:
-                        split_index, _ = former_split[0]
-                        former_entry["data_files"][split_index]["path"].append(
-                            str(results_filename)
-                        )
-
-                    if eval_date_sanitized == sanitized_last_eval_date_results:
-                        latest_split = [
-                            (i, entry)
-                            for i, entry in enumerate(former_entry["data_files"])
-                            if entry.get("split", None) == "latest"
-                        ]
-                        if len(latest_split) == 0:
-                            former_entry["data_files"].append(
-                                {"split": "latest", "path": [str(results_filename)]}
-                            )
-                        else:
-                            latest_index, _ = latest_split[0]
-                            former_entry["data_files"][latest_index]["path"].append(
-                                str(results_filename)
-                            )
-
-                    card_metadata[special_task] = former_entry
-
        # Get latest results and extract info to update metadata card examples
        latest_datetime = max(latest_task_results_datetime.values())
        latest_model_name = max(
...
@@ -114,15 +114,29 @@ def add_env_info(storage: Dict[str, Any]):

def add_tokenizer_info(storage: Dict[str, Any], lm):
    if getattr(lm, "tokenizer", False):
-        tokenizer_info = {
-            "tokenizer_pad_token": [lm.tokenizer.pad_token, lm.tokenizer.pad_token_id],
-            "tokenizer_eos_token": [lm.tokenizer.eos_token, lm.tokenizer.eos_token_id],
-            "tokenizer_bos_token": [lm.tokenizer.bos_token, lm.tokenizer.bos_token_id],
-            "eot_token_id": getattr(lm, "eot_token_id", None),
-            "max_length": getattr(lm, "max_length", None),
-        }
-        storage.update(tokenizer_info)
-        # seems gguf and textsynth do not have tokenizer
+        try:
+            tokenizer_info = {
+                "tokenizer_pad_token": [
+                    lm.tokenizer.pad_token,
+                    str(lm.tokenizer.pad_token_id),
+                ],
+                "tokenizer_eos_token": [
+                    lm.tokenizer.eos_token,
+                    str(lm.tokenizer.eos_token_id),
+                ],
+                "tokenizer_bos_token": [
+                    lm.tokenizer.bos_token,
+                    str(lm.tokenizer.bos_token_id),
+                ],
+                "eot_token_id": getattr(lm, "eot_token_id", None),
+                "max_length": getattr(lm, "max_length", None),
+            }
+            storage.update(tokenizer_info)
+        except Exception as err:
+            logger.debug(
+                f"Logging detailed tokenizer info failed with {err}, skipping..."
+            )
+        # seems gguf and textsynth do not have tokenizer
    else:
        logger.debug(
            "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."
...
from . import (
    anthropic_llms,
+    api_models,
    dummy,
    gguf,
+    hf_vlms,
    huggingface,
+    ibm_watsonx_ai,
    mamba_lm,
    nemo_lm,
    neuralmagic,
@@ -11,6 +14,7 @@ from . import (
    optimum_lm,
    textsynth,
    vllm_causallms,
+    vllm_vlms,
)
...
@@ -26,9 +26,9 @@ class DummyLM(LM):
    def generate_until(self, requests, disable_tqdm: bool = False):
        res = []

-        for ctx, _ in tqdm(requests, disable=disable_tqdm):
+        for request in tqdm(requests, disable=disable_tqdm):
            res.append("lol")
-            assert ctx.strip() != ""
+            assert request.arguments[0].strip() != ""

        return res
...
@@ -68,7 +68,9 @@ class GGUFLM(LM):
                logger.error(f"RequestException: {e}")
                time.sleep(delay)  # wait before retrying
        else:
-            raise Exception(f"Failed to get a valid response after {retries} retries.")
+            raise RuntimeError(
+                f"Failed to get a valid response after {retries} retries."
+            )

    def loglikelihood(self, requests, disable_tqdm: bool = False):
        if not requests:
...
@@ -69,8 +69,8 @@ class MambaLMWrapper(HFLM):
    ) -> None:
        try:
            from mamba_ssm.utils.hf import load_config_hf  # noqa: F811
-        except ModuleNotFoundError:
-            raise Exception(
+        except ModuleNotFoundError as exception:
+            raise type(exception)(
                "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
            )
@@ -88,8 +88,8 @@ please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`
    ) -> None:
        try:
            from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel  # noqa: F811
-        except ModuleNotFoundError:
-            raise Exception(
+        except ModuleNotFoundError as exception:
+            raise type(exception)(
                "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
            )
...