Unverified Commit 5ccd65d4 authored by Baber Abbasi, committed by GitHub

Refactor `evaluator.evaluate` (#1441)



* change `all_gather` to `gather`

* add TaskOutput utility class

* Add FilterResults class and refactor task handling.

* Rename `key` to `filter_key` for clarity

* Add `print_writeout` function in utils.py

* Add function to calculate limit size.

* Add doc_iterator method to Task class

* Refactor `doc_iterator` and cleanup in Task class

* remove superfluous bits

* change `all_gather` to `gather`

* bugfix

* bugfix

* fix `gather`

* Refactor `gather` loop

* Refactor aggregate metrics calculation

* Refactor and simplify aggregate metrics calculation
Removed unused code

* Simplify metrics calculation and remove unused code.

* simplify the metrics calculation in `utils.py` and `evaluator.py`.

* Fix group metric

* change evaluate to hf_evaluate

* change evaluate to hf_evaluate

* add docs

* add docs

* nits

* make islice keyword only

* nit

* add todo

* nit

* nit

* nit: swap order samples_metrics tuple

* move instance sorting outside loop

* nit

* nit

* Add __repr__ for ConfigurableTask

* nit

* nit

* Revert "nit"

This reverts commit dab8d9977a643752a17f840fd8cf7e4b107df28f.

* fix some logging

* nit

* fix `predict_only` bug. thanks to `@LSinev`!

* change `print_tasks` to `prepare_print_tasks`

* nits

* move eval utils

* move eval utils

* nit

* add comment

* added tqdm descriptions

* Update lm_eval/evaluator_utils.py
Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>

* fix mgsm bug

* nit

* fix `build_all_requests`

* pre-commit

* add ceil to limit

---------
Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>
parent 96d185fa
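The bullet list above is terse, so here is a condensed, hedged sketch of how the new `evaluator_utils` helpers are meant to compose inside `evaluate`, reconstructed only from the hunks below. It deliberately omits request caching, `log_samples` handling, write-out, and the distributed gather; `evaluate_sketch` is an illustrative name, not a function in the codebase.

```python
# Condensed sketch of the refactored evaluate() control flow (assumptions:
# the lm object exposes .rank/.world_size as in the diff, and the cache-related
# keyword arguments of build_all_requests are left at their defaults).
import collections

from lm_eval.evaluator_utils import (
    consolidate_results,
    get_sample_size,
    get_task_list,
)


def evaluate_sketch(lm, task_dict, limit=None, bootstrap_iters=100000):
    # 1. wrap each task in a TaskOutput and record the group hierarchy
    task_hierarchy, eval_tasks = get_task_list(task_dict)

    # 2. per task: resolve the effective sample limit and build requests
    for task_output in eval_tasks:
        task = task_output.task
        task_limit = get_sample_size(task, limit)
        task.build_all_requests(
            limit=task_limit, rank=lm.rank, world_size=lm.world_size
        )

    # ... the LM is run on the collected instances here (omitted) ...

    # 3. per task: score each document, bucketing values by (metric, filter_key)
    for task_output in eval_tasks:
        task = task_output.task
        task.apply_filters()
        instances_by_doc_id = collections.defaultdict(list)
        for instance in task.instances:
            instances_by_doc_id[instance.doc_id].append(instance)
        for instances in instances_by_doc_id.values():
            instances.sort(key=lambda x: x.idx)
        for filter_key in task.instances[0].filtered_resps.keys():
            doc_iterator = task.doc_iterator(
                rank=lm.rank,
                limit=get_sample_size(task, limit),
                world_size=lm.world_size,
            )
            for doc_id, doc in doc_iterator:
                requests = instances_by_doc_id[doc_id]
                metrics = task.process_results(
                    doc, [req.filtered_resps[filter_key] for req in requests]
                )
                for metric, value in metrics.items():
                    task_output.sample_metrics[(metric, filter_key)].append(value)

    # 4. aggregate per task, then consolidate into results/samples/configs/...
    for task_output in eval_tasks:
        task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters)
    return consolidate_results(eval_tasks)
```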
......@@ -225,7 +225,7 @@ class CachingLM:
eval_logger.info(
f"Loading '{attr}' responses from cache '{self.cache_db}' where possible..."
)
for req in tqdm(requests):
for req in tqdm(requests, desc="Checking cached requests"):
hsh = hash_args(attr, req.args)
if attr == "generate_until" and req.args[1].get("do_sample", False):
# when we are doing non-greedy generation, don't use the cache
......@@ -246,7 +246,9 @@ class CachingLM:
else:
res.append(None)
remaining_reqs.append(req)
eval_logger.info(
f"Cached requests: {len(requests) - len(remaining_reqs)}, Requests remaining: {len(remaining_reqs)}"
)
# actually run the LM on the requests that do not have cached results
rem_res = getattr(self.lm, attr)(remaining_reqs)
......
......@@ -7,7 +7,7 @@ from collections.abc import Callable
from copy import deepcopy
from dataclasses import asdict, dataclass
from inspect import getsource
from typing import Any, List, Literal, Tuple, Union
from typing import Any, Iterator, List, Literal, Tuple, Union
import datasets
import numpy as np
......@@ -327,7 +327,7 @@ class Task(abc.ABC):
return doc
@property
def instances(self):
def instances(self) -> List[Instance]:
"""After calling `task.build_all_requests()`, tasks
maintain a list of the dataset instances which will be evaluated.
"""
......@@ -355,6 +355,7 @@ class Task(abc.ABC):
def build_all_requests(
self,
*,
limit=None,
rank=None,
world_size=None,
......@@ -382,13 +383,6 @@ class Task(abc.ABC):
self._instances = flattened_instances
return
if self.has_test_docs():
docs = self.test_docs()
elif self.has_validation_docs():
docs = self.validation_docs()
else:
assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
eval_logger.info(f"Building contexts for {self.config.task} on rank {rank}...")
instances = []
......@@ -402,12 +396,7 @@ class Task(abc.ABC):
limit = None
doc_id_docs = list(
utils.create_iterator(
enumerate(docs),
rank,
world_size,
limit,
)
self.doc_iterator(rank=rank, limit=limit, world_size=world_size)
)
num_docs = len(doc_id_docs)
......@@ -632,6 +621,27 @@ class Task(abc.ABC):
setattr(self._config, "metric_list", [{"metric": metric_name}])
setattr(self._config, "process_results", None)
@property
def eval_docs(self) -> Union[datasets.Dataset, List[dict]]:
if self.has_test_docs():
return self.test_docs()
elif self.has_validation_docs():
return self.validation_docs()
else:
assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
def doc_iterator(
self, *, rank: int = 0, limit: Union[int, None] = None, world_size: int = 1
) -> Iterator[Tuple[int, Any]]:
limit = int(limit) if limit else None
doc_iterator = utils.create_iterator(
enumerate(self.eval_docs),
rank=int(rank),
limit=limit,
world_size=int(world_size),
)
return doc_iterator
class ConfigurableTask(Task):
VERSION = "Yaml"
......@@ -781,12 +791,7 @@ class ConfigurableTask(Task):
else "default"
)(list(self.fewshot_docs()), self, rnd=random.Random(1234))
if self.has_test_docs():
self.task_docs = self.test_docs()
elif self.has_validation_docs():
self.task_docs = self.validation_docs()
else:
assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
self.task_docs = self.eval_docs
# Test One Doc
self.features = list(self.task_docs.features.keys())
......@@ -1336,6 +1341,15 @@ class ConfigurableTask(Task):
def get_config(self, key: str) -> Any:
return getattr(self._config, key, None)
def __repr__(self):
return (
f"ConfigurableTask(task_name={getattr(self.config, 'task', None)},"
f"group_name={getattr(self.config, 'group', None)},"
f"output_type={self.OUTPUT_TYPE},"
f"num_fewshot={getattr(self.config, 'num_fewshot', None)},"
f"num_samples={len(self.eval_docs)})"
)
class MultipleChoiceTask(Task):
OUTPUT_TYPE: str = "loglikelihood"
......
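A hedged usage sketch of the `eval_docs` / `doc_iterator` additions above: `preview_docs` and `my_task` are illustrative stand-ins, not part of the diff; only the iteration pattern and the new `__repr__` come from the code above.

```python
# Hypothetical helper showing what the new Task.doc_iterator / eval_docs
# surface is for; my_task stands in for any ConfigurableTask instance.
def preview_docs(my_task, n: int = 3) -> None:
    # doc_iterator yields (doc_id, doc) pairs over eval_docs -- the test
    # split when present, otherwise the validation split -- sliced by
    # rank/world_size and stopped at `limit`.
    for doc_id, doc in my_task.doc_iterator(rank=0, limit=n, world_size=1):
        print(doc_id, my_task.doc_to_text(doc))
    # the new __repr__ summarizes task name, group, output type,
    # fewshot count, and len(eval_docs)
    print(repr(my_task))
```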
import collections
import itertools
import logging
import math
import random
from typing import TYPE_CHECKING, Optional, Union
......@@ -11,12 +10,19 @@ import torch
import lm_eval.api.metrics
import lm_eval.api.registry
import lm_eval.models
from lm_eval.evaluator_utils import (
consolidate_results,
get_sample_size,
get_task_list,
prepare_print_tasks,
print_writeout,
run_task_tests,
)
from lm_eval.logging_utils import add_env_info, get_git_commit_hash
from lm_eval.tasks import TaskManager, get_task_dict
from lm_eval.utils import (
eval_logger,
positional_deprecated,
run_task_tests,
simple_parse_args_string,
)
......@@ -111,19 +117,23 @@ def simple_evaluate(
eval_logger.info("Deleting requests cache...")
delete_cache()
seed_message = []
if random_seed is not None:
# See https://github.com/EleutherAI/lm-evaluation-harness/pull/1412
eval_logger.info(f"Setting random seed to {random_seed}")
seed_message.append(f"Setting random seed to {random_seed}")
random.seed(random_seed)
if numpy_random_seed is not None:
eval_logger.info(f"Setting numpy seed to {numpy_random_seed}")
seed_message.append(f"Setting numpy seed to {numpy_random_seed}")
np.random.seed(numpy_random_seed)
if torch_random_seed is not None:
eval_logger.info(f"Setting torch manual seed to {torch_random_seed}")
seed_message.append(f"Setting torch manual seed to {torch_random_seed}")
torch.manual_seed(torch_random_seed)
if seed_message:
eval_logger.info(" | ".join(seed_message))
if tasks is None:
tasks = []
assert (
......@@ -166,7 +176,7 @@ def simple_evaluate(
lm = model
if use_cache is not None:
print(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}")
eval_logger.info(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}")
lm = lm_eval.api.model.CachingLM(
lm,
use_cache
......@@ -198,13 +208,13 @@ def simple_evaluate(
key="generation_kwargs", value=gen_kwargs, update=True
)
if predict_only:
log_samples = True
eval_logger.info(
f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
)
# we have to change the class properties post-hoc. This is pretty hacky.
task_obj.override_metric(metric_name="bypass")
if predict_only:
log_samples = True
eval_logger.info(
f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
)
# we have to change the class properties post-hoc. This is pretty hacky.
task_obj.override_metric(metric_name="bypass")
if num_fewshot is not None:
if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
......@@ -299,82 +309,22 @@ def evaluate(
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
# decontaminate = decontamination_ngrams_path is not None
for task_name, task in task_dict.items():
if isinstance(task, tuple):
_, task = task
if not log_samples:
assert (
"bypass" not in getattr(task, "_metric_fn_list", {}).keys()
), f"log_samples must be True for 'bypass' only tasks: {task_name}"
# stores the final result for each task, for each metric/filter pair.
results = collections.defaultdict(dict)
# Tracks each task's version.
versions = collections.defaultdict(dict)
# Tracks the YAML configs of all chosen tasks.
configs = collections.defaultdict(dict)
# logs info about each document evaluated.
samples = collections.defaultdict(list)
# tracks all Instances/requests a model must generate output on.
requests = collections.defaultdict(list)
# Aggregated task scores presented with groups
results_agg = collections.defaultdict(dict)
# Aggregated groups scores only
groups_agg = collections.defaultdict(dict)
# stores the amount to pad out reqs per req. type so that
# number of fwd passes per distributed rank is equal
padding_requests = collections.defaultdict(int)
# store the hierarchy to do proper ordering
task_hierarchy = collections.defaultdict(list)
# store num-fewshot value per task
num_fewshot = collections.defaultdict(int)
# get lists of each type of request
for task_name, task in task_dict.items():
task: Task
if isinstance(task, tuple):
group_name, task = task
task_hierarchy[group_name].append(task_name)
versions[group_name] = "N/A"
else:
group_name = None
task_hierarchy[task_name] = []
if task is None:
continue
versions[task_name] = task.VERSION
configs[task_name] = dict(task.dump_config())
# Number of few-shots for printing.
if (n_shot := configs[task_name].get("num_fewshot")) == 0:
n_shot = configs[task_name].get("metadata", {}).get("num_fewshot", 0)
num_fewshot[task_name] = n_shot
if "task_alias" in configs[task_name]:
results[task_name]["alias"] = configs[task_name]["task_alias"]
if (
("group_alias" in configs[task_name])
and (group_name not in results)
and (group_name is not None)
):
results[group_name]["alias"] = configs[task_name]["group_alias"]
if limit is not None:
if task.has_test_docs():
task_docs = task.test_docs()
elif task.has_validation_docs():
task_docs = task.validation_docs()
else:
raise RuntimeError("Task has neither test_docs nor validation_docs")
num_docs = len(task_docs) * limit
# ceil to prevent limit being equal to 0
limit = int(math.ceil(num_docs)) if limit < 1.0 else int(limit)
# get lists of group hierarchy and each type of request
task_hierarchy, eval_tasks = get_task_list(task_dict)
if not log_samples:
assert all(
"bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys()
for task_output in eval_tasks
), "log_samples must be True for 'bypass' only tasks"
for task_output in eval_tasks:
task: Task = task_output.task
limit = get_sample_size(task, limit)
task.build_all_requests(
limit=limit,
rank=lm.rank,
......@@ -382,21 +332,12 @@ def evaluate(
cache_requests=cache_requests,
rewrite_requests_cache=rewrite_requests_cache,
)
eval_logger.debug(
f"Task: {task_name}; number of requests on this rank: {len(task.instances)}"
f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
)
if write_out:
for inst in task.instances:
# print the prompt for the first few documents
if inst.doc_id < 1:
eval_logger.info(
f"Task: {task_name}; document {inst.doc_id}; context prompt (starting on next line):\
\n{inst.args[0]}\n(end of prompt on previous line)\ntarget string or answer choice index (starting on next line):\n{task.doc_to_target(inst.doc)}\n(end of target on previous line)"
)
eval_logger.info(f"Request: {str(inst)}")
print_writeout(task)
# aggregate Instances by LM method requested to get output.
for instance in task.instances:
reqtype = instance.request_type
......@@ -408,7 +349,7 @@ def evaluate(
lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
)
# compute number of pseudobatches to pad with (FSDP/DDP require even batches among ranks)
# compute number of pseudo-batches to pad with (FSDP/DDP require even batches among ranks)
numpad = max(gathered_item) - gathered_item[lm.rank]
padding_requests[task.OUTPUT_TYPE] += numpad
......@@ -435,42 +376,33 @@ def evaluate(
if lm.world_size > 1:
lm.accelerator.wait_for_everyone()
RANK = lm.rank
WORLD_SIZE = lm.world_size
### Postprocess outputs ###
# TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
for task_name, task in task_dict.items():
if isinstance(task, tuple):
group, task = task
if task is None:
continue
for task_output in eval_tasks:
task = task_output.task
task.apply_filters()
### Collect values of metrics on all datapoints ###
vals = collections.defaultdict(list)
# unpack results and sort back in order and return control to Task
for task_name, task in task_dict.items():
if isinstance(task, tuple):
group, task = task
if task is None:
continue
### Collect values of metrics on all datapoints ###
# # unpack results and sort back in order and return control to Task
# TODO: make it possible to use a different metric per filter
# Pre-process task.instances to group by doc_id
instances_by_doc_id = collections.defaultdict(list)
for instance in task.instances:
instances_by_doc_id[instance.doc_id].append(instance)
# Sort instances within each group
for instances in instances_by_doc_id.values():
instances.sort(key=lambda x: x.idx)
# iterate over different filters used
for key in task.instances[0].filtered_resps.keys():
doc_iterator = (
itertools.islice(
enumerate(task.test_docs()), lm.rank, limit, lm.world_size
)
if task.has_test_docs()
else itertools.islice(
enumerate(task.validation_docs()), lm.rank, limit, lm.world_size
)
for filter_key in task.instances[0].filtered_resps.keys():
doc_iterator = task.doc_iterator(
rank=RANK, limit=limit, world_size=WORLD_SIZE
)
for doc_id, doc in doc_iterator:
# subset instances to only this document id ; sort by idx
requests = list(filter(lambda x: x.doc_id == doc_id, task.instances))
requests.sort(key=lambda x: x.idx)
requests = instances_by_doc_id[doc_id]
metrics = task.process_results(
doc, [req.filtered_resps[key] for req in requests]
doc, [req.filtered_resps[filter_key] for req in requests]
)
if log_samples:
target = task.doc_to_target(doc)
......@@ -480,93 +412,56 @@ def evaluate(
"target": target,
"arguments": [req.args for req in requests],
"resps": [req.resps for req in requests],
"filtered_resps": [req.filtered_resps[key] for req in requests],
"filtered_resps": [
req.filtered_resps[filter_key] for req in requests
],
}
example.update(metrics)
samples[task_name].append(example)
task_output.logged_samples.append(example)
for metric, value in metrics.items():
vals[(task_name, key, metric)].append(value)
task_output.sample_metrics[(metric, filter_key)].append(value)
if lm.world_size > 1:
# if multigpu, then gather data across all ranks
if WORLD_SIZE > 1:
# if multigpu, then gather data across all ranks to rank 0
# first gather logged samples across all ranks
for task_name, task_samples in list(samples.items()):
full_samples = [None] * lm.world_size
torch.distributed.all_gather_object(full_samples, task_samples)
samples[task_name] = list(itertools.chain.from_iterable(full_samples))
# then collect metrics across all ranks
vals_torch = collections.defaultdict(list)
for (task_name, key, metric), items in vals.items():
numitem = 0
if isinstance(items[0], tuple):
numitem = len(items[0])
if isinstance(items[0], (str, list, tuple)):
# handle the string case
gathered_items = [None] * lm.accelerator.num_processes
torch.distributed.all_gather_object(gathered_items, items)
gathered_item = list(itertools.chain.from_iterable(gathered_items))
else:
# distributed gather requires all ranks to have same dimensions
# so we pad out with float32 min value
pad_value = torch.finfo(torch.float32).min
metrics_tensor = torch.tensor(items, device=lm.device)
original_dtype = metrics_tensor.dtype # store original dtype
torch_device_tensor = lm.accelerator.pad_across_processes(
metrics_tensor.to(torch.float32), pad_index=pad_value
for task_output in eval_tasks:
if log_samples:
# for task_name, task_samples in list(samples.items()):
full_samples = [None] * WORLD_SIZE if RANK == 0 else None
torch.distributed.gather_object(
obj=task_output.logged_samples,
object_gather_list=full_samples,
dst=0,
)
gathered_item = lm.accelerator.gather(torch_device_tensor)
if numitem > 0:
gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
else:
gathered_filtered = gathered_item[gathered_item != pad_value]
if RANK == 0:
task_output.logged_samples = list(
itertools.chain.from_iterable(full_samples)
)
gathered_item = (
gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
# then collect metrics across all ranks
for metrics in task_output.sample_metrics:
metric_list = [None] * WORLD_SIZE if RANK == 0 else None
torch.distributed.gather_object(
obj=task_output.sample_metrics[metrics],
object_gather_list=metric_list,
dst=0,
)
# reconvert if we were passed a tuple of values
if numitem > 0:
gathered_item = [tuple(g) for g in gathered_item]
if lm.rank == 0:
vals_torch[(task_name, key, metric)] = gathered_item
vals = vals_torch
if RANK == 0:
task_output.sample_metrics[metrics] = list(
itertools.chain.from_iterable(metric_list)
)
if lm.rank == 0:
if RANK == 0:
### Aggregate results over all datapoints ###
# aggregate results ; run bootstrap CIs
for (task_name, key, metric), items in vals.items():
task = task_dict[task_name]
group_name, task = task if isinstance(task, tuple) else (None, task)
metric_key = f"{metric},{key}"
agg_fn = task.aggregation()[metric]
results[task_name][metric_key] = agg_fn(items)
results[task_name]["samples"] = len(items)
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them less iterations. still looking for a cleaner way to do this
if bootstrap_iters > 0:
stderr_fn = lm_eval.api.metrics.stderr_for_metric(
metric=agg_fn,
bootstrap_iters=(
min(bootstrap_iters, 100)
if metric in ["bleu", "chrf", "ter"]
else bootstrap_iters
),
)
results[task_name][f"{metric}_stderr,{key}"] = (
stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A"
)
for task_output in eval_tasks:
task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters)
results, samples, configs, versions, num_fewshot = consolidate_results(
eval_tasks
)
### Calculate group metrics ###
if bool(results):
for group, task_list in reversed(task_hierarchy.items()):
if len(task_list) == 0:
......@@ -575,19 +470,33 @@ def evaluate(
# or `task_name: []`.
# we only want to operate on groups here.
continue
for metric in [
key
for key in results[task_list[0]].keys()
if "_stderr" not in key and key not in ["alias", "samples"]
]: # TODO: what if tasks don't all share the same metrics
metric_list = list(
{
key
for task in task_list
for key in results[task].keys()
if "_stderr" not in key and key not in ["alias", "samples"]
}
)
for metric in metric_list:
stderr = "_stderr,".join(metric.split(","))
# gather metrics, sizes, and stderrs from subtasks
metrics = [
results[task][metric] for task in task_list
results[task][metric]
for task in task_list
if metric in results[task]
] # TODO: copy?
stderrs = [results[task][stderr] for task in task_list]
sizes = [results[task]["samples"] for task in task_list]
stderrs = [
results[task][stderr]
for task in task_list
if stderr in results[task]
]
sizes = [
results[task]["samples"]
for task in task_list
if metric in results[task]
]
# compute group's pooled metric and stderr
results[group][
......@@ -606,60 +515,6 @@ def evaluate(
results[group]["samples"] = sum(sizes)
def print_tasks(task_hierarchy, results, tab=0):
results_agg = collections.defaultdict(dict)
groups_agg = collections.defaultdict(dict)
(group_name, task_list), *_ = task_hierarchy.items()
task_list = sorted(task_list)
results_agg[group_name] = results[group_name].copy()
# results_agg[group_name]["tab"] = tab
if "samples" in results_agg[group_name]:
results_agg[group_name].pop("samples")
tab_string = " " * tab + "- " if tab > 0 else ""
if "alias" in results_agg[group_name]:
results_agg[group_name]["alias"] = (
tab_string + results_agg[group_name]["alias"]
)
else:
results_agg[group_name]["alias"] = tab_string + group_name
if len(task_list) > 0:
groups_agg[group_name] = results[group_name].copy()
# groups_agg[group_name]["tab"] = tab
if "samples" in groups_agg[group_name]:
groups_agg[group_name].pop("samples")
if "alias" in groups_agg[group_name]:
groups_agg[group_name]["alias"] = (
tab_string + groups_agg[group_name]["alias"]
)
else:
groups_agg[group_name]["alias"] = tab_string + group_name
for task_name in task_list:
if task_name in task_hierarchy:
_task_hierarchy = {
**{task_name: task_hierarchy[task_name]},
**task_hierarchy,
}
else:
_task_hierarchy = {
**{task_name: []},
**task_hierarchy,
}
_results_agg, _groups_agg = print_tasks(
_task_hierarchy, results, tab + 1
)
results_agg = {**results_agg, **_results_agg}
groups_agg = {**groups_agg, **_groups_agg}
return results_agg, groups_agg
results_agg = collections.defaultdict(dict)
groups_agg = collections.defaultdict(dict)
all_tasks_list = list(task_hierarchy.keys())
......@@ -673,7 +528,7 @@ def evaluate(
_task_hierarchy = {
k: v for k, v in task_hierarchy.items() if k in left_tasks_list
}
_results_agg, _groups_agg = print_tasks(_task_hierarchy, results)
_results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results)
results_agg = {**results_agg, **_results_agg}
groups_agg = {**groups_agg, **_groups_agg}
......
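One behavioural point of the hunks above: `torch.distributed.all_gather_object`, which replicated per-sample logs and metric lists on every rank, is replaced by `torch.distributed.gather_object`, which collects them only on rank 0. A minimal sketch of that pattern, assuming an already-initialized process group (e.g. under `torchrun`); `gather_to_rank0` is an illustrative helper, not code from this PR.

```python
import itertools

import torch.distributed as dist


def gather_to_rank0(local_items, rank: int, world_size: int):
    # object_gather_list must be provided on the destination rank only;
    # every other rank passes None and receives None back.
    buckets = [None] * world_size if rank == 0 else None
    dist.gather_object(obj=local_items, object_gather_list=buckets, dst=0)
    if rank == 0:
        return list(itertools.chain.from_iterable(buckets))
    return None
```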
import collections
import math
import pathlib
import sys
from typing import Dict, List, Optional, Tuple, Union
from lm_eval.api import metrics
from lm_eval.utils import eval_logger, positional_deprecated
class TaskOutput:
"""
Wrapper class for Task outputs. It contains various attributes and methods to manage and calculate metrics for the task.
Attributes:
task (object): The task object.
task_name (str): The name of the task.
task_config (dict): The configuration of the task.
version (str): The version of the task.
group_name (str): The name of the task group.
n_shot (int): The number of shots for the task.
task_alias (str): The alias of the task.
group_alias (str): The alias of the task group.
is_group (bool): Indicates if the task is a group.
logged_samples (list): The list of logged samples.
sample_len (int): The length of the samples.
sample_metrics (defaultdict): The dictionary of samples' metrics.
agg_metrics (defaultdict): The dictionary of aggregate metrics.
Methods:
from_taskdict(cls, task_name: str, task):
Creates a TaskOutput instance from a task dictionary.
calculate_aggregate_metric(bootstrap_iters=100000) -> None:
Calculates the aggregate metrics for the task.
"""
def __init__(
self,
task=None,
task_name=None,
task_config=None,
version=None,
group_name=None,
n_shot=None,
task_alias=None,
group_alias=None,
is_group=None,
):
self.task = task
self.task_config = task_config
self.task_name = task_name
self.group_name = group_name
self.version = version
self.n_shot = n_shot
self.task_alias = task_alias
self.group_alias = group_alias
self.is_group = is_group
self.logged_samples = []
self.sample_len = None
self.sample_metrics = collections.defaultdict(list)
self.agg_metrics = collections.defaultdict(list)
@classmethod
def from_taskdict(cls, task_name: str, task):
if isinstance(task, tuple):
group_name, task = task
else:
group_name = None
if not task:
# these get filtered out in get_task_list
# once they are added to group hierarchy
is_group = True
return cls(
task=task, task_name=task_name, is_group=is_group, group_name=group_name
)
version = task.VERSION
task_config = dict(task.dump_config())
if (n_shot := task_config.get("num_fewshot")) == 0:
n_shot = task_config.get("metadata", {}).get("num_fewshot", 0)
task_alias = task_config.get("alias")
group_alias = task_config.get("group_alias")
return cls(
task=task,
task_name=task_name,
task_config=task_config,
group_name=group_name,
version=version,
n_shot=n_shot,
task_alias=task_alias,
group_alias=group_alias,
)
def calculate_aggregate_metric(self, bootstrap_iters=100000) -> None:
for (metric, filter_key), items in self.sample_metrics.items():
agg_fn = self.task.aggregation()[metric]
metric_key = f"{metric},{filter_key}"
self.agg_metrics[metric_key] = agg_fn(items)
self.sample_len = len(items) # TODO: same sample size for each metric?
if bootstrap_iters:
stderr_fn = metrics.stderr_for_metric(
metric=agg_fn,
bootstrap_iters=min(bootstrap_iters, 100)
if metric in ["bleu", "chrf", "ter"]
else bootstrap_iters,
)
self.agg_metrics[f"{metric}_stderr,{filter_key}"] = (
stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A"
)
def __repr__(self):
return (
f"TaskOutput(task_name={self.task_name}, "
f"group_name={self.group_name}, "
f"version={self.version},"
f"n_shot={self.n_shot}"
f"task_alias={self.task_alias}, group_alias={self.group_alias})"
)
def get_task_list(task_dict: dict) -> Tuple[Dict[str, list], List[TaskOutput]]:
task_hierarchy = collections.defaultdict(list)
outputs = list(TaskOutput.from_taskdict(x, y) for x, y in task_dict.items())
for task_output in outputs:
if group_name := task_output.group_name:
task_hierarchy[group_name].append(task_output.task_name)
else:
task_hierarchy[task_output.task_name] = []
# returns task_hierarchy tracking which groups contain which subtasks,
# and a list of TaskOutput classes for each non-group subtask
return task_hierarchy, [x for x in outputs if x.task]
def print_writeout(task) -> None:
for inst in task.instances:
# print the prompt for the first few documents
if inst.doc_id < 1:
eval_logger.info(
f"Task: {task}; document {inst.doc_id}; context prompt (starting on next line):\
\n{inst.args[0]}\n(end of prompt on previous line)\ntarget string or answer choice index (starting on next line):\n{task.doc_to_target(inst.doc)}\n(end of target on previous line)"
)
eval_logger.info(f"Request: {str(inst)}")
def get_sample_size(task, limit: Optional[int]) -> Union[int, None]:
if limit is not None:
limit = (
int(math.ceil(len(task.eval_docs) * limit)) if limit < 1.0 else int(limit)
)
return limit
def prepare_print_tasks(
task_hierarchy: dict, results: dict, tab=0
) -> Tuple[dict, dict]:
"""
@param task_hierarchy: Dictionary representing the group hierarchy of tasks. Each key is a group name and its
value is a list of task names.
@param results: Dictionary containing the results of each task. Each key is a
group name and its value is a dictionary of task results.
@param tab: The indentation level for printing the task
hierarchy. Default is 0.
@return: A tuple of two dictionaries: results_agg and groups_agg. results_agg contains
aggregated results for each task, and groups_agg contains aggregated results for each group.
Prepares the task hierarchy and aggregates the results for each task and group recursively for printing.
"""
results_agg = collections.defaultdict(dict)
groups_agg = collections.defaultdict(dict)
(group_name, task_list), *_ = task_hierarchy.items()
task_list = sorted(task_list)
results_agg[group_name] = results[group_name].copy()
# results_agg[group_name]["tab"] = tab
if "samples" in results_agg[group_name]:
results_agg[group_name].pop("samples")
tab_string = " " * tab + "- " if tab > 0 else ""
if "alias" in results_agg[group_name]:
results_agg[group_name]["alias"] = tab_string + results_agg[group_name]["alias"]
else:
results_agg[group_name]["alias"] = tab_string + group_name
if len(task_list) > 0:
groups_agg[group_name] = results[group_name].copy()
# groups_agg[group_name]["tab"] = tab
if "samples" in groups_agg[group_name]:
groups_agg[group_name].pop("samples")
if "alias" in groups_agg[group_name]:
groups_agg[group_name]["alias"] = (
tab_string + groups_agg[group_name]["alias"]
)
else:
groups_agg[group_name]["alias"] = tab_string + group_name
for task_name in task_list:
if task_name in task_hierarchy:
_task_hierarchy = {
**{task_name: task_hierarchy[task_name]},
**task_hierarchy,
}
else:
_task_hierarchy = {
**{task_name: []},
**task_hierarchy,
}
_results_agg, _groups_agg = prepare_print_tasks(
_task_hierarchy, results, tab + 1
)
results_agg = {**results_agg, **_results_agg}
groups_agg = {**groups_agg, **_groups_agg}
return results_agg, groups_agg
def consolidate_results(
eval_tasks: List[TaskOutput],
) -> Tuple[dict, dict, dict, dict, dict]:
"""
@param eval_tasks: list(TaskOutput).
@return: A tuple containing the consolidated results, samples, configs, versions, and num_fewshot.
Consolidates the results of multiple evaluation tasks into a single structure.
The method iterates over each evaluation instance and extracts relevant information to create the consolidated
results structure. The consolidated results structure has the following properties:
- results: A defaultdict with task names as keys and dictionaries as values. Each dictionary contains
metric/filter pairs as keys and corresponding metric values as values. The "alias" key is used to store task
aliases specified in the task configuration.
- samples: A defaultdict with task names as keys and lists of log samples as values.
- configs: A defaultdict with task names as keys and task configurations as values.
- versions: A defaultdict with task names as keys and task versions as values.
- num_fewshot: A defaultdict with task names as keys and number of few-shot samples as values.
The method then returns the consolidated results, samples, configs, versions, and num_fewshot as a tuple.
"""
# stores the final result for each task, for each metric/filter pair.
results = collections.defaultdict(dict)
# logs info about each document evaluated.
samples = collections.defaultdict(list)
# store num-fewshot value per task
num_fewshot = collections.defaultdict(int)
# Tracks the YAML configs of all chosen task
configs = collections.defaultdict(dict)
# Tracks each task's version.
versions = collections.defaultdict(dict)
for task_output in eval_tasks:
if "task_alias" in (task_config := task_output.task_config):
results[task_output.task_name]["alias"] = task_config["task_alias"]
if group_alias := task_output.group_alias:
if group_alias not in results and (group_name := task_output.group_name):
results[group_name]["alias"] = group_alias
num_fewshot[task_output.task_name] = task_output.n_shot
configs[task_output.task_name] = task_output.task_config
versions[task_output.task_name] = task_output.version
samples[task_output.task_name] = task_output.logged_samples
for (metric, filter_key), items in task_output.sample_metrics.items():
metric_key = f"{metric},{filter_key}"
results[task_output.task_name][metric_key] = task_output.agg_metrics[
metric_key
]
results[task_output.task_name]["samples"] = task_output.sample_len
results[task_output.task_name][
f"{metric}_stderr,{filter_key}"
] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
return results, samples, configs, versions, num_fewshot
@positional_deprecated
def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
"""
Search upward in the directory tree to a maximum of three layers
to find and return the package root (containing the 'tests' folder)
"""
cur_path = start_path.resolve()
max_layers = 3
for _ in range(max_layers):
if (cur_path / "tests" / "test_version_stable.py").exists():
return cur_path
else:
cur_path = cur_path.parent.resolve()
raise FileNotFoundError(
f"Unable to find package root within {max_layers} upwards" + f"of {start_path}"
)
@positional_deprecated
def run_task_tests(task_list: List[str]):
"""
Find the package root and run the tests for the given tasks
"""
import pytest
package_root = find_test_root(start_path=pathlib.Path(__file__))
task_string = " or ".join(task_list)
args = [
f"{package_root}/tests/test_version_stable.py",
f"--rootdir={package_root}",
"-k",
f"{task_string}",
]
sys.path.append(str(package_root))
pytest_return_val = pytest.main(args)
if pytest_return_val:
raise ValueError(
f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
)
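A small illustration of the `get_sample_size` ceiling behaviour referenced by the "add ceil to limit" commit above; the `SimpleNamespace` object is a hypothetical stand-in that only exists so `len(task.eval_docs)` has something to measure.

```python
from types import SimpleNamespace

from lm_eval.evaluator_utils import get_sample_size

fake_task = SimpleNamespace(eval_docs=list(range(7)))  # pretend 7 eval docs

print(get_sample_size(fake_task, 0.1))   # fractional limit: ceil(7 * 0.1) = 1, not 0
print(get_sample_size(fake_task, 3))     # integer limits pass through unchanged -> 3
print(get_sample_size(fake_task, None))  # None means "evaluate everything" -> None
```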
......@@ -921,7 +921,11 @@ class HFLM(TemplateLM):
)
chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn)
pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
pbar = tqdm(
total=len(requests),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running loglikelihood requests",
)
for chunk in chunks:
inps = []
cont_toks_list = []
......@@ -1089,7 +1093,11 @@ class HFLM(TemplateLM):
toks = self.tok_encode(req[0])
return -len(toks), req[0]
pbar = tqdm(total=len(requests), disable=(self.rank != 0))
pbar = tqdm(
total=len(requests),
disable=(self.rank != 0),
desc="Running generate_until requests",
)
adaptive_batch_size = None
if self.batch_size == "auto":
# using rolling window with maximum context
......
......@@ -254,7 +254,11 @@ class VLLM(TemplateLM):
n=int(self.batch_size) if self.batch_size != "auto" else 0, batch_fn=None
)
pbar = tqdm(total=len(requests), disable=(self.rank != 0))
pbar = tqdm(
total=len(requests),
disable=(self.rank != 0),
desc="Running generate_until requests",
)
# for each different set of kwargs, we execute all requests, by batch.
for chunk in chunks:
context_and_encoding, all_gen_kwargs = zip(*chunk)
......@@ -329,7 +333,11 @@ class VLLM(TemplateLM):
n=int(self.batch_size) if self.batch_size != "auto" else 0, batch_fn=None
)
pbar = tqdm(total=len(requests), disable=disable_tqdm)
pbar = tqdm(
total=len(requests),
disable=disable_tqdm,
desc="Running loglikelihood requests",
)
for chunk in chunks:
inputs = []
ctxlens = []
......
......@@ -6,9 +6,7 @@ import inspect
import logging
import os
import re
import sys
from itertools import islice
from pathlib import Path
from typing import Any, Callable, List
import numpy as np
......@@ -244,7 +242,7 @@ def make_table(result_dict, column: str = "results"):
values = []
for k, dic in result_dict[column].items():
version = result_dict["versions"][k]
version = result_dict["versions"].get(k, "N/A")
n = str(result_dict["n-shot"][k])
if "alias" in dic:
......@@ -292,47 +290,6 @@ def positional_deprecated(fn):
return _wrapper
@positional_deprecated
def find_test_root(start_path: Path) -> Path:
"""
Search upward in the directory tree to a maximum of three layers
to find and return the package root (containing the 'tests' folder)
"""
cur_path = start_path.resolve()
max_layers = 3
for _ in range(max_layers):
if (cur_path / "tests" / "test_version_stable.py").exists():
return cur_path
else:
cur_path = cur_path.parent.resolve()
raise FileNotFoundError(
f"Unable to find package root within {max_layers} upwards" + f"of {start_path}"
)
@positional_deprecated
def run_task_tests(task_list: List[str]):
"""
Find the package root and run the tests for the given tasks
"""
import pytest
package_root = find_test_root(start_path=Path(__file__))
task_string = " or ".join(task_list)
args = [
f"{package_root}/tests/test_version_stable.py",
f"--rootdir={package_root}",
"-k",
f"{task_string}",
]
sys.path.append(str(package_root))
pytest_return_val = pytest.main(args)
if pytest_return_val:
raise ValueError(
f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
)
def ignore_constructor(loader, node):
return node
......@@ -414,16 +371,10 @@ def apply_template(template: str, doc: dict) -> str:
return rtemplate.render(**doc)
def create_iterator(raw_iterator, rank, world_size, limit=None):
def create_iterator(raw_iterator, *, rank=0, world_size=1, limit=None):
"""
Method for creating a (potentially) sliced and limited
iterator from a raw document iterator. Used for splitting data
among ranks in multigpu setting or only pulling a sample of documents
"""
return islice(raw_iterator, rank, limit, world_size)
# Multi-token stopping criteria
# from more_itertools
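With `create_iterator` now keyword-only and defaulting to a single process, a minimal call looks like the sketch below (assuming `lm_eval.utils` is importable; the sample data is made up). Each rank starts at its own offset and strides by `world_size`, stopping once the underlying position reaches `limit`.

```python
from lm_eval.utils import create_iterator

docs = ["a", "b", "c", "d", "e"]

# rank 0 of 2 takes positions 0, 2, 4
print(list(create_iterator(enumerate(docs), rank=0, world_size=2)))
# -> [(0, 'a'), (2, 'c'), (4, 'e')]

# rank 1 of 2, capped at position 4
print(list(create_iterator(enumerate(docs), rank=1, world_size=2, limit=4)))
# -> [(1, 'b'), (3, 'd')]
```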