"official/projects/yolo/modeling/factory.py" did not exist on "9c060b0cd1d58c6617718f5f3ec425ded0d49a4b"
Unverified Commit 9822b06e authored by Lintang Sutawika, committed by GitHub

Merge branch 'main' into weight_by_size

parents 51f27158 b177c82c
......@@ -51,7 +51,9 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
for lang in LANGUAGES:
file_name = f"xwinograd_{lang}.yaml"
try:
with open(f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf-8") as f:
with open(
f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf-8"
) as f:
f.write("# Generated by utils.py\n")
yaml.dump(
{
......
import collections
import fnmatch
import functools
import gc
import importlib.util
import inspect
import logging
import os
import pathlib
import re
import subprocess
import sys
import time
from functools import wraps
from itertools import islice
from typing import (
Any,
Callable,
Iterable,
Iterator,
List,
Literal,
Optional,
Tuple,
Type,
Union,
)
from typing import Any, Callable, List
import torch
import transformers
import numpy as np
import yaml
from jinja2 import BaseLoader, Environment, StrictUndefined
......@@ -99,44 +81,6 @@ def join_iters(iters):
yield from iter
def chunks(iter, n: int = 0, fn=None):
"""
Divides an iterable into chunks of specified size or based on a given function.
Useful for batching
Parameters:
- iter: The input iterable to be divided into chunks.
- n: An integer representing the size of each chunk. Default is 0.
- fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.
Returns:
An iterator that yields chunks of the input iterable.
Example usage:
```
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for chunk in chunks(data, 3):
print(chunk)
```
Output:
```
[1, 2, 3]
[4, 5, 6]
[7, 8, 9]
[10]
```
"""
arr = []
for i, x in enumerate(iter):
arr.append(x)
if len(arr) == (fn(i, iter) if fn else n):
yield arr
arr = []
if arr:
yield arr
def group(arr, fn):
res = collections.defaultdict(list)
......@@ -146,25 +90,6 @@ def group(arr, fn):
return list(res.values())
class MultiChoice:
def __init__(self, choices) -> None:
self.choices = choices
# Simple wildcard support (linux filename patterns)
def __contains__(self, values) -> bool:
for value in values.split(","):
if len(fnmatch.filter(self.choices, value)) == 0:
eval_logger.info("Available tasks to choose:")
for choice in self.choices:
eval_logger.info(f" - {choice}")
raise ValueError("'{}' is not in task list".format(value))
return True
def __iter__(self) -> Iterator:
for choice in self.choices:
yield choice
# Returns a list containing all values of the source_list that
# match at least one of the patterns
def pattern_match(patterns, source_list):
......@@ -178,6 +103,12 @@ def pattern_match(patterns, source_list):
return sorted(list(task_names))
def softmax(x):
"""Compute softmax values for each sets of scores in x."""
e_x = np.exp(x - np.max(x))
return e_x / e_x.sum()
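# Worked example (illustrative, not part of the library): softmax([1.0, 2.0, 3.0])
# is roughly [0.090, 0.245, 0.665]. Subtracting np.max(x) before exponentiating only
# rescales numerator and denominator by the same factor, so it changes nothing except
# numerical stability for large scores.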
def general_detokenize(string):
string = string.replace(" n't", "n't")
string = string.replace(" )", ")")
......@@ -283,64 +214,6 @@ class Reorderer:
return res
class Grouper:
"""
takes an array `arr` and function `fn` and returns a dictionary
with keys fn(ob) for each ob in `arr` and with values `self.arr[key]` a list of all
objects in `arr` satisfying `key == fn(ob)`.
"""
def __init__(self, arr, fn) -> None:
# self.orig_arr = arr
self.size = len(arr)
arr = list(enumerate(arr))
def group_return_dict(arr, fn):
res = collections.defaultdict(list)
for ob in arr:
res[fn(ob)].append(ob)
return res
arr = group_return_dict(arr, lambda x: fn(x[1]))
# self.arr has format Dict[Tuple[int, <entry from orig. arr>]]
self.arr = arr
self._grouped = None
def get_grouped(self):
# return the contents but not indices for our grouped dict.
if self._grouped:
return self._grouped
grouped = {}
for key in self.arr.keys():
# drop the index from each element of self.arr
grouped[key] = [y[1] for y in self.arr[key]]
self._grouped = grouped
return grouped
def get_original(self, grouped_dict):
# take in a grouped dictionary with e.g. results for each key listed
# in the same order as the instances in `self.arr`, and
# return the results in the same (single list) order as `self.orig_arr`.
res = [None] * self.size
cov = [False] * self.size
# orig = [None] * self.size
assert grouped_dict.keys() == self.arr.keys()
for key in grouped_dict.keys():
for (ind, _), v in zip(self.arr[key], grouped_dict[key]):
res[ind] = v
cov[ind] = True
# orig[ind] = _
assert all(cov)
# assert orig == self.orig_arr
return res
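# Hedged usage sketch for Grouper (values are made up, not taken from the library):
#   g = Grouper(["a", "bb", "c", "ddd"], fn=len)
#   g.get_grouped()  # {1: ["a", "c"], 2: ["bb"], 3: ["ddd"]}
#   g.get_original({1: [10, 30], 2: [20], 3: [40]})  # [10, 20, 30, 40]
# i.e. per-group results are scattered back into the order of the original array.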
def make_table(result_dict, column: str = "results"):
"""Generate table of results."""
from pytablewriter import LatexTableWriter, MarkdownTableWriter
......@@ -369,7 +242,7 @@ def make_table(result_dict, column: str = "results"):
values = []
for k, dic in result_dict[column].items():
version = result_dict["versions"][k]
version = result_dict["versions"].get(k, "N/A")
n = str(result_dict["n-shot"][k])
if "alias" in dic:
......@@ -417,59 +290,8 @@ def positional_deprecated(fn):
return _wrapper
@positional_deprecated
def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
"""
Search upward in the directory tree to a maximum of three layers
to find and return the package root (containing the 'tests' folder)
"""
cur_path = start_path.resolve()
max_layers = 3
for _ in range(max_layers):
if (cur_path / "tests" / "test_version_stable.py").exists():
return cur_path
else:
cur_path = cur_path.parent.resolve()
raise FileNotFoundError(
f"Unable to find package root within {max_layers} upwards" + f"of {start_path}"
)
@positional_deprecated
def run_task_tests(task_list: List[str]):
"""
Find the package root and run the tests for the given tasks
"""
import pytest
package_root = find_test_root(start_path=pathlib.Path(__file__))
task_string = " or ".join(task_list)
args = [
f"{package_root}/tests/test_version_stable.py",
f"--rootdir={package_root}",
"-k",
f"{task_string}",
]
sys.path.append(str(package_root))
pytest_return_val = pytest.main(args)
if pytest_return_val:
raise ValueError(
f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
)
def get_git_commit_hash():
"""
Gets the git commit hash of your current repo (if it exists).
Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
"""
try:
git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
git_hash = git_hash.decode()
except (subprocess.CalledProcessError, FileNotFoundError):
# FileNotFoundError occurs when git not installed on system
git_hash = None
return git_hash
def ignore_constructor(loader, node):
return node
def import_function(loader, node):
......@@ -489,11 +311,14 @@ def import_function(loader, node):
return function
# Add the import_function constructor to the YAML loader
yaml.add_constructor("!function", import_function)
def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None, mode="full"):
if mode == "simple":
constructor_fn = ignore_constructor
elif mode == "full":
constructor_fn = import_function
def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
# Add the import_function constructor to the YAML loader
yaml.add_constructor("!function", constructor_fn)
if yaml_config is None:
with open(yaml_path, "rb") as file:
yaml_config = yaml.full_load(file)
......@@ -521,7 +346,7 @@ def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
path = os.path.join(yaml_dir, path)
try:
included_yaml_config = load_yaml_config(path)
included_yaml_config = load_yaml_config(yaml_path=path, mode=mode)
final_yaml_config.update(included_yaml_config)
except Exception as ex:
# If failed to load, ignore
......@@ -546,389 +371,10 @@ def apply_template(template: str, doc: dict) -> str:
return rtemplate.render(**doc)
def create_iterator(raw_iterator, rank, world_size, limit=None):
def create_iterator(raw_iterator, *, rank=0, world_size=1, limit=None):
"""
Method for creating a (potentially) sliced and limited
iterator from a raw document iterator. Used for splitting data
among ranks in multigpu setting or only pulling a sample of documents
"""
return islice(raw_iterator, rank, limit, world_size)
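# Illustrative behaviour (example values only): documents are interleaved across ranks,
#   list(create_iterator(iter(range(10)), rank=0, world_size=2))  # [0, 2, 4, 6, 8]
#   list(create_iterator(iter(range(10)), rank=1, world_size=2))  # [1, 3, 5, 7, 9]
# and `limit` is an absolute stop index into the raw iterator, not a per-rank count.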
def pad_and_concat(
max_length: int,
tensors: List[torch.Tensor],
padding_side: Literal["right", "left"] = "right",
):
"""
Method for padding a list of tensors given the maximum tensor
length in the batch. Used for batching inputs and continuations in
seq2seq models.
"""
assert (
padding_side == "left" or padding_side == "right"
), f"Unrecognized padding type: '{padding_side}' not 'left' or 'right'"
for i, tensor in enumerate(tensors):
if len(tensor.shape) == 2:
tensor = tensor.squeeze(0) # squeeze, in case passed [1, seq] size
tensor_len = tensor.shape[0]
if tensor_len < max_length:
if padding_side == "right":
# right-pad
tensors[i] = torch.cat(
[
tensor, # [seq]
torch.zeros(
max_length - tensor_len,
dtype=torch.long,
device=tensor.device,
), # [padding_length - seq]
],
dim=0,
).unsqueeze(0)
else:
# left-pad
tensors[i] = torch.cat(
[
torch.zeros(
max_length - tensor_len,
dtype=torch.long,
device=tensor.device,
), # [padding_length - seq]
tensor, # [seq]
],
dim=0,
).unsqueeze(0)
else:
tensors[i] = tensor.unsqueeze(0)
return torch.cat(tensors, dim=0)
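# Rough usage sketch (tensor values are illustrative; the helper pads with zeros):
#   a, b = torch.tensor([1, 2, 3]), torch.tensor([4, 5])
#   pad_and_concat(3, [a, b], padding_side="right")
#   # tensor([[1, 2, 3],
#   #         [4, 5, 0]])
# With padding_side="left" the second row would instead be [0, 4, 5].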
def clear_torch_cache() -> None:
gc.collect()
torch.cuda.empty_cache()
def get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
"""Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
if isinstance(dtype, str) and dtype != "auto":
# Convert `str` args torch dtype: `float16` -> `torch.float16`
_torch_dtype = getattr(torch, dtype)
else:
_torch_dtype = dtype
return _torch_dtype
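# e.g. get_dtype("float16") -> torch.float16, get_dtype("auto") -> "auto",
# and a torch.dtype passed in is returned unchanged.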
# Multi-token stopping criteria
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
"""Criteria to stop on the specified multi-token sequence."""
def __init__(
self,
sequence: str,
tokenizer: transformers.PreTrainedTokenizer,
initial_decoder_input_length: int,
batch_size: int,
) -> None:
self.initial_decoder_input_length = initial_decoder_input_length
self.done_tracker = [False] * batch_size
self.sequence = sequence
self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
# print(sequence, self.sequence_ids)
# we look back for 2 more tokens than it takes to encode our stop sequence
# because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']`
# and we don't want to mistakenly not stop a generation because our
# (string) stop sequence was output in a different tokenization
# NOTE: there is a minor danger that this will end up looking back 2 tokens into the past, into the inputs to the model,
# and stopping generation immediately as a result. With only 2 extra tokens of lookback, this risk is minimized
# Additionally, in lookback_ids_batch we should prevent ever looking back into the inputs as described.
self.sequence_id_len = len(self.sequence_ids) + 2
self.tokenizer = tokenizer
def __call__(self, input_ids, scores, **kwargs) -> bool:
# For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :]
lookback_ids_batch = lookback_ids_batch[:, -self.sequence_id_len :]
lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
for i, done in enumerate(self.done_tracker):
if not done:
self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
return False not in self.done_tracker
def stop_sequences_criteria(
tokenizer: transformers.PreTrainedTokenizer,
stop_sequences: List[str],
initial_decoder_input_length: int,
batch_size: int,
) -> transformers.StoppingCriteriaList:
return transformers.StoppingCriteriaList(
[
*[
MultiTokenEOSCriteria(
sequence, tokenizer, initial_decoder_input_length, batch_size
)
for sequence in stop_sequences
],
]
)
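# Hedged usage sketch (variable names such as `context` are made up for illustration):
#   stopping_criteria = stop_sequences_criteria(
#       tokenizer, ["\n\n", "Question:"], context.shape[1], context.shape[0]
#   )
#   model.generate(context, stopping_criteria=stopping_criteria)
# One MultiTokenEOSCriteria is built per stop string, and each one signals stop only
# after every row in the batch has produced that string.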
# from more_itertools
def divide(iterable, n) -> List[Iterator]:
"""Divide the elements from *iterable* into *n* parts, maintaining
order.
>>> group_1, group_2 = divide([1, 2, 3, 4, 5, 6], 2)
>>> list(group_1)
[1, 2, 3]
>>> list(group_2)
[4, 5, 6]
If the length of *iterable* is not evenly divisible by *n*, then the
length of the returned iterables will not be identical:
>>> children = divide([1, 2, 3, 4, 5, 6, 7], 3)
>>> [list(c) for c in children]
[[1, 2, 3], [4, 5], [6, 7]]
If the length of the iterable is smaller than n, then the last returned
iterables will be empty:
>>> children = divide([1, 2, 3], 5)
>>> [list(c) for c in children]
[[1], [2], [3], [], []]
This function will exhaust the iterable before returning and may require
significant storage. If order is not important, see :func:`distribute`,
which does not first pull the iterable into memory.
"""
if n < 1:
raise ValueError("n must be at least 1")
try:
iterable[:0]
except TypeError:
seq = tuple(iterable)
else:
seq = iterable
q, r = divmod(len(seq), n)
ret = []
stop = 0
for i in range(1, n + 1):
start = stop
stop += q + 1 if i <= r else q
ret.append(iter(seq[start:stop]))
return ret
def retry_on_specific_exceptions(
on_exceptions: List[Type[Exception]],
max_retries: Optional[int] = None,
backoff_time: float = 3.0,
backoff_multiplier: float = 1.5,
on_exception_callback: Optional[Callable[[Exception, float], Any]] = None,
):
"""Retry on an LLM Provider's rate limit error with exponential backoff
For example, to use for OpenAI, do the following:
```
from openai import RateLimitError
# Recommend specifying max_retries to avoid infinite loops!
@retry_on_specific_exceptions([RateLimitError], max_retries=3)
def completion(...):
# Wrap OpenAI completion function here
...
```
"""
def decorator(func: Callable):
@wraps(func)
def wrapper(*args, **kwargs):
sleep_time = backoff_time
attempt = 0
while max_retries is None or attempt < max_retries:
try:
return func(*args, **kwargs)
except tuple(on_exceptions) as e:
if on_exception_callback is not None:
on_exception_callback(e, sleep_time)
time.sleep(sleep_time)
sleep_time *= backoff_multiplier
attempt += 1
return wrapper
return decorator
class Collator:
"""
A class for reordering and batching elements of an array.
This class allows for sorting an array based on a provided sorting function, grouping elements based on a grouping function, and generating batches from the sorted and grouped data.
"""
def __init__(
self,
arr: List,
sort_fn: Callable,
group_fn: Callable = lambda x: x[1],
grouping: bool = False,
) -> None:
self.grouping = grouping
self.fn = sort_fn
self.group_fn = lambda x: group_fn(x[1]) # x[0] is the enumerated index, so group on the original element x[1]
self.reorder_indices: List = []
self.size = len(arr)
self.arr_with_indices: Iterable[Any] = tuple(enumerate(arr)) # [indices, (arr)]
if self.grouping is True:
self.group_by_index()
def group_by_index(self) -> None:
self.arr_with_indices = self.group(
self.arr_with_indices, fn=self.group_fn, values=False
)
def get_batched(self, n: int = 1, batch_fn: Optional[Callable] = None) -> Iterator:
"""
Generates and yields batches from the reordered array.
Parameters:
- n (int): The size of each batch. Defaults to 1.
- batch_fn (Optional[Callable[[int, Iterable], int]]): A function to determine the size of each batch. Defaults to None.
Yields:
Iterator: An iterator over batches of reordered elements.
"""
if self.grouping:
for (
key,
values,
) in self.arr_with_indices.items(): # type: ignore
values = self._reorder(values)
batch = self.get_chunks(values, n=n, fn=batch_fn)
yield from batch
else:
values = self._reorder(self.arr_with_indices) # type: ignore
batch = self.get_chunks(values, n=n, fn=batch_fn)
yield from batch
def _reorder(self, arr: Union[List, Tuple[Tuple[int, Any], ...]]) -> List:
"""
Reorders the elements in the array based on the sorting function.
Parameters:
- arr (Union[List, Tuple[Tuple[int, Any], ...]]): The array or iterable to be reordered.
Yields:
List: Yields reordered elements one by one.
"""
arr = sorted(arr, key=lambda x: self.fn(x[1]))
self.reorder_indices.extend([x[0] for x in arr])
yield from [x[1] for x in arr]
def get_original(self, newarr: List) -> List:
"""
Restores the original order of elements from the reordered list.
Parameters:
- newarr (List): The reordered array.
Returns:
List: The array with elements restored to their original order.
"""
res = [None] * self.size
cov = [False] * self.size
for ind, v in zip(self.reorder_indices, newarr):
res[ind] = v
cov[ind] = True
assert all(cov)
return res
def __len__(self):
return self.size
@staticmethod
def group(arr: Iterable, fn: Callable, values: bool = False) -> Iterable:
"""
Groups elements of an iterable based on a provided function.
Parameters:
- arr (Iterable): The iterable to be grouped.
- fn (Callable): The function to determine the grouping.
- values (bool): If True, returns the values of the group. Defaults to False.
Returns:
Iterable: An iterable of grouped elements.
"""
res = collections.defaultdict(list)
for ob in arr:
try:
hashable_dict = tuple(
(
key,
tuple(value)
if isinstance(value, collections.abc.Iterable)
else value,
)
for key, value in sorted(fn(ob).items())
)
res[hashable_dict].append(ob)
except TypeError:
res[fn(ob)].append(ob)
if not values:
return res
return res.values()
@staticmethod
def get_chunks(_iter, n: int = 0, fn=None):
"""
Divides an iterable into chunks of specified size or based on a given function.
Useful for batching
Parameters:
- iter: The input iterable to be divided into chunks.
- n: An integer representing the size of each chunk. Default is 0.
- fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.
Returns:
An iterator that yields chunks of the input iterable.
Example usage:
```
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for chunk in chunks(data, 3):
print(chunk)
```
Output:
```
[1, 2, 3]
[4, 5, 6]
[7, 8, 9]
[10]
```
"""
arr = []
_iter = tuple(_iter)
for i, x in enumerate(_iter):
arr.append(x)
if len(arr) == (fn(i, _iter) if fn else n):
yield arr
arr = []
if arr:
yield arr
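# Hedged usage sketch for the Collator above (requests and sort key are made up):
#   reqs = ["bbbb", "a", "ccc", "dd"]
#   c = Collator(reqs, sort_fn=lambda s: -len(s))        # longest requests first
#   batches = list(c.get_batched(n=2))                   # [["bbbb", "ccc"], ["dd", "a"]]
#   flat_results = [len(x) for b in batches for x in b]  # stand-in model outputs: [4, 3, 2, 1]
#   c.get_original(flat_results)                         # [4, 1, 3, 2], back in input order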
......@@ -36,6 +36,8 @@ dependencies = [
"tqdm-multiprocess",
"transformers>=4.1",
"zstandard",
"dill",
"word2number",
]
[tool.setuptools.packages.find]
......@@ -57,7 +59,9 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
anthropic = ["anthropic"]
dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"]
gptq = ["auto-gptq[triton]>=0.6.0"]
hf_transfer = ["hf_transfer"]
ifeval = ["langdetect", "immutabledict"]
neuronx = ["optimum[neuronx]"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
......@@ -68,12 +72,13 @@ sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
vllm = ["vllm<=0.2.5"]
zeno = ["pandas", "zeno-client"]
wandb = ["wandb>=0.16.3", "pandas", "numpy"]
all = [
"lm_eval[anthropic]",
"lm_eval[dev]",
"lm_eval[gptq]",
"lm_eval[hf_transfer]",
"lm_eval[ifeval]",
"lm_eval[linting]",
"lm_eval[mamba]",
"lm_eval[math]",
"lm_eval[multilingual]",
......@@ -83,11 +88,9 @@ all = [
"lm_eval[testing]",
"lm_eval[vllm]",
"lm_eval[zeno]",
"lm_eval[wandb]",
]
[tool.ruff]
extend-exclude = ["lm_eval/evaluator.py", "lm_eval/tasks/*.py"]
[tool.ruff.lint]
extend-select = ["I"]
......@@ -96,5 +99,4 @@ lines-after-imports = 2
known-first-party = ["lm_eval"]
[tool.ruff.extend-per-file-ignores]
"__init__.py" = ["F401","F402","F403","I"]
"lm_eval/tasks/*"= ["E721"]
"__init__.py" = ["F401","F402","F403"]
......@@ -30,4 +30,7 @@ pip install pybind11
c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix)
```
If your compiler isn't linked to python, you may need to add to the above `-undefined dynamic_lookup`
MacOS users: If your compiler isn't linked to Python, you may need to add to the above `-undefined dynamic_lookup`. \
Linux users: If your compiler isn't linked to Python, you may need to follow these steps:
1. Rename the compiled code file to `janitor_util.so`.
2. Before running `import Janitor` in your code, add `sys.path.append("your/relative/path/to/janitor_util.so")` so that Python knows the location of `janitor_util.so`.
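For example, a minimal sketch of step 2, assuming the extension was built next to `janitor_util.cpp` under `lm_eval/decontamination/` (adjust the path to your build location):

```python
import sys

# sys.path entries are directories, so append the folder that
# contains the compiled janitor_util.so, not the .so file itself.
sys.path.append("lm_eval/decontamination")

import janitor_util  # noqa: E402
```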
......@@ -8,6 +8,7 @@ import scipy.stats
import torch
import lm_eval.evaluator
import lm_eval.models.utils
from lm_eval import tasks, utils
......@@ -113,7 +114,7 @@ if __name__ == "__main__":
batch_size=args.batch,
)
memory_stats()
utils.clear_torch_cache()
lm_eval.models.utils.clear_torch_cache()
eval_logger.info("Memory stats cleared")
memory_stats()
results_hf = lm_eval.evaluator.simple_evaluate(
......
"""
Usage:
python requests_caching.py --tasks=comma,separated,list,of,tasks --cache_requests=<true|refresh|delete>
"""
import argparse
import os
from typing import List
import torch
from transformers import (
pipeline as trans_pipeline,
)
from lm_eval import simple_evaluate
from lm_eval.evaluator import request_caching_arg_to_dict
from lm_eval.utils import eval_logger
MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
# Used to specify alternate cache path, useful if run in a docker container
# NOTE raw datasets will break if you try to transfer the cache from your host to a docker image
LM_HARNESS_CACHE_PATH = os.getenv("LM_HARNESS_CACHE_PATH")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL = "EleutherAI/pythia-70m"
TASK = "text-generation"
def run_model_for_task_caching(tasks: List[str], cache_requests: str):
eval_logger.info(f"Loading HF model: {MODEL}")
trans_pipe = trans_pipeline(
task=TASK, model=MODEL, device=DEVICE, trust_remote_code=True
)
model = trans_pipe.model
tokenizer = trans_pipe.tokenizer
eval_logger.info(
f"Running simple_evaluate to cache request objects for tasks: {tasks}"
)
cache_args = request_caching_arg_to_dict(cache_requests=cache_requests)
eval_logger.info(
f"The following operations will be performed on the cache: {cache_requests}"
)
eval_data = simple_evaluate(
model="hf-auto",
model_args={
"pretrained": model,
"tokenizer": tokenizer,
},
limit=1,
device=DEVICE,
tasks=tasks,
write_out=True,
**cache_args,
)
return eval_data
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tasks",
"-t",
default=None,
metavar="task1,task2",
)
parser.add_argument(
"--cache_requests",
type=str,
default=None,
choices=["true", "refresh", "delete"],
help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
)
args = parser.parse_args()
tasks = args.tasks.split(",")
eval_data = run_model_for_task_caching(
tasks=tasks, cache_requests=args.cache_requests
)
......@@ -5,7 +5,7 @@ import random
import numpy as np
from lm_eval import tasks
from lm_eval.tasks import include_path, initialize_tasks
from lm_eval.tasks import TaskManager
from lm_eval.utils import eval_logger, join_iters
......@@ -39,22 +39,21 @@ def main():
args = parse_args()
np.random.seed(args.seed)
initialize_tasks(args.verbosity)
if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}")
include_path(args.include_path)
task_manager = TaskManager(args.verbosity, include_path=args.include_path)
if args.tasks == "all_tasks":
task_names = tasks.ALL_TASKS
task_names = task_manager.all_tasks
else:
task_names = args.tasks.split(",")
task_dict = tasks.get_task_dict(task_names)
task_dict = tasks.get_task_dict(task_names, task_manager)
os.makedirs(args.output_base_path, exist_ok=True)
for task_name, task in task_dict.items():
if isinstance(task, tuple):
group_name, task = task
_, task = task
rnd = random.Random()
rnd.seed(args.seed)
......
......@@ -11,20 +11,21 @@ from lm_eval.api.instance import Instance
from lm_eval.models.huggingface import HFLM
tasks.initialize_tasks()
task_manager = tasks.TaskManager()
class Test_HFLM:
torch.use_deterministic_algorithms(True)
task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
version_minor = sys.version_info.minor
multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")() # type: ignore
multiple_choice_task = task_list["arc_easy"] # type: ignore
multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")() # type: ignore
generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
generate_until_task = task_list["gsm8k"] # type: ignore
generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
generate_until: list[Instance] = generate_until_task.instances
rolling_task = tasks.TASK_REGISTRY.get("wikitext")() # type: ignore
rolling_task = task_list["wikitext"] # type: ignore
rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
ROLLING: list[Instance] = rolling_task.instances
......@@ -73,7 +74,7 @@ class Test_HFLM:
generate_until_RES = [
" The average of $2.50 each is $",
" A robe takes 2 bolts of blue fiber and half",
" $50,000 in repairs.",
" $50,000 in repairs.\n\nQuestion",
" He runs 1 sprint 3 times a week.",
" They feed each of her chickens three cups of mixed",
" The price of the glasses is $5, but",
......
import pytest
import torch
from lm_eval.models.neuron_optimum import wrap_constant_batch_size
def test_wrap_constant_batch_size():
class Tester:
def __init__(self, batch_size):
self.batch_size = batch_size
@wrap_constant_batch_size
def test_constant_batch_size(self, inputs):
assert len(inputs) == self.batch_size
return inputs
batch_size_test = 8
for i in range(1, batch_size_test + 1):
tensor = torch.ones([i, 2, 2])
out = Tester(batch_size=batch_size_test).test_constant_batch_size(tensor)
torch.testing.assert_allclose(out, tensor)
with pytest.raises(ValueError):
Tester(batch_size=batch_size_test).test_constant_batch_size(
torch.ones([batch_size_test + 1, 2, 2])
)
......@@ -6,12 +6,9 @@ from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer
import lm_eval.evaluator as evaluator
import lm_eval.tasks as tasks
from lm_eval.api.registry import get_model
tasks.initialize_tasks()
SUPPORTED_ARCHITECTURES_TASKS = {
"facebook/opt-125m": "lambada_openai",
"hf-internal-testing/tiny-random-gpt2": "wikitext",
......
......@@ -7,6 +7,9 @@ import lm_eval.tasks as tasks
from lm_eval.api.instance import Instance
task_manager = tasks.TaskManager()
@pytest.mark.skip(reason="requires CUDA")
class TEST_VLLM:
vllm = pytest.importorskip("vllm")
......@@ -17,15 +20,15 @@ class TEST_VLLM:
except ModuleNotFoundError:
pass
torch.use_deterministic_algorithms(True)
tasks.initialize_tasks()
multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")() # type: ignore
task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
multiple_choice_task = task_list["arc_easy"] # type: ignore
multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
MULTIPLE_CH: List[Instance] = multiple_choice_task.instances
generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")() # type: ignore
generate_until_task = task_list["gsm8k"] # type: ignore
generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
generate_until: List[Instance] = generate_until_task.instances
rolling_task = tasks.TASK_REGISTRY.get("wikitext")() # type: ignore
rolling_task = task_list["wikitext"] # type: ignore
rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
ROLLING: List[Instance] = rolling_task.instances
......
......@@ -6,11 +6,9 @@ import pytest
# import lm_eval.models as models
import lm_eval.api as api
import lm_eval.evaluator as evaluator
import lm_eval.tasks as tasks
from lm_eval import tasks
tasks.initialize_tasks()
# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces
......@@ -46,7 +44,8 @@ def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str
"device": None,
},
)
task_dict = tasks.get_task_dict(task_name, num_fewshot=0)
task_manager = tasks.TaskManager()
task_dict = tasks.get_task_dict(task_name, task_manager)
e2 = evaluator.evaluate(
lm=lm,
......
# import lm_eval.base as base
import importlib
import os
import sys
from datetime import datetime
from typing import List, Tuple
import pytest
import torch
# import lm_eval.models as models
from lm_eval.caching.cache import PATH
MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
# NOTE the script this loads uses simple evaluate
# TODO potentially test both the helper script and the normal script
sys.path.append(f"{MODULE_DIR}/../scripts")
model_loader = importlib.import_module("requests_caching")
run_model_for_task_caching = model_loader.run_model_for_task_caching
DEFAULT_TASKS = ["lambada_openai", "hellaswag"]
@pytest.fixture(autouse=True)
def setup_and_teardown():
# Setup
torch.use_deterministic_algorithms(False)
clear_cache()
# Yields control back to the test function
yield
# Cleanup here
def clear_cache():
if os.path.exists(PATH):
cache_files = os.listdir(PATH)
for file in cache_files:
file_path = f"{PATH}/{file}"
os.unlink(file_path)
# leaving tasks here to allow for the option to select specific task files
def get_cache_files(tasks: List[str] = None) -> Tuple[List[str], List[str]]:
cache_files = os.listdir(PATH)
file_task_names = []
for file in cache_files:
file_without_prefix = file.split("-")[1]
file_without_prefix_and_suffix = file_without_prefix.split(".")[0]
file_task_names.append(file_without_prefix_and_suffix)
return cache_files, file_task_names
def assert_created(tasks: List[str], file_task_names: List[str]):
tasks.sort()
file_task_names.sort()
assert tasks == file_task_names
@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_true(tasks: List[str]):
run_model_for_task_caching(tasks=tasks, cache_requests="true")
cache_files, file_task_names = get_cache_files()
assert_created(tasks=tasks, file_task_names=file_task_names)
@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_refresh(tasks: List[str]):
run_model_for_task_caching(tasks=tasks, cache_requests="true")
timestamp_before_test = datetime.now().timestamp()
run_model_for_task_caching(tasks=tasks, cache_requests="refresh")
cache_files, file_task_names = get_cache_files()
for file in cache_files:
modification_time = os.path.getmtime(f"{PATH}/{file}")
assert modification_time > timestamp_before_test
tasks.sort()
file_task_names.sort()
assert tasks == file_task_names
@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_delete(tasks: List[str]):
# populate the data first, rerun this test within this test for additional confidence
test_requests_caching_true(tasks=tasks)
run_model_for_task_caching(tasks=tasks, cache_requests="delete")
cache_files, file_task_names = get_cache_files()
assert len(cache_files) == 0
# useful for locally running tests through the debugger
if __name__ == "__main__":
def run_tests():
tests = [
test_requests_caching_true,
test_requests_caching_refresh,
test_requests_caching_delete,
]
for test_func in tests:
clear_cache()
test_func(tasks=DEFAULT_TASKS)
print("Tests pass")
run_tests()
......@@ -8,7 +8,7 @@ from lm_eval.api.task import ConfigurableTask
from .utils import new_tasks
tasks.initialize_tasks()
task_manager = tasks.TaskManager()
# Default Task
TASKS = ["arc_easy"]
......@@ -19,9 +19,9 @@ def task_class():
task_classes = new_tasks()
# Check if task_classes is empty
if task_classes:
return [tasks.TASK_REGISTRY.get(x)() for x in task_classes]
return list(task_manager.load_task_or_group(task_classes).values())
else:
return [tasks.TASK_REGISTRY.get(x)() for x in TASKS]
return list(task_manager.load_task_or_group(TASKS).values())
@pytest.fixture()
......
import itertools
import numpy as np
import pytest
import torch
from lm_eval.utils import Collator, get_rolling_token_windows, make_disjoint_window
from lm_eval.api.metrics import (
aggregate_subtask_metrics,
mean,
pooled_sample_stderr,
stderr_for_metric,
)
from lm_eval.models.utils import Collator
from lm_eval.utils import (
get_rolling_token_windows,
make_disjoint_window,
)
# noinspection DuplicatedCode
......@@ -245,12 +259,20 @@ class TestCollator:
]
return samples
def make_loglikelihood_sample_group(self, end=11):
a = [(("x", "x"), [1, 2, 3, 4, 5, 6, 7, 8], [x]) for x in range(9)]
b = [
(("x", "x"), [1, 2, 3, 4, 5, 6, 7, 8], [x, y, z])
for x, y, z in zip(range(9), range(9, 18), range(18, 27))
]
return a + b
@pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 9)])
def test_generations(self, batch_size, end):
_collate_gen = lambda x: (-len(x[0]), x[0]) # noqa: E731
generation_samples = self.make_generate_sample(int(end))
gens = Collator(generation_samples, _collate_gen, grouping=True)
gens = Collator(generation_samples, _collate_gen, group_by="gen_kwargs")
chunks = gens.get_batched(n=int(batch_size), batch_fn=None)
output = []
for chunks in chunks:
......@@ -279,7 +301,10 @@ class TestCollator:
def test_loglikelihood(self, batch_size, end):
_collate_log = lambda x: (-len(x[1]), tuple(x[1])) # noqa: E731
loglikelihood_samples = self.make_loglikelihood_sample(int(end))
loglikelihoods = Collator(loglikelihood_samples, _collate_log, grouping=False)
loglikelihoods = Collator(
loglikelihood_samples,
_collate_log,
)
chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
output = []
for chunks in chunks:
......@@ -295,3 +320,81 @@ class TestCollator:
# check indices
reordered_output = loglikelihoods.get_original(output)
assert reordered_output == [x[1] for x in loglikelihood_samples]
@pytest.mark.parametrize("batch_size", [17, 8, 12, 0])
def test_context_grouping(self, batch_size):
def _collate(x):
toks = x[1] + x[2]
return -len(toks), tuple(toks)
_collate_log = _collate # noqa: E731
loglikelihood_samples = self.make_loglikelihood_sample_group()
loglikelihoods = Collator(
loglikelihood_samples,
_collate_log,
group_fn=lambda a: a[-2] + a[-1][:-1],
group_by="contexts",
)
chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
output = []
outputs_ = []
for chunks in chunks:
# check batching
if batch_size != 0:
assert len(chunks) <= batch_size
# check reorder
assert all(
len(chunks[i][1]) <= len(chunks[i - 1][1])
for i in range(1, len(chunks))
)
for x in chunks:
for request_str, cont_toks, logits in loglikelihoods.get_cache(
req_str="".join(x[0]),
cxt_toks=x[1],
cont_toks=x[2],
logits=torch.tensor([1, 2, 3, 4, 5, 6, 7, 8])
.unsqueeze(0)
.unsqueeze(0),
):
output.append(x[1])
outputs_.append(cont_toks)
assert len(output) == len(outputs_)
# check indices
reordered_output = loglikelihoods.get_original(output)
assert reordered_output == [x[1] for x in loglikelihood_samples]
def test_aggregate_mean():
# test weight_by_size is respected
assert (
aggregate_subtask_metrics([0.3, 0.2, 0.4], [20, 40, 100], weight_by_size=False)
== 0.3
)
assert (
aggregate_subtask_metrics([0.3, 0.2, 0.4], [20, 40, 100], weight_by_size=True)
== 0.3375
)
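# Sanity check on the expected values above:
#   unweighted: (0.3 + 0.2 + 0.4) / 3 = 0.3
#   weighted:   (0.3 * 20 + 0.2 * 40 + 0.4 * 100) / (20 + 40 + 100) = 54 / 160 = 0.3375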
@pytest.mark.parametrize(
"samples",
[
[40 * [1.0] + 60 * [0.0], 30 * [1.0] + 30 * [0.0], 20 * [1.0] + 60 * [0.0]],
[35 * [1.0] + 65 * [0.0], 20 * [1.0] + 20 * [0.0]],
],
)
def test_aggregate_stderrs(samples):
# check that aggregating subtasks' bootstrap stderrs with our formula
# (using weight_by_size) is ~equiv.
# to just getting bootstrap stderr of the whole set of samples
mean_stderr = stderr_for_metric(metric=mean, bootstrap_iters=100000)
stderrs = [mean_stderr(subtask) for subtask in samples]
sizes = [len(subtask) for subtask in samples]
assert np.allclose(
pooled_sample_stderr(stderrs, sizes),
mean_stderr(list(itertools.chain.from_iterable(samples))),
atol=1.0e-3,
)
import os
from pathlib import Path
from typing import List, Union
from lm_eval.utils import load_yaml_config
......@@ -20,17 +19,18 @@ def load_changed_files(file_path: str) -> List[str]:
# checks the txt file for list of changed files.
# if file ends with .yaml then check yaml for task name
# if file ends with .py then parse the folder for all yaml files
# skips benchmarks folder
# if file ends with .yaml then check yaml and load the config.
# if the config task is a string, it's a task config.
# if the config task is a list, it's a group config.
def parser(full_path: List[str]) -> List[str]:
_output = set()
for x in full_path:
if x.endswith(".yaml") and "benchmarks" not in x:
_output.add(load_yaml_config(x)["task"])
elif x.endswith(".py") and "benchmarks" not in x:
path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
_output |= {load_yaml_config(x)["task"] for x in path}
if os.path.exists(x) and x.endswith(".yaml"):
config = load_yaml_config(x, mode="simple")
if isinstance(config["task"], str):
_output.add(config["task"])
elif isinstance(config["task"], list):
_output.add(config["group"])
return list(_output)
......