Refactor utilities into a separate model utils file. (#1429)

2d0a6460 · Baber Abbasi · GitHub · 620d6a15 · 2d0a6460 · 2d0a6460
Unverified Commit 2d0a6460 authored Feb 14, 2024 by Baber Abbasi Committed by GitHub Feb 14, 2024
11 changed files
--- a/lm_eval/models/anthropic_llms.py
+++ b/lm_eval/models/anthropic_llms.py
@@ -5,7 +5,7 @@ from tqdm import tqdm
 from lm_eval import utils
 from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
-from lm_eval.utils import retry_on_specific_exceptions
+from lm_eval.models.utils import retry_on_specific_exceptions


 eval_logger = utils.eval_logger

--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -26,7 +26,13 @@ from lm_eval import utils
 from lm_eval.api.instance import Instance
 from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
-from lm_eval.utils import Collator, stop_sequences_criteria
+from lm_eval.models.utils import (
+    Collator,
+    clear_torch_cache,
+    get_dtype,
+    pad_and_concat,
+    stop_sequences_criteria,
+)


 eval_logger = utils.eval_logger
@@ -503,13 +509,13 @@ class HFLM(LM):
            if transformers.__version__ >= "4.30.0":
                if model_kwargs.get("load_in_4bit", None):
                    if model_kwargs.get("bnb_4bit_compute_dtype", None):
-                        model_kwargs["bnb_4bit_compute_dtype"] = utils.get_dtype(
+                        model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(
                            model_kwargs["bnb_4bit_compute_dtype"]
                        )
            self._model = self.AUTO_MODEL_CLASS.from_pretrained(
                pretrained,
                revision=revision,
-                torch_dtype=utils.get_dtype(dtype),
+                torch_dtype=get_dtype(dtype),
                trust_remote_code=trust_remote_code,
                **model_kwargs,
            )
@@ -639,10 +645,10 @@ class HFLM(LM):
                self.accelerator.gather(max_rnk_bs).cpu().detach().numpy().tolist()
            )
            batch_size = min(gathered)
-            utils.clear_torch_cache()
+            clear_torch_cache()
            return batch_size

-        utils.clear_torch_cache()
+        clear_torch_cache()
        return batch_size

    def tok_encode(
@@ -997,18 +1003,18 @@ class HFLM(LM):
            # create encoder attn mask and batched conts, if seq2seq
            call_kwargs = {}
            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
-                batched_inps = utils.pad_and_concat(
+                batched_inps = pad_and_concat(
                    padding_len_inp, inps, padding_side="right"
                )  # [batch, padding_len_inp]
            elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
                # TODO: left-pad encoder inps and mask?
-                batched_inps = utils.pad_and_concat(
+                batched_inps = pad_and_concat(
                    padding_len_inp, inps
                )  # [batch, padding_len_inp]
-                batched_conts = utils.pad_and_concat(
+                batched_conts = pad_and_concat(
                    padding_len_cont, conts
                )  # [batch, padding_len_cont]
-                batched_encoder_mask = utils.pad_and_concat(
+                batched_encoder_mask = pad_and_concat(
                    padding_len_inp, encoder_attns
                )  # [batch, padding_len_inp]
                call_kwargs = {

--- a/lm_eval/models/mamba_lm.py
+++ b/lm_eval/models/mamba_lm.py
@@ -2,7 +2,7 @@ from typing import Optional, Union

 import torch

-from lm_eval import utils
+import lm_eval.models.utils
 from lm_eval.api.registry import register_model
 from lm_eval.models.huggingface import HFLM

@@ -97,7 +97,9 @@ please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba
        self._model = MambaLMHeadModel.from_pretrained(
            pretrained,
            device=self._device,
-            dtype=torch.float16 if dtype == "auto" else utils.get_dtype(dtype),
+            dtype=torch.float16
+            if dtype == "auto"
+            else lm_eval.models.utils.get_dtype(dtype),
        )

    def _model_generate(self, context, max_length, stop, **generation_kwargs):

--- a/lm_eval/models/neuron_optimum.py
+++ b/lm_eval/models/neuron_optimum.py
@@ -13,10 +13,11 @@ from tqdm import tqdm
 from transformers import GenerationConfig
 from transformers.generation import StoppingCriteriaList

+import lm_eval.models.utils
 from lm_eval import utils
 from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
-from lm_eval.utils import stop_sequences_criteria
+from lm_eval.models.utils import stop_sequences_criteria


 try:
@@ -239,7 +240,7 @@ class NEURON_HF(LM):
            revision=revision,
            trust_remote_code=trust_remote_code,
        )
-        torch_dtype = utils.get_dtype(dtype)
+        torch_dtype = lm_eval.models.utils.get_dtype(dtype)

        assert torch_dtype in [
            torch.float16,
@@ -550,7 +551,7 @@ class NEURON_HF(LM):
        # automatic (variable) batch size detection for vectorization
        # pull longest context sample from request

-        chunks = utils.chunks(
+        chunks = lm_eval.models.utils.chunks(
            re_ord.get_reordered(),
            n=self.batch_size,
            fn=None,
@@ -603,7 +604,7 @@ class NEURON_HF(LM):

            # create encoder attn mask and batched conts, if seq2seq
            call_kwargs = {}
-            batched_inps = utils.pad_and_concat(
+            batched_inps = lm_eval.models.utils.pad_and_concat(
                padding_len_inp, inps, padding_side="right"
            )  # [batch, padding_len_inp]

@@ -663,7 +664,7 @@ class NEURON_HF(LM):
        # we group requests by their generation_kwargs,
        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
        # in the same batch.
-        grouper = utils.Grouper(requests, lambda x: str(x.args[1]))
+        grouper = lm_eval.models.utils.Grouper(requests, lambda x: str(x.args[1]))
        for key, reqs in grouper.get_grouped().items():
            # within each set of reqs for given kwargs, we reorder by token length, descending.
            re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate)
@@ -672,7 +673,9 @@ class NEURON_HF(LM):

        # for each different set of kwargs, we execute all requests, by batch.
        for key, re_ord in re_ords.items():
-            chunks = utils.chunks(re_ord.get_reordered(), n=self.batch_size)
+            chunks = lm_eval.models.utils.chunks(
+                re_ord.get_reordered(), n=self.batch_size
+            )
            for chunk in tqdm(chunks, disable=self.rank != 0):
                contexts, all_gen_kwargs = zip(*chunk)
                # we assume all gen kwargs in the batch are the same

--- a/lm_eval/models/openai_completions.py
+++ b/lm_eval/models/openai_completions.py
@@ -6,10 +6,12 @@ from typing import List, Literal, Optional, Tuple

 from tqdm import tqdm

+import lm_eval.models.utils
 from lm_eval import utils
 from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
-from lm_eval.utils import eval_logger, retry_on_specific_exceptions
+from lm_eval.models.utils import retry_on_specific_exceptions
+from lm_eval.utils import eval_logger


 def get_result(response, ctxlen: int) -> Tuple[float, bool]:
@@ -219,7 +221,7 @@ class OpenaiCompletionsLM(LM):
        re_ord = utils.Reorderer(requests, _collate)

        for chunk in tqdm(
-            list(utils.chunks(re_ord.get_reordered(), self.batch_size)),
+            list(lm_eval.models.utils.chunks(re_ord.get_reordered(), self.batch_size)),
            disable=disable_tqdm,
        ):
            inps = []
@@ -429,7 +431,7 @@ class OpenaiChatCompletionsLM(LM):
        # we group requests by their generation_kwargs,
        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
        # in the same batch.
-        grouper = utils.Grouper(requests, lambda x: str(x.args[1]))
+        grouper = lm_eval.models.utils.Grouper(requests, lambda x: str(x.args[1]))
        for key, reqs in grouper.get_grouped().items():
            # within each set of reqs for given kwargs, we reorder by token length, descending.
            re_ords[key] = utils.Reorderer(
@@ -441,7 +443,7 @@ class OpenaiChatCompletionsLM(LM):
            # n needs to be 1 because messages in
            # chat completion are not batch but
            # is regarded as a single conversation.
-            chunks = utils.chunks(re_ord.get_reordered(), n=1)
+            chunks = lm_eval.models.utils.chunks(re_ord.get_reordered(), n=1)
            for chunk in chunks:
                contexts, all_gen_kwargs = zip(*chunk)
                inps = [{"role": "user", "content": context} for context in contexts]

--- a/lm_eval/models/textsynth.py
+++ b/lm_eval/models/textsynth.py
@@ -19,7 +19,7 @@ from tqdm import tqdm

 from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
-from lm_eval.utils import retry_on_specific_exceptions
+from lm_eval.models.utils import retry_on_specific_exceptions


 logger = logging.getLogger(__name__)

--- a/lm_eval/models/utils.py
+++ b/lm_eval/models/utils.py
+import collections
+import fnmatch
+import gc
+import time
+from functools import wraps
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    Iterator,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+)
+
+import torch
+import transformers
+
+from lm_eval.utils import eval_logger
+
+
+def chunks(iter, n: int = 0, fn=None):
+    """
+    Lazily split an iterable into consecutive lists ("chunks"). Useful for batching.
+
+    Parameters:
+    - iter: The iterable to split.
+    - n: Fixed chunk size, used whenever `fn` is falsy. With the default of 0 the
+      size check never matches, so every element ends up in one single chunk.
+    - fn: Optional callable `fn(index, iter)` returning the size at which the chunk
+      currently being built is emitted. It is re-evaluated for every element.
+
+    Returns:
+    A generator yielding lists; a shorter trailing remainder (if any) comes last.
+
+    Example usage:
+    ```
+    data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+    for chunk in chunks(data, 3):
+        print(chunk)
+    ```
+    Output:
+    ```
+    [1, 2, 3]
+    [4, 5, 6]
+    [7, 8, 9]
+    [10]
+    ```
+    """
+    batch = []
+    for idx, item in enumerate(iter):
+        batch.append(item)
+        target = n if not fn else fn(idx, iter)
+        if len(batch) != target:
+            continue
+        yield batch
+        batch = []
+
+    # flush whatever is left over after the last full chunk
+    if len(batch) > 0:
+        yield batch
+
+
+class MultiChoice:
+    """Holds a collection of choices and answers `in` tests with wildcard support."""
+
+    def __init__(self, choices) -> None:
+        self.choices = choices
+
+    def __contains__(self, values) -> bool:
+        """
+        Check a comma-separated string of patterns against the choices.
+
+        Each pattern is matched with `fnmatch` (linux filename patterns). Returns
+        True when every pattern matches at least one choice; otherwise the available
+        choices are logged and ValueError is raised for the first unmatched pattern.
+        """
+        for pattern in values.split(","):
+            if fnmatch.filter(self.choices, pattern):
+                continue
+            eval_logger.info("Available tasks to choose:")
+            for choice in self.choices:
+                eval_logger.info(f"  - {choice}")
+            raise ValueError("'{}' is not in task list".format(pattern))
+        return True
+
+    def __iter__(self) -> Iterator:
+        """Iterate over the stored choices in their original order."""
+        yield from self.choices
+
+
+class Grouper:
+    """
+    Buckets the entries of an array `arr` by `fn(entry)` while remembering each
+    entry's original position, so that per-group results can later be scattered
+    back into the original (single list) order.
+    """
+
+    def __init__(self, arr, fn) -> None:
+        self.size = len(arr)
+
+        # self.arr maps fn(entry) -> list of (original index, entry) pairs
+        buckets = collections.defaultdict(list)
+        for indexed in enumerate(arr):
+            buckets[fn(indexed[1])].append(indexed)
+
+        self.arr = buckets
+        self._grouped = None
+
+    def get_grouped(self):
+        """Return `{key: [entries]}` without the indices; reused once non-empty."""
+        if self._grouped:
+            return self._grouped
+        # strip the index from every (index, entry) pair
+        grouped = {
+            key: [entry for _, entry in pairs] for key, pairs in self.arr.items()
+        }
+        self._grouped = grouped
+        return grouped
+
+    def get_original(self, grouped_dict):
+        """
+        Undo the grouping for a dict of per-key results.
+
+        `grouped_dict` must have exactly the keys of `self.arr`, with the values of
+        each key listed in the same order as the instances stored for that key.
+        Returns one flat list in the order of the array passed to the constructor.
+        Asserts that every original position received a value.
+        """
+        assert grouped_dict.keys() == self.arr.keys()
+
+        res = [None] * self.size
+        cov = [False] * self.size
+
+        for key, values in grouped_dict.items():
+            for (ind, _), v in zip(self.arr[key], values):
+                res[ind] = v
+                cov[ind] = True
+
+        assert all(cov)
+
+        return res
+
+
+def pad_and_concat(
+    max_length: int,
+    tensors: List[torch.Tensor],
+    padding_side: Literal["right", "left"] = "right",
+):
+    """
+    Zero-pad each tensor up to `max_length` and stack the results into one batch.
+    Used for batching inputs and continuations in seq2seq models.
+
+    Parameters:
+    - max_length: Target length of every row.
+    - tensors: 1-D `[seq]` tensors; 2-D tensors are squeezed on dim 0 first (the
+      `[1, seq]` case). The list itself is NOT modified: padded rows are collected
+      in a new list, so callers can keep using their original tensors.
+    - padding_side: "right" appends the zeros, "left" prepends them.
+
+    Returns:
+    The rows concatenated along dim 0. Tensors that already have `max_length` or
+    more elements are passed through without truncation.
+
+    Raises:
+    AssertionError if `padding_side` is neither "left" nor "right".
+    """
+    assert (
+        padding_side == "left" or padding_side == "right"
+    ), f"Unrecognized padding type: '{padding_side}' not 'left' or 'right'"
+
+    rows = []
+    for tensor in tensors:
+        if len(tensor.shape) == 2:
+            tensor = tensor.squeeze(0)  # squeeze, in case passed [1, seq] size
+        pad_len = max_length - tensor.shape[0]
+        if pad_len > 0:
+            padding = torch.zeros(
+                pad_len,
+                dtype=torch.long,
+                device=tensor.device,
+            )  # [padding_length - seq]
+            if padding_side == "right":
+                pieces = [tensor, padding]
+            else:
+                pieces = [padding, tensor]
+            tensor = torch.cat(pieces, dim=0)
+        rows.append(tensor.unsqueeze(0))  # [1, padding_length]
+
+    return torch.cat(rows, dim=0)
+
+
+def clear_torch_cache() -> None:
+    """Run Python garbage collection, then call `torch.cuda.empty_cache()`."""
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+def get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
+    """Resolve a dtype name such as "float16" to the matching attribute of `torch`.
+
+    The string "auto" and any non-string value are returned unchanged.
+    Does not use an instantiated HF AutoConfig.
+    """
+    if not isinstance(dtype, str) or dtype == "auto":
+        return dtype
+    # Convert `str` args torch dtype: `float16` -> `torch.float16`
+    return getattr(torch, dtype)
+
+
+class MultiTokenEOSCriteria(transformers.StoppingCriteria):
+    """Criteria that stops once every row's decoded tail contains the stop sequence."""
+
+    def __init__(
+        self,
+        sequence: str,
+        tokenizer: transformers.PreTrainedTokenizer,
+        initial_decoder_input_length: int,
+        batch_size: int,
+    ) -> None:
+        self.tokenizer = tokenizer
+        self.sequence = sequence
+        self.initial_decoder_input_length = initial_decoder_input_length
+        # one flag per batch row; a row stays done once its stop sequence was seen
+        self.done_tracker = [False] * batch_size
+        self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
+        # we look back for 2 more tokens than it takes to encode our stop sequence
+        # because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']`
+        # and we don't want to mistakenly not stop a generation because our
+        # (string) stop sequence was output in a different tokenization
+
+        # NOTE: there is a minor danger that this will end up looking back 2 tokens into the past, into the inputs to the model,
+        # and stopping generation immediately as a result. With only 2 extra tokens of lookback, this risk is minimized
+        # Additionally, in lookback_ids_batch we should prevent ever looking back into the inputs as described.
+        self.sequence_id_len = len(self.sequence_ids) + 2
+
+    def __call__(self, input_ids, scores, **kwargs) -> bool:
+        """Update the per-row done flags and return True when all rows are done."""
+        # drop the initial decoder input, then keep only the last
+        # `sequence_id_len` token ids of what remains
+        generated = input_ids[:, self.initial_decoder_input_length :]
+        tail_ids = generated[:, -self.sequence_id_len :]
+
+        # compare on decoded text rather than on token ids
+        tail_texts = self.tokenizer.batch_decode(tail_ids)
+
+        for row in range(len(self.done_tracker)):
+            if self.done_tracker[row]:
+                continue
+            self.done_tracker[row] = self.sequence in tail_texts[row]
+        return all(self.done_tracker)
+
+
+def stop_sequences_criteria(
+    tokenizer: transformers.PreTrainedTokenizer,
+    stop_sequences: List[str],
+    initial_decoder_input_length: int,
+    batch_size: int,
+) -> transformers.StoppingCriteriaList:
+    """Build a `StoppingCriteriaList` holding one `MultiTokenEOSCriteria` per stop sequence."""
+    criteria = [
+        MultiTokenEOSCriteria(
+            sequence, tokenizer, initial_decoder_input_length, batch_size
+        )
+        for sequence in stop_sequences
+    ]
+    return transformers.StoppingCriteriaList(criteria)
+
+
+def divide(iterable, n) -> List[Iterator]:
+    """Split the elements of *iterable* into *n* contiguous parts, keeping
+    their order.
+
+        >>> group_1, group_2 = divide([1, 2, 3, 4, 5, 6], 2)
+        >>> list(group_1)
+        [1, 2, 3]
+        >>> list(group_2)
+        [4, 5, 6]
+
+    When the length of *iterable* is not a multiple of *n*, the leading parts
+    receive one extra element each:
+
+        >>> children = divide([1, 2, 3, 4, 5, 6, 7], 3)
+        >>> [list(c) for c in children]
+        [[1, 2, 3], [4, 5], [6, 7]]
+
+    When there are fewer elements than parts, the trailing parts are empty:
+
+        >>> children = divide([1, 2, 3], 5)
+        >>> [list(c) for c in children]
+        [[1], [2], [3], [], []]
+
+    This function will exhaust the iterable before returning and may require
+    significant storage. If order is not important, see :func:`distribute`,
+    which does not first pull the iterable into memory.
+
+    Raises ValueError when *n* is smaller than 1.
+    """
+    if n < 1:
+        raise ValueError("n must be at least 1")
+
+    # Sliceable inputs are used as they are; anything else becomes a tuple.
+    seq = iterable
+    try:
+        iterable[:0]
+    except TypeError:
+        seq = tuple(iterable)
+
+    base, extra = divmod(len(seq), n)
+
+    parts = []
+    lo = 0
+    for k in range(n):
+        # the first `extra` parts each take one additional element
+        hi = lo + base + (1 if k < extra else 0)
+        parts.append(iter(seq[lo:hi]))
+        lo = hi
+
+    return parts
+
+
+def retry_on_specific_exceptions(
+    on_exceptions: List[Type[Exception]],
+    max_retries: Optional[int] = None,
+    backoff_time: float = 3.0,
+    backoff_multiplier: float = 1.5,
+    on_exception_callback: Optional[Callable[[Exception, float], Any]] = None,
+):
+    """Retry on an LLM Provider's rate limit error with exponential backoff
+    For example, to use for OpenAI, do the following:
+    ```
+    from openai import RateLimitError
+
+    # Recommend specifying max_retries to avoid infinite loops!
+    @retry_on_specific_exceptions([RateLimitError], max_retries=3)
+    def completion(...):
+        # Wrap OpenAI completion function here
+        ...
+    ```
+
+    Parameters:
+    - on_exceptions: Exception types that trigger a retry; anything else propagates
+      immediately.
+    - max_retries: Upper bound on the total number of calls to the wrapped function;
+      None retries forever. With a value <= 0 the function is never called and the
+      wrapper returns None.
+    - backoff_time: Seconds to sleep before the first retry.
+    - backoff_multiplier: Factor applied to the sleep time after every retry.
+    - on_exception_callback: Called as `callback(exception, sleep_time)` right before
+      each sleep, i.e. only when another attempt will follow.
+
+    Raises:
+    The last caught exception once `max_retries` attempts have all failed, instead
+    of silently returning None.
+    """
+
+    def decorator(func: Callable):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            sleep_time = backoff_time
+            attempt = 0
+            while max_retries is None or attempt < max_retries:
+                try:
+                    return func(*args, **kwargs)
+                except tuple(on_exceptions) as e:
+                    attempt += 1
+                    if max_retries is not None and attempt >= max_retries:
+                        # Out of attempts: surface the real error rather than
+                        # sleeping once more and handing back None.
+                        raise
+                    if on_exception_callback is not None:
+                        on_exception_callback(e, sleep_time)
+                    time.sleep(sleep_time)
+                    sleep_time *= backoff_multiplier
+
+        return wrapper
+
+    return decorator
+
+
+class Collator:
+    """
+    A class for reordering and batching elements of an array.
+
+    This class allows for sorting an array based on a provided sorting function, grouping elements based on a grouping function, and generating batches from the sorted and grouped data.
+    """
+
+    def __init__(
+        self,
+        arr: List,
+        sort_fn: Callable,
+        group_fn: Callable = lambda x: x[1],
+        grouping: bool = False,
+    ) -> None:
+        """
+        Parameters:
+        - arr (List): The elements to reorder and batch.
+        - sort_fn (Callable): Key function applied to each element when sorting.
+        - group_fn (Callable): Maps an element to its group key. Defaults to the element's second item.
+        - grouping (bool): If True, elements are grouped by `group_fn` up front and batches never mix groups.
+        """
+        self.grouping = grouping
+        self.fn = sort_fn
+        self.group_fn = lambda x: group_fn(x[1])  # first index are enumerated indices
+        # original positions, in the order elements were handed out by `_reorder`
+        self.reorder_indices: List = []
+        self.size = len(arr)
+        self.arr_with_indices: Iterable[Any] = tuple(enumerate(arr))  # [indices, (arr)]
+        if self.grouping is True:
+            self.group_by_index()
+
+    def group_by_index(self) -> None:
+        """Replace `arr_with_indices` by a dict mapping group key -> (index, element) pairs."""
+        self.arr_with_indices = self.group(
+            self.arr_with_indices, fn=self.group_fn, values=False
+        )
+
+    def get_batched(self, n: int = 1, batch_fn: Optional[Callable] = None) -> Iterator:
+        """
+        Generates and yields batches from the reordered array.
+
+        Parameters:
+        - n (int): The size of each batch. Defaults to 1.
+        - batch_fn (Optional[Callable[[int, Iterable], int]]): A function to determine the size of each batch. Defaults to None.
+
+        Yields:
+        Iterator: An iterator over batches of reordered elements.
+        """
+        if self.grouping:
+            # each group is sorted and chunked on its own
+            for (
+                key,
+                values,
+            ) in self.arr_with_indices.items():  # type: ignore
+                values = self._reorder(values)
+                batch = self.get_chunks(values, n=n, fn=batch_fn)
+                yield from batch
+        else:
+            values = self._reorder(self.arr_with_indices)  # type: ignore
+            batch = self.get_chunks(values, n=n, fn=batch_fn)
+            yield from batch
+
+    def _reorder(self, arr: Union[List, Tuple[Tuple[int, Any], ...]]) -> Iterator:
+        """
+        Reorders the elements in the array based on the sorting function.
+
+        This is a generator: sorting and the update of `reorder_indices` only
+        happen once iteration starts.
+
+        Parameters:
+        - arr (Union[List, Tuple[Tuple[int, Any], ...]]): The (index, element) pairs to be reordered.
+
+        Yields:
+        The elements (without their indices) in sorted order, one by one.
+        """
+        arr = sorted(arr, key=lambda x: self.fn(x[1]))
+        self.reorder_indices.extend([x[0] for x in arr])
+        yield from [x[1] for x in arr]
+
+    def get_original(self, newarr: List) -> List:
+        """
+        Restores the original order of elements from the reordered list.
+
+        Parameters:
+        - newarr (List): The reordered array.
+
+        Returns:
+        List: The array with elements restored to their original order.
+        """
+        res = [None] * self.size
+        cov = [False] * self.size
+
+        for ind, v in zip(self.reorder_indices, newarr):
+            res[ind] = v
+            cov[ind] = True
+
+        # every original position must have received a value
+        assert all(cov)
+
+        return res
+
+    def __len__(self):
+        return self.size
+
+    @staticmethod
+    def group(arr: Iterable, fn: Callable, values: bool = False) -> Iterable:
+        """
+        Groups elements of an iterable based on a provided function.
+
+        When `fn(ob)` is a dict it is converted to a hashable key: a tuple of its
+        sorted items, with iterable values turned into tuples. Any other key
+        (e.g. a plain string) is used directly.
+
+        Parameters:
+        - arr (Iterable): The iterable to be grouped.
+        - fn (Callable): The function to determine the grouping.
+        - values (bool): If True, returns the values of the group. Defaults to False.
+
+        Returns:
+        Iterable: An iterable of grouped elements.
+        """
+        res = collections.defaultdict(list)
+        for ob in arr:
+            key = fn(ob)
+            try:
+                hashable_dict = tuple(
+                    (
+                        name,
+                        tuple(value)
+                        if isinstance(value, collections.abc.Iterable)
+                        else value,
+                    )
+                    for name, value in sorted(key.items())
+                )
+                res[hashable_dict].append(ob)
+            except (TypeError, AttributeError):
+                # AttributeError: the key is not a dict and has no `.items()`;
+                # fall back to using the key itself
+                res[key].append(ob)
+        if not values:
+            return res
+        return res.values()
+
+    @staticmethod
+    def get_chunks(_iter, n: int = 0, fn=None):
+        """
+        Divides an iterable into chunks of specified size or based on a given function.
+        Useful for batching
+
+        Parameters:
+        - _iter: The input iterable to be divided into chunks. It is fully consumed into a tuple first.
+        - n: An integer representing the size of each chunk. Default is 0.
+        - fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.
+
+        Returns:
+        An iterator that yields chunks of the input iterable.
+
+        Example usage:
+        ```
+        data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        for chunk in chunks(data, 3):
+            print(chunk)
+        ```
+        Output:
+        ```
+        [1, 2, 3]
+        [4, 5, 6]
+        [7, 8, 9]
+        [10]
+        ```
+        """
+        arr = []
+        _iter = tuple(_iter)
+        for i, x in enumerate(_iter):
+            arr.append(x)
+            if len(arr) == (fn(i, _iter) if fn else n):
+                yield arr
+                arr = []
+
+        if arr:
+            yield arr
--- a/lm_eval/models/vllm_causallms.py
+++ b/lm_eval/models/vllm_causallms.py
@@ -7,9 +7,8 @@ from tqdm import tqdm
 from lm_eval.api.instance import Instance
 from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
+from lm_eval.models.utils import Collator, divide
 from lm_eval.utils import (
-    Collator,
-    divide,
    eval_logger,
    get_rolling_token_windows,
    make_disjoint_window,

--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
 import collections
 import fnmatch
 import functools
-import gc
 import importlib.util
 import inspect
 import logging
@@ -10,24 +9,13 @@ import pathlib
 import re
 import subprocess
 import sys
-import time
-from functools import wraps
 from itertools import islice
 from typing import (
    Any,
    Callable,
-    Iterable,
-    Iterator,
    List,
-    Literal,
-    Optional,
-    Tuple,
-    Type,
-    Union,
 )

-import torch
-import transformers
 import yaml
 from jinja2 import BaseLoader, Environment, StrictUndefined

@@ -99,44 +87,6 @@ def join_iters(iters):
        yield from iter


-def chunks(iter, n: int = 0, fn=None):
-    """
-    Divides an iterable into chunks of specified size or based on a given function.
-    Useful for batching
-
-    Parameters:
-    - iter: The input iterable to be divided into chunks.
-    - n: An integer representing the size of each chunk. Default is 0.
-    - fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.
-
-    Returns:
-    An iterator that yields chunks of the input iterable.
-
-    Example usage:
-    ```
-    data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-    for chunk in chunks(data, 3):
-        print(chunk)
-    ```
-    Output:
-    ```
-    [1, 2, 3]
-    [4, 5, 6]
-    [7, 8, 9]
-    [10]
-    ```
-    """
-    arr = []
-    for i, x in enumerate(iter):
-        arr.append(x)
-        if len(arr) == (fn(i, iter) if fn else n):
-            yield arr
-            arr = []
-
-    if arr:
-        yield arr
-
-
 def group(arr, fn):
    res = collections.defaultdict(list)

@@ -146,25 +96,6 @@ def group(arr, fn):
    return list(res.values())


-class MultiChoice:
-    def __init__(self, choices) -> None:
-        self.choices = choices
-
-    # Simple wildcard support (linux filename patterns)
-    def __contains__(self, values) -> bool:
-        for value in values.split(","):
-            if len(fnmatch.filter(self.choices, value)) == 0:
-                eval_logger.info("Available tasks to choose:")
-                for choice in self.choices:
-                    eval_logger.info(f"  - {choice}")
-                raise ValueError("'{}' is not in task list".format(value))
-        return True
-
-    def __iter__(self) -> Iterator:
-        for choice in self.choices:
-            yield choice
-
-
 # Returns a list containing all values of the source_list that
 # match at least one of the patterns
 def pattern_match(patterns, source_list):
@@ -283,64 +214,6 @@ class Reorderer:
        return res


-class Grouper:
-    """
-    takes an array `arr` and function `fn` and returns a dictionary
-    with keys fn(ob) for each ob in `arr` and with values `self.arr[key]` a list of all
-    objects in `arr` satisfying `key == fn(ob)`.
-    """
-
-    def __init__(self, arr, fn) -> None:
-        # self.orig_arr = arr
-        self.size = len(arr)
-        arr = list(enumerate(arr))
-
-        def group_return_dict(arr, fn):
-            res = collections.defaultdict(list)
-
-            for ob in arr:
-                res[fn(ob)].append(ob)
-            return res
-
-        arr = group_return_dict(arr, lambda x: fn(x[1]))
-
-        # self.arr has format Dict[Tuple[int, <entry from orig. arr>]]
-        self.arr = arr
-        self._grouped = None
-
-    def get_grouped(self):
-        # return the contents but not indices for our grouped dict.
-        if self._grouped:
-            return self._grouped
-        grouped = {}
-        for key in self.arr.keys():
-            # drop the index from each element of self.arr
-            grouped[key] = [y[1] for y in self.arr[key]]
-        self._grouped = grouped
-        return grouped
-
-    def get_original(self, grouped_dict):
-        # take in a grouped dictionary with e.g. results for each key listed
-        # in the same order as the instances in `self.arr`, and
-        # return the results in the same (single list) order as `self.orig_arr`.
-        res = [None] * self.size
-        cov = [False] * self.size
-        # orig = [None] * self.size
-
-        assert grouped_dict.keys() == self.arr.keys()
-
-        for key in grouped_dict.keys():
-            for (ind, _), v in zip(self.arr[key], grouped_dict[key]):
-                res[ind] = v
-                cov[ind] = True
-                # orig[ind] = _
-
-        assert all(cov)
-        # assert orig == self.orig_arr
-
-        return res
-
-
 def make_table(result_dict, column: str = "results"):
    """Generate table of results."""
    from pytablewriter import LatexTableWriter, MarkdownTableWriter
@@ -562,380 +435,7 @@ def create_iterator(raw_iterator, rank, world_size, limit=None):
    return islice(raw_iterator, rank, limit, world_size)


-def pad_and_concat(
-    max_length: int,
-    tensors: List[torch.Tensor],
-    padding_side: Literal["right", "left"] = "right",
-):
-    """
-    Method for padding a list of tensors given the maximum tensor
-    length in the batch. Used for batching inputs and continuations in
-    seq2seq models.
-    """
-    assert (
-        padding_side == "left" or padding_side == "right"
-    ), f"Unrecognized padding type: '{padding_side}' not 'left' or 'right'"
-
-    for i, tensor in enumerate(tensors):
-        if len(tensor.shape) == 2:
-            tensor = tensor.squeeze(0)  # squeeze, in case passed [1, seq] size
-        tensor_len = tensor.shape[0]
-        if tensor_len < max_length:
-            if padding_side == "right":
-                # right-pad
-                tensors[i] = torch.cat(
-                    [
-                        tensor,  # [seq]
-                        torch.zeros(
-                            max_length - tensor_len,
-                            dtype=torch.long,
-                            device=tensor.device,
-                        ),  # [padding_length - seq]
-                    ],
-                    dim=0,
-                ).unsqueeze(0)
-            else:
-                # left-pad
-                tensors[i] = torch.cat(
-                    [
-                        torch.zeros(
-                            max_length - tensor_len,
-                            dtype=torch.long,
-                            device=tensor.device,
-                        ),  # [padding_length - seq]
-                        tensor,  # [seq]
-                    ],
-                    dim=0,
-                ).unsqueeze(0)
-        else:
-            tensors[i] = tensor.unsqueeze(0)
-
-    return torch.cat(tensors, dim=0)
-
-
-def clear_torch_cache() -> None:
-    gc.collect()
-    torch.cuda.empty_cache()
-
-
-def get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
-    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
-    if isinstance(dtype, str) and dtype != "auto":
-        # Convert `str` args torch dtype: `float16` -> `torch.float16`
-        _torch_dtype = getattr(torch, dtype)
-    else:
-        _torch_dtype = dtype
-    return _torch_dtype
-
-
 # Multi-token stopping criteria
-class MultiTokenEOSCriteria(transformers.StoppingCriteria):
-    """Criteria to stop on the specified multi-token sequence."""
-
-    def __init__(
-        self,
-        sequence: str,
-        tokenizer: transformers.PreTrainedTokenizer,
-        initial_decoder_input_length: int,
-        batch_size: int,
-    ) -> None:
-        self.initial_decoder_input_length = initial_decoder_input_length
-        self.done_tracker = [False] * batch_size
-        self.sequence = sequence
-        self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
-        # print(sequence, self.sequence_ids)
-        # we look back for 2 more tokens than it takes to encode our stop sequence
-        # because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']`
-        # and we don't want to mistakenly not stop a generation because our
-        # (string) stop sequence was output in a different tokenization
-
-        # NOTE: there is a minor danger that this will end up looking back 2 tokens into the past, into the inputs to the model,
-        # and stopping generation immediately as a result. With only 2 extra tokens of lookback, this risk is minimized
-        # Additionally, in lookback_ids_batch we should prevent ever looking back into the inputs as described.
-        self.sequence_id_len = len(self.sequence_ids) + 2
-        self.tokenizer = tokenizer
-
-    def __call__(self, input_ids, scores, **kwargs) -> bool:
-        # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
-        lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :]
-
-        lookback_ids_batch = lookback_ids_batch[:, -self.sequence_id_len :]
-
-        lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
-
-        for i, done in enumerate(self.done_tracker):
-            if not done:
-                self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
-        return False not in self.done_tracker
-
-
-def stop_sequences_criteria(
-    tokenizer: transformers.PreTrainedTokenizer,
-    stop_sequences: List[str],
-    initial_decoder_input_length: int,
-    batch_size: int,
-) -> transformers.StoppingCriteriaList:
-    return transformers.StoppingCriteriaList(
-        [
-            *[
-                MultiTokenEOSCriteria(
-                    sequence, tokenizer, initial_decoder_input_length, batch_size
-                )
-                for sequence in stop_sequences
-            ],
-        ]
-    )


 # from more_itertools
-def divide(iterable, n) -> List[Iterator]:
-    """Divide the elements from *iterable* into *n* parts, maintaining
-    order.
-
-        >>> group_1, group_2 = divide([1, 2, 3, 4, 5, 6], 2)
-        >>> list(group_1)
-        [1, 2, 3]
-        >>> list(group_2)
-        [4, 5, 6]
-
-    If the length of *iterable* is not evenly divisible by *n*, then the
-    length of the returned iterables will not be identical:
-
-        >>> children = divide([1, 2, 3, 4, 5, 6, 7], 3)
-        >>> [list(c) for c in children]
-        [[1, 2, 3], [4, 5], [6, 7]]
-
-    If the length of the iterable is smaller than n, then the last returned
-    iterables will be empty:
-
-        >>> children = divide([1, 2, 3], 5)
-        >>> [list(c) for c in children]
-        [[1], [2], [3], [], []]
-
-    This function will exhaust the iterable before returning and may require
-    significant storage. If order is not important, see :func:`distribute`,
-    which does not first pull the iterable into memory.
-
-    """
-    if n < 1:
-        raise ValueError("n must be at least 1")
-
-    try:
-        iterable[:0]
-    except TypeError:
-        seq = tuple(iterable)
-    else:
-        seq = iterable
-
-    q, r = divmod(len(seq), n)
-
-    ret = []
-    stop = 0
-    for i in range(1, n + 1):
-        start = stop
-        stop += q + 1 if i <= r else q
-        ret.append(iter(seq[start:stop]))
-
-    return ret
-
-
-def retry_on_specific_exceptions(
-    on_exceptions: List[Type[Exception]],
-    max_retries: Optional[int] = None,
-    backoff_time: float = 3.0,
-    backoff_multiplier: float = 1.5,
-    on_exception_callback: Optional[Callable[[Exception, float], Any]] = None,
-):
-    """Retry on an LLM Provider's rate limit error with exponential backoff
-    For example, to use for OpenAI, do the following:
-    ```
-    from openai import RateLimitError
-
-    # Recommend specifying max_retries to avoid infinite loops!
-    @retry_on_specific_exceptions([RateLimitError], max_retries=3)
-    def completion(...):
-        # Wrap OpenAI completion function here
-        ...
-    ```
-    """
-
-    def decorator(func: Callable):
-        @wraps(func)
-        def wrapper(*args, **kwargs):
-            sleep_time = backoff_time
-            attempt = 0
-            while max_retries is None or attempt < max_retries:
-                try:
-                    return func(*args, **kwargs)
-                except tuple(on_exceptions) as e:
-                    if on_exception_callback is not None:
-                        on_exception_callback(e, sleep_time)
-                    time.sleep(sleep_time)
-                    sleep_time *= backoff_multiplier
-                    attempt += 1
-
-        return wrapper
-
-    return decorator
-
-
-class Collator:
-    """
-    A class for reordering and batching elements of an array.
-
-    This class allows for sorting an array based on a provided sorting function, grouping elements based on a grouping function, and generating batches from the sorted and grouped data.
-    """
-
-    def __init__(
-        self,
-        arr: List,
-        sort_fn: Callable,
-        group_fn: Callable = lambda x: x[1],
-        grouping: bool = False,
-    ) -> None:
-        self.grouping = grouping
-        self.fn = sort_fn
-        self.group_fn = lambda x: group_fn(x[1])  # first index are enumerated indices
-        self.reorder_indices: List = []
-        self.size = len(arr)
-        self.arr_with_indices: Iterable[Any] = tuple(enumerate(arr))  # [indices, (arr)]
-        if self.grouping is True:
-            self.group_by_index()
-
-    def group_by_index(self) -> None:
-        self.arr_with_indices = self.group(
-            self.arr_with_indices, fn=self.group_fn, values=False
-        )
-
-    def get_batched(self, n: int = 1, batch_fn: Optional[Callable] = None) -> Iterator:
-        """
-        Generates and yields batches from the reordered array.
-
-        Parameters:
-        - n (int): The size of each batch. Defaults to 1.
-        - batch_fn (Optional[Callable[[int, Iterable], int]]): A function to determine the size of each batch. Defaults to None.
-
-        Yields:
-        Iterator: An iterator over batches of reordered elements.
-        """
-        if self.grouping:
-            for (
-                key,
-                values,
-            ) in self.arr_with_indices.items():  # type: ignore
-                values = self._reorder(values)
-                batch = self.get_chunks(values, n=n, fn=batch_fn)
-                yield from batch
-        else:
-            values = self._reorder(self.arr_with_indices)  # type: ignore
-            batch = self.get_chunks(values, n=n, fn=batch_fn)
-            yield from batch
-
-    def _reorder(self, arr: Union[List, Tuple[Tuple[int, Any], ...]]) -> List:
-        """
-        Reorders the elements in the array based on the sorting function.
-
-        Parameters:
-        - arr (Union[List, Tuple[Tuple[int, Any], ...]]): The array or iterable to be reordered.
-
-        Yields:
-        List: Yields reordered elements one by one.
-        """
-        arr = sorted(arr, key=lambda x: self.fn(x[1]))
-        self.reorder_indices.extend([x[0] for x in arr])
-        yield from [x[1] for x in arr]
-
-    def get_original(self, newarr: List) -> List:
-        """
-        Restores the original order of elements from the reordered list.
-
-        Parameters:
-        - newarr (List): The reordered array.
-
-        Returns:
-        List: The array with elements restored to their original order.
-        """
-        res = [None] * self.size
-        cov = [False] * self.size
-
-        for ind, v in zip(self.reorder_indices, newarr):
-            res[ind] = v
-            cov[ind] = True
-
-        assert all(cov)
-
-        return res
-
-    def __len__(self):
-        return self.size
-
-    @staticmethod
-    def group(arr: Iterable, fn: Callable, values: bool = False) -> Iterable:
-        """
-        Groups elements of an iterable based on a provided function.
-
-        Parameters:
-        - arr (Iterable): The iterable to be grouped.
-        - fn (Callable): The function to determine the grouping.
-        - values (bool): If True, returns the values of the group. Defaults to False.
-
-        Returns:
-        Iterable: An iterable of grouped elements.
-        """
-        res = collections.defaultdict(list)
-        for ob in arr:
-            try:
-                hashable_dict = tuple(
-                    (
-                        key,
-                        tuple(value)
-                        if isinstance(value, collections.abc.Iterable)
-                        else value,
-                    )
-                    for key, value in sorted(fn(ob).items())
-                )
-                res[hashable_dict].append(ob)
-            except TypeError:
-                res[fn(ob)].append(ob)
-        if not values:
-            return res
-        return res.values()
-
-    @staticmethod
-    def get_chunks(_iter, n: int = 0, fn=None):
-        """
-        Divides an iterable into chunks of specified size or based on a given function.
-        Useful for batching
-
-        Parameters:
-        - iter: The input iterable to be divided into chunks.
-        - n: An integer representing the size of each chunk. Default is 0.
-        - fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.
-
-        Returns:
-        An iterator that yields chunks of the input iterable.
-
-        Example usage:
-        ```
-        data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        for chunk in chunks(data, 3):
-            print(chunk)
-        ```
-        Output:
-        ```
-        [1, 2, 3]
-        [4, 5, 6]
-        [7, 8, 9]
-        [10]
-        ```
-        """
-        arr = []
-        _iter = tuple(_iter)
-        for i, x in enumerate(_iter):
-            arr.append(x)
-            if len(arr) == (fn(i, _iter) if fn else n):
-                yield arr
-                arr = []
-
-        if arr:
-            yield arr
--- a/scripts/model_comparator.py
+++ b/scripts/model_comparator.py
@@ -8,6 +8,7 @@ import scipy.stats
 import torch

 import lm_eval.evaluator
+import lm_eval.models.utils
 from lm_eval import tasks, utils


@@ -113,7 +114,7 @@ if __name__ == "__main__":
        batch_size=args.batch,
    )
    memory_stats()
-    utils.clear_torch_cache()
+    lm_eval.models.utils.clear_torch_cache()
    eval_logger.info("Memory stats cleared")
    memory_stats()
    results_hf = lm_eval.evaluator.simple_evaluate(

--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -9,8 +9,8 @@ from lm_eval.api.metrics import (
    pooled_sample_stderr,
    stderr_for_metric,
 )
+from lm_eval.models.utils import Collator
 from lm_eval.utils import (
-    Collator,
    get_rolling_token_windows,
    make_disjoint_window,
 )