Commit ab96fc7e authored by lintangsutawika

merged with latest update

parents bf2517cc 8680e938
@@ -46,6 +46,8 @@ This mode supports a number of command-line arguments, the details of which can
* `--predict_only`: Generates the model outputs without computing metrics. Use with `--log_samples` to retrieve decoded results.
* `--seed`: Set seed for python's random, numpy, and torch. Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, or a single integer to set the same seed for all three. Each value is either an integer or 'None' to leave that seed unset. Default is `0,1234,1234` (for backward compatibility). E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`; numpy's seed is not set since the second value is `None`. E.g. `--seed 42` sets all three seeds to 42.
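  A minimal standalone sketch of how these three values are consumed (mirroring the seeding logic this commit adds to `simple_evaluate`; illustrative only):

  ```python
  import random

  import numpy as np
  import torch

  # `--seed 0,None,8` arrives as [0, None, 8]
  random_seed, numpy_random_seed, torch_random_seed = 0, None, 8

  if random_seed is not None:
      random.seed(random_seed)
  if numpy_random_seed is not None:  # skipped here: the second value is None
      np.random.seed(numpy_random_seed)
  if torch_random_seed is not None:
      torch.manual_seed(torch_random_seed)
  ```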
## External Library Usage
We also support using the library's external API for use within model training loops or other scripts.
......
@@ -4,6 +4,7 @@ import logging
import os
import re
import sys
from functools import partial
from pathlib import Path
from typing import Union
@@ -23,6 +24,30 @@ def _handle_non_serializable(o):
return str(o)
def _int_or_none_list_arg_type(max_len: int, value: str, split_char: str = ","):
def parse_value(item):
item = item.strip().lower()
if item == "none":
return None
try:
return int(item)
except ValueError:
raise argparse.ArgumentTypeError(f"{item} is not an integer or None")
items = [parse_value(v) for v in value.split(split_char)]
num_items = len(items)
if num_items == 1:
# Makes downstream handling the same for single and multiple values
items = items * max_len
elif num_items != max_len:
raise argparse.ArgumentTypeError(
f"Argument requires {max_len} integers or None, separated by '{split_char}'"
)
return items
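For clarity, the expected behavior of this parser (illustrative, based on the function above; assumes `_int_or_none_list_arg_type` is in scope):

```python
import argparse  # for ArgumentTypeError; the parser itself is defined above

assert _int_or_none_list_arg_type(3, "0,None,8") == [0, None, 8]
assert _int_or_none_list_arg_type(3, "42") == [42, 42, 42]  # single value is broadcast

try:
    _int_or_none_list_arg_type(3, "1,2")  # wrong arity
except argparse.ArgumentTypeError as err:
    print(err)  # "Argument requires 3 integers or None, separated by ','"
```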
def parse_eval_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("--model", "-m", default="hf", help="Name of model e.g. `hf`")
@@ -149,6 +174,19 @@ def parse_eval_args() -> argparse.Namespace:
default=False,
help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
parser.add_argument(
"--seed",
type=partial(_int_or_none_list_arg_type, 3),
default="0,1234,1234", # for backward compatibility
help=(
"Set seed for python's random, numpy and torch.\n"
"Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, "
"or a single integer to set the same seed for all three.\n"
"The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234` (for backward compatibility).\n"
"E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`. Here numpy's seed is not set since the second value is `None`.\n"
"E.g, `--seed 42` sets all three seeds to 42."
),
)
return parser.parse_args()
@@ -251,6 +289,9 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
gen_kwargs=args.gen_kwargs,
task_manager=task_manager,
predict_only=args.predict_only,
random_seed=args.seed[0],
numpy_random_seed=args.seed[1],
torch_random_seed=args.seed[2],
)
if results is not None:
......
@@ -5,11 +5,11 @@ from collections.abc import Iterable
from collections import defaultdict
from typing import List
import evaluate
import numpy as np
import sacrebleu
import sklearn.metrics
import evaluate
from lm_eval.api.registry import register_aggregation, register_metric
@@ -470,13 +470,14 @@ def pooled_sample_stderr(stderrs: List[float], sizes: List[int]):
assert len(stderrs) == len(sizes)
# formula source: https://en.wikipedia.org/wiki/Pooled_variance
# and: https://stats.stackexchange.com/a/4841331
# this empirically seems to match running `stderr_for_metric` on all instances
# from the subtasks concatenated with each other.
pooled_sample_var = (
sum([(size - 1) * stderr**2 * size for size, stderr in zip(sizes, stderrs)])
) / (sum(sizes) - len(sizes))
return np.sqrt(pooled_sample_var / sum(sizes))
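Since each `stderr` is `s_i / sqrt(n_i)`, the new `* size` factor turns each term into `(n_i - 1) * s_i**2`, i.e. the standard pooled-variance numerator, and dividing by `sum(sizes)` before the square root converts the pooled variance back into a standard error for the combined sample. A rough standalone sanity check (illustrative only, not part of the commit; the subtask sizes are made up):

```python
import numpy as np

def pooled_sample_stderr(stderrs, sizes):
    # same formula as the updated function above
    pooled_sample_var = sum(
        (size - 1) * stderr**2 * size for size, stderr in zip(sizes, stderrs)
    ) / (sum(sizes) - len(sizes))
    return np.sqrt(pooled_sample_var / sum(sizes))

rng = np.random.default_rng(0)
groups = [rng.normal(size=n) for n in (50, 80, 120)]            # fake subtask scores
stderrs = [g.std(ddof=1) / np.sqrt(len(g)) for g in groups]     # per-subtask stderrs
pooled = pooled_sample_stderr(stderrs, [50, 80, 120])

combined = np.concatenate(groups)
direct = combined.std(ddof=1) / np.sqrt(len(combined))
print(pooled, direct)  # the two values should be close for same-distribution subtasks
```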
def combined_sample_stderr(stderrs: List[float], sizes: List[int], metrics=None):
@@ -515,7 +516,7 @@ def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True):
# A helper function that is used to aggregate
# subtask scores cross-task.
# TODO: does not hold for non-mean aggregations
if not weight_by_size:
sizes = [1] * len(sizes)
assert len(metrics) == len(sizes)
......
@@ -2,7 +2,6 @@ import logging
from typing import Callable, Dict
import evaluate
from lm_eval.api.model import LM
......
@@ -40,6 +40,9 @@ def simple_evaluate(
task_manager: TaskManager = None,
verbosity: str = "INFO",
predict_only: bool = False,
random_seed: int = 0,
numpy_random_seed: int = 1234,
torch_random_seed: int = 1234,
):
"""Instantiate and evaluate a model on a list of tasks.
@@ -75,18 +78,31 @@ def simple_evaluate(
Ignored for all tasks with loglikelihood output_type
:param predict_only: bool
If true only model outputs will be generated and returned. Metrics will not be evaluated
:param random_seed: int
Random seed for python's random module. If set to None, the seed will not be set.
:param numpy_random_seed: int
Random seed for numpy. If set to None, the seed will not be set.
:param torch_random_seed: int
Random seed for torch. If set to None, the seed will not be set.
:return
Dictionary of results
"""
random.seed(0)
np.random.seed(1234)
torch.manual_seed(
1234
) # TODO: this may affect training runs that are run with evaluation mid-run.
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
if random_seed is not None:
# See https://github.com/EleutherAI/lm-evaluation-harness/pull/1412
eval_logger.info(f"Setting random seed to {random_seed}")
random.seed(random_seed)
if numpy_random_seed is not None:
eval_logger.info(f"Setting numpy seed to {numpy_random_seed}")
np.random.seed(numpy_random_seed)
if torch_random_seed is not None:
eval_logger.info(f"Setting torch manual seed to {torch_random_seed}")
torch.manual_seed(torch_random_seed)
if tasks is None:
tasks = []
assert (
......
@@ -7,7 +7,10 @@ class RegexFilter(Filter):
""" """
def __init__(
self,
regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
group_select=0,
fallback: str = "[invalid]",
) -> None:
"""
pass a string `regex` to run `re.compile(r"regex")` on.
@@ -15,6 +18,7 @@ class RegexFilter(Filter):
"""
self.regex_pattern = regex_pattern
self.regex = re.compile(regex_pattern)
self.group_select = group_select
self.fallback = fallback
def apply(self, resps, docs):
@@ -25,9 +29,12 @@ class RegexFilter(Filter):
def filter_set(inst):
filtered = []
for resp in inst:
match = self.regex.findall(resp)
if match:
match = match[self.group_select]
if isinstance(match, tuple):
match = [m for m in match if m][0]
match = match.strip()
else:
match = self.fallback
filtered.append(match)
......
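To illustrate the switch from `search` to `findall` plus `group_select` (the example strings below are made up; `group_select=0` preserves the old first-match behavior):

```python
import re

pattern = re.compile(r"#### (\-?[0-9\.\,]+)")
resp = "step 1 gives 12 ... #### 12\nwait, correcting: #### 34"
matches = pattern.findall(resp)          # ['12', '34']
group_select = 0                         # default: keep the first match, as before
match = matches[group_select]
if isinstance(match, tuple):             # multi-group patterns yield tuples
    match = [m for m in match if m][0]
print(match.strip())                     # '12'; group_select=-1 would give '34'
```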
@@ -11,12 +11,11 @@ from . import neuron_optimum
# TODO: implement __all__
import os
try:
# enable hf hub transfer if available
import hf_transfer # type: ignore # noqa
import huggingface_hub.constants # type: ignore
huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
except ImportError:
pass
@@ -5,7 +5,7 @@ from tqdm import tqdm
from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import retry_on_specific_exceptions
eval_logger = utils.eval_logger
......
@@ -26,7 +26,13 @@ from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
Collator,
clear_torch_cache,
get_dtype,
pad_and_concat,
stop_sequences_criteria,
)
eval_logger = utils.eval_logger
@@ -503,13 +509,13 @@ class HFLM(LM):
if transformers.__version__ >= "4.30.0":
if model_kwargs.get("load_in_4bit", None):
if model_kwargs.get("bnb_4bit_compute_dtype", None):
model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(
model_kwargs["bnb_4bit_compute_dtype"]
)
self._model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained,
revision=revision,
torch_dtype=get_dtype(dtype),
trust_remote_code=trust_remote_code,
**model_kwargs,
)
@@ -639,10 +645,10 @@ class HFLM(LM):
self.accelerator.gather(max_rnk_bs).cpu().detach().numpy().tolist()
)
batch_size = min(gathered)
clear_torch_cache()
return batch_size
clear_torch_cache()
return batch_size
def tok_encode(
@@ -997,18 +1003,18 @@ class HFLM(LM):
# create encoder attn mask and batched conts, if seq2seq
call_kwargs = {}
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
batched_inps = pad_and_concat(
padding_len_inp, inps, padding_side="right"
) # [batch, padding_len_inp]
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
# TODO: left-pad encoder inps and mask?
batched_inps = pad_and_concat(
padding_len_inp, inps
) # [batch, padding_len_inp]
batched_conts = pad_and_concat(
padding_len_cont, conts
) # [batch, padding_len_cont]
batched_encoder_mask = pad_and_concat(
padding_len_inp, encoder_attns
) # [batch, padding_len_inp]
call_kwargs = {
......
@@ -2,7 +2,7 @@ from typing import Optional, Union
import torch
import lm_eval.models.utils
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
@@ -97,7 +97,9 @@ please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba
self._model = MambaLMHeadModel.from_pretrained(
pretrained,
device=self._device,
dtype=torch.float16
if dtype == "auto"
else lm_eval.models.utils.get_dtype(dtype),
)
def _model_generate(self, context, max_length, stop, **generation_kwargs):
......
@@ -13,10 +13,11 @@ from tqdm import tqdm
from transformers import GenerationConfig
from transformers.generation import StoppingCriteriaList
import lm_eval.models.utils
from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import stop_sequences_criteria
try:
@@ -239,7 +240,7 @@ class NEURON_HF(LM):
revision=revision,
trust_remote_code=trust_remote_code,
)
torch_dtype = lm_eval.models.utils.get_dtype(dtype)
assert torch_dtype in [
torch.float16,
@@ -550,7 +551,7 @@ class NEURON_HF(LM):
# automatic (variable) batch size detection for vectorization
# pull longest context sample from request
chunks = lm_eval.models.utils.chunks(
re_ord.get_reordered(),
n=self.batch_size,
fn=None,
@@ -603,7 +604,7 @@ class NEURON_HF(LM):
# create encoder attn mask and batched conts, if seq2seq
call_kwargs = {}
batched_inps = lm_eval.models.utils.pad_and_concat(
padding_len_inp, inps, padding_side="right"
) # [batch, padding_len_inp]
@@ -663,7 +664,7 @@ class NEURON_HF(LM):
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
grouper = lm_eval.models.utils.Grouper(requests, lambda x: str(x.args[1]))
for key, reqs in grouper.get_grouped().items():
# within each set of reqs for given kwargs, we reorder by token length, descending.
re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate)
@@ -672,7 +673,9 @@ class NEURON_HF(LM):
# for each different set of kwargs, we execute all requests, by batch.
for key, re_ord in re_ords.items():
chunks = lm_eval.models.utils.chunks(
re_ord.get_reordered(), n=self.batch_size
)
for chunk in tqdm(chunks, disable=self.rank != 0):
contexts, all_gen_kwargs = zip(*chunk)
# we assume all gen kwargs in the batch are the same
......
@@ -6,10 +6,12 @@ from typing import List, Literal, Optional, Tuple
from tqdm import tqdm
import lm_eval.models.utils
from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import retry_on_specific_exceptions
from lm_eval.utils import eval_logger
def get_result(response, ctxlen: int) -> Tuple[float, bool]:
@@ -219,7 +221,7 @@ class OpenaiCompletionsLM(LM):
re_ord = utils.Reorderer(requests, _collate)
for chunk in tqdm(
list(lm_eval.models.utils.chunks(re_ord.get_reordered(), self.batch_size)),
disable=disable_tqdm,
):
inps = []
@@ -429,7 +431,7 @@ class OpenaiChatCompletionsLM(LM):
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
grouper = lm_eval.models.utils.Grouper(requests, lambda x: str(x.args[1]))
for key, reqs in grouper.get_grouped().items():
# within each set of reqs for given kwargs, we reorder by token length, descending.
re_ords[key] = utils.Reorderer(
@@ -441,7 +443,7 @@ class OpenaiChatCompletionsLM(LM):
# n needs to be 1 because messages in
# chat completion are not batch but
# is regarded as a single conversation.
chunks = lm_eval.models.utils.chunks(re_ord.get_reordered(), n=1)
for chunk in chunks:
contexts, all_gen_kwargs = zip(*chunk)
inps = [{"role": "user", "content": context} for context in contexts]
......
@@ -19,7 +19,7 @@ from tqdm import tqdm
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import retry_on_specific_exceptions
logger = logging.getLogger(__name__)
......
import collections
import fnmatch
import gc
import time
from functools import wraps
from typing import (
Any,
Callable,
Iterable,
Iterator,
List,
Literal,
Optional,
Tuple,
Type,
Union,
)
import torch
import transformers
from lm_eval.utils import eval_logger
def chunks(iter, n: int = 0, fn=None):
"""
Divides an iterable into chunks of specified size or based on a given function.
Useful for batching
Parameters:
- iter: The input iterable to be divided into chunks.
- n: An integer representing the size of each chunk. Default is 0.
- fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.
Returns:
An iterator that yields chunks of the input iterable.
Example usage:
```
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for chunk in chunks(data, 3):
print(chunk)
```
Output:
```
[1, 2, 3]
[4, 5, 6]
[7, 8, 9]
[10]
```
"""
arr = []
for i, x in enumerate(iter):
arr.append(x)
if len(arr) == (fn(i, iter) if fn else n):
yield arr
arr = []
if arr:
yield arr
class MultiChoice:
def __init__(self, choices) -> None:
self.choices = choices
# Simple wildcard support (linux filename patterns)
def __contains__(self, values) -> bool:
for value in values.split(","):
if len(fnmatch.filter(self.choices, value)) == 0:
eval_logger.info("Available tasks to choose:")
for choice in self.choices:
eval_logger.info(f" - {choice}")
raise ValueError("'{}' is not in task list".format(value))
return True
def __iter__(self) -> Iterator:
for choice in self.choices:
yield choice
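A small illustrative use of `MultiChoice` (task names chosen arbitrarily); note that a non-matching pattern raises `ValueError` rather than returning `False`:

```python
tasks = MultiChoice(["arc_easy", "arc_challenge", "hellaswag"])
print("arc_*" in tasks)               # True: wildcard matches both arc tasks
print("arc_easy,hellaswag" in tasks)  # True: each comma-separated value is checked
# "winogrande" in tasks would log the available tasks and raise ValueError
```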
class Grouper:
"""
takes an array `arr` and function `fn` and returns a dictionary
with keys fn(ob) for each ob in `arr` and with values `self.arr[key]` a list of all
objects in `arr` satisfying `key == fn(ob)`.
"""
def __init__(self, arr, fn) -> None:
# self.orig_arr = arr
self.size = len(arr)
arr = list(enumerate(arr))
def group_return_dict(arr, fn):
res = collections.defaultdict(list)
for ob in arr:
res[fn(ob)].append(ob)
return res
arr = group_return_dict(arr, lambda x: fn(x[1]))
# self.arr has format Dict[Tuple[int, <entry from orig. arr>]]
self.arr = arr
self._grouped = None
def get_grouped(self):
# return the contents but not indices for our grouped dict.
if self._grouped:
return self._grouped
grouped = {}
for key in self.arr.keys():
# drop the index from each element of self.arr
grouped[key] = [y[1] for y in self.arr[key]]
self._grouped = grouped
return grouped
def get_original(self, grouped_dict):
# take in a grouped dictionary with e.g. results for each key listed
# in the same order as the instances in `self.arr`, and
# return the results in the same (single list) order as `self.orig_arr`.
res = [None] * self.size
cov = [False] * self.size
# orig = [None] * self.size
assert grouped_dict.keys() == self.arr.keys()
for key in grouped_dict.keys():
for (ind, _), v in zip(self.arr[key], grouped_dict[key]):
res[ind] = v
cov[ind] = True
# orig[ind] = _
assert all(cov)
# assert orig == self.orig_arr
return res
def pad_and_concat(
max_length: int,
tensors: List[torch.Tensor],
padding_side: Literal["right", "left"] = "right",
):
"""
Method for padding a list of tensors given the maximum tensor
length in the batch. Used for batching inputs and continuations in
seq2seq models.
"""
assert (
padding_side == "left" or padding_side == "right"
), f"Unrecognized padding type: '{padding_side}' not 'left' or 'right'"
for i, tensor in enumerate(tensors):
if len(tensor.shape) == 2:
tensor = tensor.squeeze(0) # squeeze, in case passed [1, seq] size
tensor_len = tensor.shape[0]
if tensor_len < max_length:
if padding_side == "right":
# right-pad
tensors[i] = torch.cat(
[
tensor, # [seq]
torch.zeros(
max_length - tensor_len,
dtype=torch.long,
device=tensor.device,
), # [padding_length - seq]
],
dim=0,
).unsqueeze(0)
else:
# left-pad
tensors[i] = torch.cat(
[
torch.zeros(
max_length - tensor_len,
dtype=torch.long,
device=tensor.device,
), # [padding_length - seq]
tensor, # [seq]
],
dim=0,
).unsqueeze(0)
else:
tensors[i] = tensor.unsqueeze(0)
return torch.cat(tensors, dim=0)
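A quick sketch of `pad_and_concat` on two token-id tensors (values are arbitrary):

```python
import torch

t1 = torch.tensor([1, 2, 3])
t2 = torch.tensor([4, 5, 6, 7, 8])
batch = pad_and_concat(5, [t1, t2], padding_side="right")
print(batch.shape)  # torch.Size([2, 5]); t1 is zero-padded on the right
print(batch[0])     # tensor([1, 2, 3, 0, 0])
```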
def clear_torch_cache() -> None:
gc.collect()
torch.cuda.empty_cache()
def get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
"""Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
if isinstance(dtype, str) and dtype != "auto":
# Convert `str` args torch dtype: `float16` -> `torch.float16`
_torch_dtype = getattr(torch, dtype)
else:
_torch_dtype = dtype
return _torch_dtype
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
"""Criteria to stop on the specified multi-token sequence."""
def __init__(
self,
sequence: str,
tokenizer: transformers.PreTrainedTokenizer,
initial_decoder_input_length: int,
batch_size: int,
) -> None:
self.initial_decoder_input_length = initial_decoder_input_length
self.done_tracker = [False] * batch_size
self.sequence = sequence
self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
# print(sequence, self.sequence_ids)
# we look back for 2 more tokens than it takes to encode our stop sequence
# because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']`
# and we don't want to mistakenly not stop a generation because our
# (string) stop sequence was output in a different tokenization
# NOTE: there is a minor danger that this will end up looking back 2 tokens into the past, into the inputs to the model,
# and stopping generation immediately as a result. With only 2 extra tokens of lookback, this risk is minimized
# Additionally, in lookback_ids_batch we should prevent ever looking back into the inputs as described.
self.sequence_id_len = len(self.sequence_ids) + 2
self.tokenizer = tokenizer
def __call__(self, input_ids, scores, **kwargs) -> bool:
# For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :]
lookback_ids_batch = lookback_ids_batch[:, -self.sequence_id_len :]
lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
for i, done in enumerate(self.done_tracker):
if not done:
self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
return False not in self.done_tracker
def stop_sequences_criteria(
tokenizer: transformers.PreTrainedTokenizer,
stop_sequences: List[str],
initial_decoder_input_length: int,
batch_size: int,
) -> transformers.StoppingCriteriaList:
return transformers.StoppingCriteriaList(
[
*[
MultiTokenEOSCriteria(
sequence, tokenizer, initial_decoder_input_length, batch_size
)
for sequence in stop_sequences
],
]
)
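A hedged usage sketch of `stop_sequences_criteria` with `transformers` generation (the `gpt2` checkpoint, prompt, and stop strings are placeholders, not taken from the harness):

```python
import transformers

tok = transformers.AutoTokenizer.from_pretrained("gpt2")
model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")

enc = tok("Q: What is 2 + 2?\nA:", return_tensors="pt")
criteria = stop_sequences_criteria(
    tok,
    ["\n\n", "Q:"],             # stop strings
    enc["input_ids"].shape[1],  # initial_decoder_input_length
    enc["input_ids"].shape[0],  # batch_size
)
out = model.generate(**enc, max_new_tokens=32, stopping_criteria=criteria)
print(tok.decode(out[0][enc["input_ids"].shape[1]:]))
```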
def divide(iterable, n) -> List[Iterator]:
"""Divide the elements from *iterable* into *n* parts, maintaining
order.
>>> group_1, group_2 = divide([1, 2, 3, 4, 5, 6], 2)
>>> list(group_1)
[1, 2, 3]
>>> list(group_2)
[4, 5, 6]
If the length of *iterable* is not evenly divisible by *n*, then the
length of the returned iterables will not be identical:
>>> children = divide([1, 2, 3, 4, 5, 6, 7], 3)
>>> [list(c) for c in children]
[[1, 2, 3], [4, 5], [6, 7]]
If the length of the iterable is smaller than n, then the last returned
iterables will be empty:
>>> children = divide([1, 2, 3], 5)
>>> [list(c) for c in children]
[[1], [2], [3], [], []]
This function will exhaust the iterable before returning and may require
significant storage. If order is not important, see :func:`distribute`,
which does not first pull the iterable into memory.
"""
if n < 1:
raise ValueError("n must be at least 1")
try:
iterable[:0]
except TypeError:
seq = tuple(iterable)
else:
seq = iterable
q, r = divmod(len(seq), n)
ret = []
stop = 0
for i in range(1, n + 1):
start = stop
stop += q + 1 if i <= r else q
ret.append(iter(seq[start:stop]))
return ret
def retry_on_specific_exceptions(
on_exceptions: List[Type[Exception]],
max_retries: Optional[int] = None,
backoff_time: float = 3.0,
backoff_multiplier: float = 1.5,
on_exception_callback: Optional[Callable[[Exception, float], Any]] = None,
):
"""Retry on an LLM Provider's rate limit error with exponential backoff
For example, to use for OpenAI, do the following:
```
from openai import RateLimitError
# Recommend specifying max_retries to avoid infinite loops!
@retry_on_specific_exceptions([RateLimitError], max_retries=3)
def completion(...):
# Wrap OpenAI completion function here
...
```
"""
def decorator(func: Callable):
@wraps(func)
def wrapper(*args, **kwargs):
sleep_time = backoff_time
attempt = 0
while max_retries is None or attempt < max_retries:
try:
return func(*args, **kwargs)
except tuple(on_exceptions) as e:
if on_exception_callback is not None:
on_exception_callback(e, sleep_time)
time.sleep(sleep_time)
sleep_time *= backoff_multiplier
attempt += 1
return wrapper
return decorator
class Collator:
"""
A class for reordering and batching elements of an array.
This class allows for sorting an array based on a provided sorting function, grouping elements based on a grouping function, and generating batches from the sorted and grouped data.
"""
def __init__(
self,
arr: List,
sort_fn: Callable,
group_fn: Callable = lambda x: x[1],
grouping: bool = False,
) -> None:
self.grouping = grouping
self.fn = sort_fn
self.group_fn = lambda x: group_fn(x[1]) # first index are enumerated indices
self.reorder_indices: List = []
self.size = len(arr)
self.arr_with_indices: Iterable[Any] = tuple(enumerate(arr)) # [indices, (arr)]
if self.grouping is True:
self.group_by_index()
def group_by_index(self) -> None:
self.arr_with_indices = self.group(
self.arr_with_indices, fn=self.group_fn, values=False
)
def get_batched(self, n: int = 1, batch_fn: Optional[Callable] = None) -> Iterator:
"""
Generates and yields batches from the reordered array.
Parameters:
- n (int): The size of each batch. Defaults to 1.
- batch_fn (Optional[Callable[[int, Iterable], int]]): A function to determine the size of each batch. Defaults to None.
Yields:
Iterator: An iterator over batches of reordered elements.
"""
if self.grouping:
for (
key,
values,
) in self.arr_with_indices.items(): # type: ignore
values = self._reorder(values)
batch = self.get_chunks(values, n=n, fn=batch_fn)
yield from batch
else:
values = self._reorder(self.arr_with_indices) # type: ignore
batch = self.get_chunks(values, n=n, fn=batch_fn)
yield from batch
def _reorder(self, arr: Union[List, Tuple[Tuple[int, Any], ...]]) -> List:
"""
Reorders the elements in the array based on the sorting function.
Parameters:
- arr (Union[List, Tuple[Tuple[int, Any], ...]]): The array or iterable to be reordered.
Yields:
List: Yields reordered elements one by one.
"""
arr = sorted(arr, key=lambda x: self.fn(x[1]))
self.reorder_indices.extend([x[0] for x in arr])
yield from [x[1] for x in arr]
def get_original(self, newarr: List) -> List:
"""
Restores the original order of elements from the reordered list.
Parameters:
- newarr (List): The reordered array.
Returns:
List: The array with elements restored to their original order.
"""
res = [None] * self.size
cov = [False] * self.size
for ind, v in zip(self.reorder_indices, newarr):
res[ind] = v
cov[ind] = True
assert all(cov)
return res
def __len__(self):
return self.size
@staticmethod
def group(arr: Iterable, fn: Callable, values: bool = False) -> Iterable:
"""
Groups elements of an iterable based on a provided function.
Parameters:
- arr (Iterable): The iterable to be grouped.
- fn (Callable): The function to determine the grouping.
- values (bool): If True, returns the values of the group. Defaults to False.
Returns:
Iterable: An iterable of grouped elements.
"""
res = collections.defaultdict(list)
for ob in arr:
try:
hashable_dict = tuple(
(
key,
tuple(value)
if isinstance(value, collections.abc.Iterable)
else value,
)
for key, value in sorted(fn(ob).items())
)
res[hashable_dict].append(ob)
except TypeError:
res[fn(ob)].append(ob)
if not values:
return res
return res.values()
@staticmethod
def get_chunks(_iter, n: int = 0, fn=None):
"""
Divides an iterable into chunks of specified size or based on a given function.
Useful for batching
Parameters:
- iter: The input iterable to be divided into chunks.
- n: An integer representing the size of each chunk. Default is 0.
- fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.
Returns:
An iterator that yields chunks of the input iterable.
Example usage:
```
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for chunk in chunks(data, 3):
print(chunk)
```
Output:
```
[1, 2, 3]
[4, 5, 6]
[7, 8, 9]
[10]
```
"""
arr = []
_iter = tuple(_iter)
for i, x in enumerate(_iter):
arr.append(x)
if len(arr) == (fn(i, _iter) if fn else n):
yield arr
arr = []
if arr:
yield arr
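A minimal illustrative use of `Collator` outside the harness (the data and sort key are made up):

```python
data = ["bbb", "a", "cc", "dddd"]
collator = Collator(data, sort_fn=len)        # sort by length, no grouping
batches = list(collator.get_batched(n=2))     # [['a', 'cc'], ['bbb', 'dddd']]
results = [s.upper() for batch in batches for s in batch]
print(collator.get_original(results))         # ['BBB', 'A', 'CC', 'DDDD']
```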
@@ -7,9 +7,8 @@ from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator, divide
from lm_eval.utils import (
Collator,
divide,
eval_logger,
get_rolling_token_windows,
make_disjoint_window,
......
@@ -38,7 +38,7 @@ Homepage: https://allenai.org/data/arc
#### Tasks
* `arc_easy`
* `arc_challenge`
### Checklist
......
@@ -7,22 +7,22 @@ metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
# ignore_punctuation: true
regexes_to_ignore:
- "\\.$"
- ","
- "\\\\"
- "\n"
- '"'
generation_kwargs:
until:
- "</s>"
- "Q:"
- "<|im_end|>"
- "<0x0A>"
do_sample: false
temperature: 0.0
filter_list:
- name: "get-answer"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
num_fewshot: 0
metadata:
version: 2.0
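Roughly how the new `ignore_case`/`regexes_to_ignore` options behave, assuming they are forwarded to the Hugging Face `evaluate` implementation of `exact_match` (the strings below are illustrative):

```python
import evaluate

exact_match = evaluate.load("exact_match")
score = exact_match.compute(
    predictions=["The Answer Is 72.\n"],
    references=["the answer is 72"],
    ignore_case=True,                        # enabled above
    regexes_to_ignore=["\\.$", ",", "\n"],   # subset of the list above
)
print(score["exact_match"])  # expected: 1.0 once case and ignored patterns are stripped
```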
"dataset_name": "boolean_expressions" "dataset_name": "boolean_expressions"
"description": "Evaluate the result of a random Boolean expression.\n\n" "description": "Evaluate the result of a random Boolean expression.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_boolean_expressions" "task": "bbh_cot_zeroshot_boolean_expressions"
filter_list:
- name: "flexible-extract"
filter:
- function: "regex"
group_select: -1
regex_pattern: "\\b(True|False)\\b"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "causal_judgement" "dataset_name": "causal_judgement"
"description": "Answer questions about causal attribution.\n\n" "description": "Answer questions about causal attribution.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_causal_judgement" "task": "bbh_cot_zeroshot_causal_judgement"
filter_list:
- name: "flexible-extract"
filter:
- function: "regex"
group_select: -1
regex_pattern: "\\b(Yes|No|yes|no)\\b"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "date_understanding" "dataset_name": "date_understanding"
"description": "Infer the date from context.\n\n" "description": "Infer the date from context.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_date_understanding" "task": "bbh_cot_zeroshot_date_understanding"
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"