Commit ab96fc7e authored by lintangsutawika

merged with latest update

parents bf2517cc 8680e938
@@ -46,6 +46,8 @@ This mode supports a number of command-line arguments, the details of which can
* `--predict_only`: Generates the model outputs without computing metrics. Use with `--log_samples` to retrieve decoded results.
* `--seed`: Set seed for python's random, numpy, and torch. Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, or a single integer to set the same seed for all three. Each value is either an integer or 'None' to leave that seed unset. Default is `0,1234,1234` (for backward compatibility). E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`; numpy's seed is not set since the second value is `None`. E.g. `--seed 42` sets all three seeds to 42.
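  A minimal standalone sketch of how these three values are consumed (mirroring the seeding logic this commit adds to `simple_evaluate`; illustrative only):

  ```python
  import random

  import numpy as np
  import torch

  # `--seed 0,None,8` arrives as [0, None, 8]
  random_seed, numpy_random_seed, torch_random_seed = 0, None, 8

  if random_seed is not None:
      random.seed(random_seed)
  if numpy_random_seed is not None:  # skipped here: the second value is None
      np.random.seed(numpy_random_seed)
  if torch_random_seed is not None:
      torch.manual_seed(torch_random_seed)
  ```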
## External Library Usage
We also support using the library's external API for use within model training loops or other scripts.
......
@@ -4,6 +4,7 @@ import logging
import os
import re
import sys
from functools import partial
from pathlib import Path
from typing import Union
@@ -23,6 +24,30 @@ def _handle_non_serializable(o):
return str(o)
def _int_or_none_list_arg_type(max_len: int, value: str, split_char: str = ","):
def parse_value(item):
item = item.strip().lower()
if item == "none":
return None
try:
return int(item)
except ValueError:
raise argparse.ArgumentTypeError(f"{item} is not an integer or None")
items = [parse_value(v) for v in value.split(split_char)]
num_items = len(items)
if num_items == 1:
# Makes downstream handling the same for single and multiple values
items = items * max_len
elif num_items != max_len:
raise argparse.ArgumentTypeError(
f"Argument requires {max_len} integers or None, separated by '{split_char}'"
)
return items
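For clarity, the expected behavior of this parser (illustrative, based on the function above; assumes `_int_or_none_list_arg_type` is in scope):

```python
import argparse  # for ArgumentTypeError; the parser itself is defined above

assert _int_or_none_list_arg_type(3, "0,None,8") == [0, None, 8]
assert _int_or_none_list_arg_type(3, "42") == [42, 42, 42]  # single value is broadcast

try:
    _int_or_none_list_arg_type(3, "1,2")  # wrong arity
except argparse.ArgumentTypeError as err:
    print(err)  # "Argument requires 3 integers or None, separated by ','"
```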
def parse_eval_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("--model", "-m", default="hf", help="Name of model e.g. `hf`")
@@ -149,6 +174,19 @@ def parse_eval_args() -> argparse.Namespace:
default=False,
help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
parser.add_argument(
"--seed",
type=partial(_int_or_none_list_arg_type, 3),
default="0,1234,1234", # for backward compatibility
help=(
"Set seed for python's random, numpy and torch.\n"
"Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, "
"or a single integer to set the same seed for all three.\n"
"The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234` (for backward compatibility).\n"
"E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`. Here numpy's seed is not set since the second value is `None`.\n"
"E.g, `--seed 42` sets all three seeds to 42."
),
)
return parser.parse_args()
@@ -251,6 +289,9 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
gen_kwargs=args.gen_kwargs,
task_manager=task_manager,
predict_only=args.predict_only,
random_seed=args.seed[0],
numpy_random_seed=args.seed[1],
torch_random_seed=args.seed[2],
)
if results is not None:
......
@@ -5,11 +5,11 @@ from collections.abc import Iterable
from collections import defaultdict
from typing import List
import evaluate
import numpy as np
import sacrebleu
import sklearn.metrics
import evaluate
from lm_eval.api.registry import register_aggregation, register_metric
@@ -470,13 +470,14 @@ def pooled_sample_stderr(stderrs: List[float], sizes: List[int]):
assert len(stderrs) == len(sizes)
# formula source: https://en.wikipedia.org/wiki/Pooled_variance
# and: https://stats.stackexchange.com/a/4841331
# this empirically seems to match running `stderr_for_metric` on all instances
# from the subtasks concatenated with each other.
pooled_sample_var = (
sum([(size - 1) * stderr**2 * size for size, stderr in zip(sizes, stderrs)])
) / (sum(sizes) - len(sizes))
return np.sqrt(pooled_sample_var / sum(sizes))
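Since each `stderr` is `s_i / sqrt(n_i)`, the new `* size` factor turns each term into `(n_i - 1) * s_i**2`, i.e. the standard pooled-variance numerator, and dividing by `sum(sizes)` before the square root converts the pooled variance back into a standard error for the combined sample. A rough standalone sanity check (illustrative only, not part of the commit; the subtask sizes are made up):

```python
import numpy as np

def pooled_sample_stderr(stderrs, sizes):
    # same formula as the updated function above
    pooled_sample_var = sum(
        (size - 1) * stderr**2 * size for size, stderr in zip(sizes, stderrs)
    ) / (sum(sizes) - len(sizes))
    return np.sqrt(pooled_sample_var / sum(sizes))

rng = np.random.default_rng(0)
groups = [rng.normal(size=n) for n in (50, 80, 120)]            # fake subtask scores
stderrs = [g.std(ddof=1) / np.sqrt(len(g)) for g in groups]     # per-subtask stderrs
pooled = pooled_sample_stderr(stderrs, [50, 80, 120])

combined = np.concatenate(groups)
direct = combined.std(ddof=1) / np.sqrt(len(combined))
print(pooled, direct)  # the two values should be close for same-distribution subtasks
```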
def combined_sample_stderr(stderrs: List[float], sizes: List[int], metrics=None):
@@ -515,7 +516,7 @@ def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True):
# A helper function that is used to aggregate
# subtask scores cross-task.
# TODO: does not hold for non-mean aggregations
if not weight_by_size:
sizes = [1] * len(sizes)
assert len(metrics) == len(sizes)
......
@@ -2,7 +2,6 @@ import logging
from typing import Callable, Dict
import evaluate
from lm_eval.api.model import LM
......
@@ -40,6 +40,9 @@ def simple_evaluate(
task_manager: TaskManager = None,
verbosity: str = "INFO",
predict_only: bool = False,
random_seed: int = 0,
numpy_random_seed: int = 1234,
torch_random_seed: int = 1234,
):
"""Instantiate and evaluate a model on a list of tasks.
@@ -75,18 +78,31 @@ def simple_evaluate(
Ignored for all tasks with loglikelihood output_type
:param predict_only: bool
If true only model outputs will be generated and returned. Metrics will not be evaluated
:param random_seed: int
Random seed for python's random module. If set to None, the seed will not be set.
:param numpy_random_seed: int
Random seed for numpy. If set to None, the seed will not be set.
:param torch_random_seed: int
Random seed for torch. If set to None, the seed will not be set.
:return
Dictionary of results
"""
random.seed(0)
np.random.seed(1234)
torch.manual_seed(
1234
) # TODO: this may affect training runs that are run with evaluation mid-run.
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
if random_seed is not None:
# See https://github.com/EleutherAI/lm-evaluation-harness/pull/1412
eval_logger.info(f"Setting random seed to {random_seed}")
random.seed(random_seed)
if numpy_random_seed is not None:
eval_logger.info(f"Setting numpy seed to {numpy_random_seed}")
np.random.seed(numpy_random_seed)
if torch_random_seed is not None:
eval_logger.info(f"Setting torch manual seed to {torch_random_seed}")
torch.manual_seed(torch_random_seed)
if tasks is None:
tasks = []
assert (
......
@@ -7,7 +7,10 @@ class RegexFilter(Filter):
""" """
def __init__(
self,
regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
group_select=0,
fallback: str = "[invalid]",
) -> None:
"""
pass a string `regex` to run `re.compile(r"regex")` on.
@@ -15,6 +18,7 @@ class RegexFilter(Filter):
"""
self.regex_pattern = regex_pattern
self.regex = re.compile(regex_pattern)
self.group_select = group_select
self.fallback = fallback
def apply(self, resps, docs):
@@ -25,9 +29,12 @@ class RegexFilter(Filter):
def filter_set(inst):
filtered = []
for resp in inst:
match = self.regex.findall(resp)
if match:
match = match[self.group_select]
if isinstance(match, tuple):
match = [m for m in match if m][0]
match = match.strip()
else:
match = self.fallback
filtered.append(match)
......
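To illustrate the switch from `search` to `findall` plus `group_select` (the example strings below are made up; `group_select=0` preserves the old first-match behavior):

```python
import re

pattern = re.compile(r"#### (\-?[0-9\.\,]+)")
resp = "step 1 gives 12 ... #### 12\nwait, correcting: #### 34"
matches = pattern.findall(resp)          # ['12', '34']
group_select = 0                         # default: keep the first match, as before
match = matches[group_select]
if isinstance(match, tuple):             # multi-group patterns yield tuples
    match = [m for m in match if m][0]
print(match.strip())                     # '12'; group_select=-1 would give '34'
```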
@@ -11,12 +11,11 @@ from . import neuron_optimum
# TODO: implement __all__
import os
try:
# enable hf hub transfer if available
import hf_transfer # type: ignore # noqa
import huggingface_hub.constants # type: ignore
huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
except ImportError:
pass
@@ -5,7 +5,7 @@ from tqdm import tqdm
from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import retry_on_specific_exceptions
eval_logger = utils.eval_logger
......
@@ -26,7 +26,13 @@ from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
Collator,
clear_torch_cache,
get_dtype,
pad_and_concat,
stop_sequences_criteria,
)
eval_logger = utils.eval_logger
@@ -503,13 +509,13 @@ class HFLM(LM):
if transformers.__version__ >= "4.30.0":
if model_kwargs.get("load_in_4bit", None):
if model_kwargs.get("bnb_4bit_compute_dtype", None):
model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(
model_kwargs["bnb_4bit_compute_dtype"]
)
self._model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained,
revision=revision,
torch_dtype=get_dtype(dtype),
trust_remote_code=trust_remote_code,
**model_kwargs,
)
@@ -639,10 +645,10 @@ class HFLM(LM):
self.accelerator.gather(max_rnk_bs).cpu().detach().numpy().tolist()
)
batch_size = min(gathered)
clear_torch_cache()
return batch_size
clear_torch_cache()
return batch_size
def tok_encode(
@@ -997,18 +1003,18 @@ class HFLM(LM):
# create encoder attn mask and batched conts, if seq2seq
call_kwargs = {}
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
batched_inps = pad_and_concat(
padding_len_inp, inps, padding_side="right"
) # [batch, padding_len_inp]
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
# TODO: left-pad encoder inps and mask?
batched_inps = pad_and_concat(
padding_len_inp, inps
) # [batch, padding_len_inp]
batched_conts = pad_and_concat(
padding_len_cont, conts
) # [batch, padding_len_cont]
batched_encoder_mask = pad_and_concat(
padding_len_inp, encoder_attns
) # [batch, padding_len_inp]
call_kwargs = {
......
@@ -2,7 +2,7 @@ from typing import Optional, Union
import torch
import lm_eval.models.utils
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
@@ -97,7 +97,9 @@ please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba
self._model = MambaLMHeadModel.from_pretrained(
pretrained,
device=self._device,
dtype=torch.float16
if dtype == "auto"
else lm_eval.models.utils.get_dtype(dtype),
)
def _model_generate(self, context, max_length, stop, **generation_kwargs):
......
@@ -13,10 +13,11 @@ from tqdm import tqdm
from transformers import GenerationConfig
from transformers.generation import StoppingCriteriaList
import lm_eval.models.utils
from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import stop_sequences_criteria
try:
@@ -239,7 +240,7 @@ class NEURON_HF(LM):
revision=revision,
trust_remote_code=trust_remote_code,
)
torch_dtype = lm_eval.models.utils.get_dtype(dtype)
assert torch_dtype in [
torch.float16,
@@ -550,7 +551,7 @@ class NEURON_HF(LM):
# automatic (variable) batch size detection for vectorization
# pull longest context sample from request
chunks = lm_eval.models.utils.chunks(
re_ord.get_reordered(),
n=self.batch_size,
fn=None,
@@ -603,7 +604,7 @@ class NEURON_HF(LM):
# create encoder attn mask and batched conts, if seq2seq
call_kwargs = {}
batched_inps = lm_eval.models.utils.pad_and_concat(
padding_len_inp, inps, padding_side="right"
) # [batch, padding_len_inp]
@@ -663,7 +664,7 @@ class NEURON_HF(LM):
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
grouper = lm_eval.models.utils.Grouper(requests, lambda x: str(x.args[1]))
for key, reqs in grouper.get_grouped().items():
# within each set of reqs for given kwargs, we reorder by token length, descending.
re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate)
@@ -672,7 +673,9 @@ class NEURON_HF(LM):
# for each different set of kwargs, we execute all requests, by batch.
for key, re_ord in re_ords.items():
chunks = lm_eval.models.utils.chunks(
re_ord.get_reordered(), n=self.batch_size
)
for chunk in tqdm(chunks, disable=self.rank != 0):
contexts, all_gen_kwargs = zip(*chunk)
# we assume all gen kwargs in the batch are the same
......
@@ -6,10 +6,12 @@ from typing import List, Literal, Optional, Tuple
from tqdm import tqdm
import lm_eval.models.utils
from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import retry_on_specific_exceptions
from lm_eval.utils import eval_logger
def get_result(response, ctxlen: int) -> Tuple[float, bool]:
@@ -219,7 +221,7 @@ class OpenaiCompletionsLM(LM):
re_ord = utils.Reorderer(requests, _collate)
for chunk in tqdm(
list(lm_eval.models.utils.chunks(re_ord.get_reordered(), self.batch_size)),
disable=disable_tqdm,
):
inps = []
@@ -429,7 +431,7 @@ class OpenaiChatCompletionsLM(LM):
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
grouper = lm_eval.models.utils.Grouper(requests, lambda x: str(x.args[1]))
for key, reqs in grouper.get_grouped().items():
# within each set of reqs for given kwargs, we reorder by token length, descending.
re_ords[key] = utils.Reorderer(
@@ -441,7 +443,7 @@ class OpenaiChatCompletionsLM(LM):
# n needs to be 1 because messages in
# chat completion are not batch but
# is regarded as a single conversation.
chunks = lm_eval.models.utils.chunks(re_ord.get_reordered(), n=1)
for chunk in chunks:
contexts, all_gen_kwargs = zip(*chunk)
inps = [{"role": "user", "content": context} for context in contexts]
......
@@ -19,7 +19,7 @@ from tqdm import tqdm
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import retry_on_specific_exceptions
logger = logging.getLogger(__name__)
......
import collections
import fnmatch
import gc
import time
from functools import wraps
from typing import (
Any,
Callable,
Iterable,
Iterator,
List,
Literal,
Optional,
Tuple,
Type,
Union,
)
import torch
import transformers
from lm_eval.utils import eval_logger
def chunks(iter, n: int = 0, fn=None):
"""
Divides an iterable into chunks of specified size or based on a given function.
Useful for batching
Parameters:
- iter: The input iterable to be divided into chunks.
- n: An integer representing the size of each chunk. Default is 0.
- fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.
Returns:
An iterator that yields chunks of the input iterable.
Example usage:
```
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for chunk in chunks(data, 3):
print(chunk)
```
Output:
```
[1, 2, 3]
[4, 5, 6]
[7, 8, 9]
[10]
```
"""
arr = []
for i, x in enumerate(iter):
arr.append(x)
if len(arr) == (fn(i, iter) if fn else n):
yield arr
arr = []
if arr:
yield arr
class MultiChoice:
def __init__(self, choices) -> None:
self.choices = choices
# Simple wildcard support (linux filename patterns)
def __contains__(self, values) -> bool:
for value in values.split(","):
if len(fnmatch.filter(self.choices, value)) == 0:
eval_logger.info("Available tasks to choose:")
for choice in self.choices:
eval_logger.info(f" - {choice}")
raise ValueError("'{}' is not in task list".format(value))
return True
def __iter__(self) -> Iterator:
for choice in self.choices:
yield choice
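A small illustrative use of `MultiChoice` (task names chosen arbitrarily); note that a non-matching pattern raises `ValueError` rather than returning `False`:

```python
tasks = MultiChoice(["arc_easy", "arc_challenge", "hellaswag"])
print("arc_*" in tasks)               # True: wildcard matches both arc tasks
print("arc_easy,hellaswag" in tasks)  # True: each comma-separated value is checked
# "winogrande" in tasks would log the available tasks and raise ValueError
```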
class Grouper:
"""
takes an array `arr` and function `fn` and returns a dictionary
with keys fn(ob) for each ob in `arr` and with values `self.arr[key]` a list of all
objects in `arr` satisfying `key == fn(ob)`.
"""
def __init__(self, arr, fn) -> None:
# self.orig_arr = arr
self.size = len(arr)
arr = list(enumerate(arr))
def group_return_dict(arr, fn):
res = collections.defaultdict(list)
for ob in arr:
res[fn(ob)].append(ob)
return res
arr = group_return_dict(arr, lambda x: fn(x[1]))
# self.arr has format Dict[Tuple[int, <entry from orig. arr>]]
self.arr = arr
self._grouped = None
def get_grouped(self):
# return the contents but not indices for our grouped dict.
if self._grouped:
return self._grouped
grouped = {}
for key in self.arr.keys():
# drop the index from each element of self.arr
grouped[key] = [y[1] for y in self.arr[key]]
self._grouped = grouped
return grouped
def get_original(self, grouped_dict):
# take in a grouped dictionary with e.g. results for each key listed
# in the same order as the instances in `self.arr`, and
# return the results in the same (single list) order as `self.orig_arr`.
res = [None] * self.size
cov = [False] * self.size
# orig = [None] * self.size
assert grouped_dict.keys() == self.arr.keys()
for key in grouped_dict.keys():
for (ind, _), v in zip(self.arr[key], grouped_dict[key]):
res[ind] = v
cov[ind] = True
# orig[ind] = _
assert all(cov)
# assert orig == self.orig_arr
return res
def pad_and_concat(
max_length: int,
tensors: List[torch.Tensor],
padding_side: Literal["right", "left"] = "right",
):
"""
Method for padding a list of tensors given the maximum tensor
length in the batch. Used for batching inputs and continuations in
seq2seq models.
"""
assert (
padding_side == "left" or padding_side == "right"
), f"Unrecognized padding type: '{padding_side}' not 'left' or 'right'"
for i, tensor in enumerate(tensors):
if len(tensor.shape) == 2:
tensor = tensor.squeeze(0) # squeeze, in case passed [1, seq] size
tensor_len = tensor.shape[0]
if tensor_len < max_length:
if padding_side == "right":
# right-pad
tensors[i] = torch.cat(
[
tensor, # [seq]
torch.zeros(
max_length - tensor_len,
dtype=torch.long,
device=tensor.device,
), # [padding_length - seq]
],
dim=0,
).unsqueeze(0)
else:
# left-pad
tensors[i] = torch.cat(
[
torch.zeros(
max_length - tensor_len,
dtype=torch.long,
device=tensor.device,
), # [padding_length - seq]
tensor, # [seq]
],
dim=0,
).unsqueeze(0)
else:
tensors[i] = tensor.unsqueeze(0)
return torch.cat(tensors, dim=0)
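A quick sketch of `pad_and_concat` on two token-id tensors (values are arbitrary):

```python
import torch

t1 = torch.tensor([1, 2, 3])
t2 = torch.tensor([4, 5, 6, 7, 8])
batch = pad_and_concat(5, [t1, t2], padding_side="right")
print(batch.shape)  # torch.Size([2, 5]); t1 is zero-padded on the right
print(batch[0])     # tensor([1, 2, 3, 0, 0])
```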
def clear_torch_cache() -> None:
gc.collect()
torch.cuda.empty_cache()
def get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
"""Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
if isinstance(dtype, str) and dtype != "auto":
# Convert `str` args torch dtype: `float16` -> `torch.float16`
_torch_dtype = getattr(torch, dtype)
else:
_torch_dtype = dtype
return _torch_dtype
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
"""Criteria to stop on the specified multi-token sequence."""
def __init__(
self,
sequence: str,
tokenizer: transformers.PreTrainedTokenizer,
initial_decoder_input_length: int,
batch_size: int,
) -> None:
self.initial_decoder_input_length = initial_decoder_input_length
self.done_tracker = [False] * batch_size
self.sequence = sequence
self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
# print(sequence, self.sequence_ids)
# we look back for 2 more tokens than it takes to encode our stop sequence
# because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']`
# and we don't want to mistakenly not stop a generation because our
# (string) stop sequence was output in a different tokenization
# NOTE: there is a minor danger that this will end up looking back 2 tokens into the past, into the inputs to the model,
# and stopping generation immediately as a result. With only 2 extra tokens of lookback, this risk is minimized
# Additionally, in lookback_ids_batch we should prevent ever looking back into the inputs as described.
self.sequence_id_len = len(self.sequence_ids) + 2
self.tokenizer = tokenizer
def __call__(self, input_ids, scores, **kwargs) -> bool:
# For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :]
lookback_ids_batch = lookback_ids_batch[:, -self.sequence_id_len :]
lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
for i, done in enumerate(self.done_tracker):
if not done:
self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
return False not in self.done_tracker
def stop_sequences_criteria(
tokenizer: transformers.PreTrainedTokenizer,
stop_sequences: List[str],
initial_decoder_input_length: int,
batch_size: int,
) -> transformers.StoppingCriteriaList:
return transformers.StoppingCriteriaList(
[
*[
MultiTokenEOSCriteria(
sequence, tokenizer, initial_decoder_input_length, batch_size
)
for sequence in stop_sequences
],
]
)
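A hedged usage sketch of `stop_sequences_criteria` with `transformers` generation (the `gpt2` checkpoint, prompt, and stop strings are placeholders, not taken from the harness):

```python
import transformers

tok = transformers.AutoTokenizer.from_pretrained("gpt2")
model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")

enc = tok("Q: What is 2 + 2?\nA:", return_tensors="pt")
criteria = stop_sequences_criteria(
    tok,
    ["\n\n", "Q:"],             # stop strings
    enc["input_ids"].shape[1],  # initial_decoder_input_length
    enc["input_ids"].shape[0],  # batch_size
)
out = model.generate(**enc, max_new_tokens=32, stopping_criteria=criteria)
print(tok.decode(out[0][enc["input_ids"].shape[1]:]))
```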
def divide(iterable, n) -> List[Iterator]:
"""Divide the elements from *iterable* into *n* parts, maintaining
order.
>>> group_1, group_2 = divide([1, 2, 3, 4, 5, 6], 2)
>>> list(group_1)
[1, 2, 3]
>>> list(group_2)
[4, 5, 6]
If the length of *iterable* is not evenly divisible by *n*, then the
length of the returned iterables will not be identical:
>>> children = divide([1, 2, 3, 4, 5, 6, 7], 3)
>>> [list(c) for c in children]
[[1, 2, 3], [4, 5], [6, 7]]
If the length of the iterable is smaller than n, then the last returned
iterables will be empty:
>>> children = divide([1, 2, 3], 5)
>>> [list(c) for c in children]
[[1], [2], [3], [], []]
This function will exhaust the iterable before returning and may require
significant storage. If order is not important, see :func:`distribute`,
which does not first pull the iterable into memory.
"""
if n < 1:
raise ValueError("n must be at least 1")
try:
iterable[:0]
except TypeError:
seq = tuple(iterable)
else:
seq = iterable
q, r = divmod(len(seq), n)
ret = []
stop = 0
for i in range(1, n + 1):
start = stop
stop += q + 1 if i <= r else q
ret.append(iter(seq[start:stop]))
return ret
def retry_on_specific_exceptions(
on_exceptions: List[Type[Exception]],
max_retries: Optional[int] = None,
backoff_time: float = 3.0,
backoff_multiplier: float = 1.5,
on_exception_callback: Optional[Callable[[Exception, float], Any]] = None,
):
"""Retry on an LLM Provider's rate limit error with exponential backoff
For example, to use for OpenAI, do the following:
```
from openai import RateLimitError
# Recommend specifying max_retries to avoid infinite loops!
@retry_on_specific_exceptions([RateLimitError], max_retries=3)
def completion(...):
# Wrap OpenAI completion function here
...
```
"""
def decorator(func: Callable):
@wraps(func)
def wrapper(*args, **kwargs):
sleep_time = backoff_time
attempt = 0
while max_retries is None or attempt < max_retries:
try:
return func(*args, **kwargs)
except tuple(on_exceptions) as e:
if on_exception_callback is not None:
on_exception_callback(e, sleep_time)
time.sleep(sleep_time)
sleep_time *= backoff_multiplier
attempt += 1
return wrapper
return decorator
class Collator:
"""
A class for reordering and batching elements of an array.
This class allows for sorting an array based on a provided sorting function, grouping elements based on a grouping function, and generating batches from the sorted and grouped data.
"""
def __init__(
self,
arr: List,
sort_fn: Callable,
group_fn: Callable = lambda x: x[1],
grouping: bool = False,
) -> None:
self.grouping = grouping
self.fn = sort_fn
self.group_fn = lambda x: group_fn(x[1]) # first index are enumerated indices
self.reorder_indices: List = []
self.size = len(arr)
self.arr_with_indices: Iterable[Any] = tuple(enumerate(arr)) # [indices, (arr)]
if self.grouping is True:
self.group_by_index()
def group_by_index(self) -> None:
self.arr_with_indices = self.group(
self.arr_with_indices, fn=self.group_fn, values=False
)
def get_batched(self, n: int = 1, batch_fn: Optional[Callable] = None) -> Iterator:
"""
Generates and yields batches from the reordered array.
Parameters:
- n (int): The size of each batch. Defaults to 1.
- batch_fn (Optional[Callable[[int, Iterable], int]]): A function to determine the size of each batch. Defaults to None.
Yields:
Iterator: An iterator over batches of reordered elements.
"""
if self.grouping:
for (
key,
values,
) in self.arr_with_indices.items(): # type: ignore
values = self._reorder(values)
batch = self.get_chunks(values, n=n, fn=batch_fn)
yield from batch
else:
values = self._reorder(self.arr_with_indices) # type: ignore
batch = self.get_chunks(values, n=n, fn=batch_fn)
yield from batch
def _reorder(self, arr: Union[List, Tuple[Tuple[int, Any], ...]]) -> List:
"""
Reorders the elements in the array based on the sorting function.
Parameters:
- arr (Union[List, Tuple[Tuple[int, Any], ...]]): The array or iterable to be reordered.
Yields:
List: Yields reordered elements one by one.
"""
arr = sorted(arr, key=lambda x: self.fn(x[1]))
self.reorder_indices.extend([x[0] for x in arr])
yield from [x[1] for x in arr]
def get_original(self, newarr: List) -> List:
"""
Restores the original order of elements from the reordered list.
Parameters:
- newarr (List): The reordered array.
Returns:
List: The array with elements restored to their original order.
"""
res = [None] * self.size
cov = [False] * self.size
for ind, v in zip(self.reorder_indices, newarr):
res[ind] = v
cov[ind] = True
assert all(cov)
return res
def __len__(self):
return self.size
@staticmethod
def group(arr: Iterable, fn: Callable, values: bool = False) -> Iterable:
"""
Groups elements of an iterable based on a provided function.
Parameters:
- arr (Iterable): The iterable to be grouped.
- fn (Callable): The function to determine the grouping.
- values (bool): If True, returns the values of the group. Defaults to False.
Returns:
Iterable: An iterable of grouped elements.
"""
res = collections.defaultdict(list)
for ob in arr:
try:
hashable_dict = tuple(
(
key,
tuple(value)
if isinstance(value, collections.abc.Iterable)
else value,
)
for key, value in sorted(fn(ob).items())
)
res[hashable_dict].append(ob)
except TypeError:
res[fn(ob)].append(ob)
if not values:
return res
return res.values()
@staticmethod
def get_chunks(_iter, n: int = 0, fn=None):
"""
Divides an iterable into chunks of specified size or based on a given function.
Useful for batching
Parameters:
- iter: The input iterable to be divided into chunks.
- n: An integer representing the size of each chunk. Default is 0.
- fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.
Returns:
An iterator that yields chunks of the input iterable.
Example usage:
```
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for chunk in chunks(data, 3):
print(chunk)
```
Output:
```
[1, 2, 3]
[4, 5, 6]
[7, 8, 9]
[10]
```
"""
arr = []
_iter = tuple(_iter)
for i, x in enumerate(_iter):
arr.append(x)
if len(arr) == (fn(i, _iter) if fn else n):
yield arr
arr = []
if arr:
yield arr
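A minimal illustrative use of `Collator` outside the harness (the data and sort key are made up):

```python
data = ["bbb", "a", "cc", "dddd"]
collator = Collator(data, sort_fn=len)        # sort by length, no grouping
batches = list(collator.get_batched(n=2))     # [['a', 'cc'], ['bbb', 'dddd']]
results = [s.upper() for batch in batches for s in batch]
print(collator.get_original(results))         # ['BBB', 'A', 'CC', 'DDDD']
```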
@@ -7,9 +7,8 @@ from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator, divide
from lm_eval.utils import (
Collator,
divide,
eval_logger,
get_rolling_token_windows,
make_disjoint_window,
......
@@ -38,7 +38,7 @@ Homepage: https://allenai.org/data/arc
#### Tasks
* `arc_easy`
* `arc_challenge`
### Checklist
......
@@ -7,22 +7,22 @@ metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
# ignore_punctuation: true
regexes_to_ignore:
- "\\.$"
- ","
- "\\\\"
- "\n"
- '"'
generation_kwargs:
until:
- "</s>"
- "Q:"
- "<|im_end|>"
- "<0x0A>"
do_sample: false
temperature: 0.0
filter_list:
- name: "get-answer"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
num_fewshot: 0
metadata:
version: 2.0
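Roughly how the new `ignore_case`/`regexes_to_ignore` options behave, assuming they are forwarded to the Hugging Face `evaluate` implementation of `exact_match` (the strings below are illustrative):

```python
import evaluate

exact_match = evaluate.load("exact_match")
score = exact_match.compute(
    predictions=["The Answer Is 72.\n"],
    references=["the answer is 72"],
    ignore_case=True,                        # enabled above
    regexes_to_ignore=["\\.$", ",", "\n"],   # subset of the list above
)
print(score["exact_match"])  # expected: 1.0 once case and ignored patterns are stripped
```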
"dataset_name": "boolean_expressions" "dataset_name": "boolean_expressions"
"description": "Evaluate the result of a random Boolean expression.\n\n" "description": "Evaluate the result of a random Boolean expression.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_boolean_expressions" "task": "bbh_cot_zeroshot_boolean_expressions"
filter_list:
- name: "flexible-extract"
filter:
- function: "regex"
group_select: -1
regex_pattern: "\\b(True|False)\\b"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "causal_judgement" "dataset_name": "causal_judgement"
"description": "Answer questions about causal attribution.\n\n" "description": "Answer questions about causal attribution.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_causal_judgement" "task": "bbh_cot_zeroshot_causal_judgement"
filter_list:
- name: "flexible-extract"
filter:
- function: "regex"
group_select: -1
regex_pattern: "\\b(Yes|No|yes|no)\\b"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "date_understanding" "dataset_name": "date_understanding"
"description": "Infer the date from context.\n\n" "description": "Infer the date from context.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_date_understanding" "task": "bbh_cot_zeroshot_date_understanding"
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"