"official/projects/yolo/modeling/factory.py" did not exist on "9c060b0cd1d58c6617718f5f3ec425ded0d49a4b"
Unverified Commit 9822b06e authored by Lintang Sutawika, committed by GitHub

Merge branch 'main' into weight_by_size

parents 51f27158 b177c82c
......@@ -51,7 +51,9 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
for lang in LANGUAGES:
file_name = f"xwinograd_{lang}.yaml"
try:
with open(f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf-8") as f:
with open(
f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf-8"
) as f:
f.write("# Generated by utils.py\n")
yaml.dump(
{
......
import collections
import fnmatch
import functools
import gc
import importlib.util
import inspect
import logging
import os
import pathlib
import re
import subprocess
import sys
import time
from functools import wraps
from itertools import islice
from typing import (
Any,
Callable,
Iterable,
Iterator,
List,
Literal,
Optional,
Tuple,
Type,
Union,
)
from typing import Any, Callable, List
import torch
import transformers
import numpy as np
import yaml
from jinja2 import BaseLoader, Environment, StrictUndefined
......@@ -99,44 +81,6 @@ def join_iters(iters):
yield from iter
def chunks(iter, n: int = 0, fn=None):
"""
Divides an iterable into chunks of specified size or based on a given function.
Useful for batching
Parameters:
- iter: The input iterable to be divided into chunks.
- n: An integer representing the size of each chunk. Default is 0.
- fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.
Returns:
An iterator that yields chunks of the input iterable.
Example usage:
```
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for chunk in chunks(data, 3):
print(chunk)
```
Output:
```
[1, 2, 3]
[4, 5, 6]
[7, 8, 9]
[10]
```
"""
arr = []
for i, x in enumerate(iter):
arr.append(x)
if len(arr) == (fn(i, iter) if fn else n):
yield arr
arr = []
if arr:
yield arr
def group(arr, fn):
res = collections.defaultdict(list)
......@@ -146,25 +90,6 @@ def group(arr, fn):
return list(res.values())
class MultiChoice:
def __init__(self, choices) -> None:
self.choices = choices
# Simple wildcard support (linux filename patterns)
def __contains__(self, values) -> bool:
for value in values.split(","):
if len(fnmatch.filter(self.choices, value)) == 0:
eval_logger.info("Available tasks to choose:")
for choice in self.choices:
eval_logger.info(f" - {choice}")
raise ValueError("'{}' is not in task list".format(value))
return True
def __iter__(self) -> Iterator:
for choice in self.choices:
yield choice
# Returns a list containing all values of the source_list that
# match at least one of the patterns
def pattern_match(patterns, source_list):
......@@ -178,6 +103,12 @@ def pattern_match(patterns, source_list):
return sorted(list(task_names))
def softmax(x):
"""Compute softmax values for each sets of scores in x."""
e_x = np.exp(x - np.max(x))
return e_x / e_x.sum()
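# Worked example (illustrative, not part of the library): softmax([1.0, 2.0, 3.0])
# is roughly [0.090, 0.245, 0.665]. Subtracting np.max(x) before exponentiating only
# rescales numerator and denominator by the same factor, so it changes nothing except
# numerical stability for large scores.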
def general_detokenize(string):
string = string.replace(" n't", "n't")
string = string.replace(" )", ")")
......@@ -283,64 +214,6 @@ class Reorderer:
return res
class Grouper:
"""
takes an array `arr` and function `fn` and returns a dictionary
with keys fn(ob) for each ob in `arr` and with values `self.arr[key]` a list of all
objects in `arr` satisfying `key == fn(ob)`.
"""
def __init__(self, arr, fn) -> None:
# self.orig_arr = arr
self.size = len(arr)
arr = list(enumerate(arr))
def group_return_dict(arr, fn):
res = collections.defaultdict(list)
for ob in arr:
res[fn(ob)].append(ob)
return res
arr = group_return_dict(arr, lambda x: fn(x[1]))
# self.arr has format Dict[Tuple[int, <entry from orig. arr>]]
self.arr = arr
self._grouped = None
def get_grouped(self):
# return the contents but not indices for our grouped dict.
if self._grouped:
return self._grouped
grouped = {}
for key in self.arr.keys():
# drop the index from each element of self.arr
grouped[key] = [y[1] for y in self.arr[key]]
self._grouped = grouped
return grouped
def get_original(self, grouped_dict):
# take in a grouped dictionary with e.g. results for each key listed
# in the same order as the instances in `self.arr`, and
# return the results in the same (single list) order as `self.orig_arr`.
res = [None] * self.size
cov = [False] * self.size
# orig = [None] * self.size
assert grouped_dict.keys() == self.arr.keys()
for key in grouped_dict.keys():
for (ind, _), v in zip(self.arr[key], grouped_dict[key]):
res[ind] = v
cov[ind] = True
# orig[ind] = _
assert all(cov)
# assert orig == self.orig_arr
return res
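# Hedged usage sketch for Grouper (values are made up, not taken from the library):
#   g = Grouper(["a", "bb", "c", "ddd"], fn=len)
#   g.get_grouped()  # {1: ["a", "c"], 2: ["bb"], 3: ["ddd"]}
#   g.get_original({1: [10, 30], 2: [20], 3: [40]})  # [10, 20, 30, 40]
# i.e. per-group results are scattered back into the order of the original array.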
def make_table(result_dict, column: str = "results"):
"""Generate table of results."""
from pytablewriter import LatexTableWriter, MarkdownTableWriter
......@@ -369,7 +242,7 @@ def make_table(result_dict, column: str = "results"):
values = []
for k, dic in result_dict[column].items():
version = result_dict["versions"][k]
version = result_dict["versions"].get(k, "N/A")
n = str(result_dict["n-shot"][k])
if "alias" in dic:
......@@ -417,59 +290,8 @@ def positional_deprecated(fn):
return _wrapper
@positional_deprecated
def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
"""
Search upward in the directory tree to a maximum of three layers
to find and return the package root (containing the 'tests' folder)
"""
cur_path = start_path.resolve()
max_layers = 3
for _ in range(max_layers):
if (cur_path / "tests" / "test_version_stable.py").exists():
return cur_path
else:
cur_path = cur_path.parent.resolve()
raise FileNotFoundError(
f"Unable to find package root within {max_layers} upwards" + f"of {start_path}"
)
@positional_deprecated
def run_task_tests(task_list: List[str]):
"""
Find the package root and run the tests for the given tasks
"""
import pytest
package_root = find_test_root(start_path=pathlib.Path(__file__))
task_string = " or ".join(task_list)
args = [
f"{package_root}/tests/test_version_stable.py",
f"--rootdir={package_root}",
"-k",
f"{task_string}",
]
sys.path.append(str(package_root))
pytest_return_val = pytest.main(args)
if pytest_return_val:
raise ValueError(
f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
)
def get_git_commit_hash():
"""
Gets the git commit hash of your current repo (if it exists).
Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
"""
try:
git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
git_hash = git_hash.decode()
except (subprocess.CalledProcessError, FileNotFoundError):
# FileNotFoundError occurs when git not installed on system
git_hash = None
return git_hash
def ignore_constructor(loader, node):
return node
def import_function(loader, node):
......@@ -489,11 +311,14 @@ def import_function(loader, node):
return function
# Add the import_function constructor to the YAML loader
yaml.add_constructor("!function", import_function)
def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None, mode="full"):
if mode == "simple":
constructor_fn = ignore_constructor
elif mode == "full":
constructor_fn = import_function
def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
# Add the import_function constructor to the YAML loader
yaml.add_constructor("!function", constructor_fn)
if yaml_config is None:
with open(yaml_path, "rb") as file:
yaml_config = yaml.full_load(file)
......@@ -521,7 +346,7 @@ def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
path = os.path.join(yaml_dir, path)
try:
included_yaml_config = load_yaml_config(path)
included_yaml_config = load_yaml_config(yaml_path=path, mode=mode)
final_yaml_config.update(included_yaml_config)
except Exception as ex:
# If failed to load, ignore
......@@ -546,389 +371,10 @@ def apply_template(template: str, doc: dict) -> str:
return rtemplate.render(**doc)
def create_iterator(raw_iterator, rank, world_size, limit=None):
def create_iterator(raw_iterator, *, rank=0, world_size=1, limit=None):
"""
Method for creating a (potentially) sliced and limited
iterator from a raw document iterator. Used for splitting data
among ranks in multigpu setting or only pulling a sample of documents
"""
return islice(raw_iterator, rank, limit, world_size)
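# Illustrative behaviour (example values only): documents are interleaved across ranks,
#   list(create_iterator(iter(range(10)), rank=0, world_size=2))  # [0, 2, 4, 6, 8]
#   list(create_iterator(iter(range(10)), rank=1, world_size=2))  # [1, 3, 5, 7, 9]
# and `limit` is an absolute stop index into the raw iterator, not a per-rank count.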
def pad_and_concat(
max_length: int,
tensors: List[torch.Tensor],
padding_side: Literal["right", "left"] = "right",
):
"""
Method for padding a list of tensors given the maximum tensor
length in the batch. Used for batching inputs and continuations in
seq2seq models.
"""
assert (
padding_side == "left" or padding_side == "right"
), f"Unrecognized padding type: '{padding_side}' not 'left' or 'right'"
for i, tensor in enumerate(tensors):
if len(tensor.shape) == 2:
tensor = tensor.squeeze(0) # squeeze, in case passed [1, seq] size
tensor_len = tensor.shape[0]
if tensor_len < max_length:
if padding_side == "right":
# right-pad
tensors[i] = torch.cat(
[
tensor, # [seq]
torch.zeros(
max_length - tensor_len,
dtype=torch.long,
device=tensor.device,
), # [padding_length - seq]
],
dim=0,
).unsqueeze(0)
else:
# left-pad
tensors[i] = torch.cat(
[
torch.zeros(
max_length - tensor_len,
dtype=torch.long,
device=tensor.device,
), # [padding_length - seq]
tensor, # [seq]
],
dim=0,
).unsqueeze(0)
else:
tensors[i] = tensor.unsqueeze(0)
return torch.cat(tensors, dim=0)
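# Rough usage sketch (tensor values are illustrative; the helper pads with zeros):
#   a, b = torch.tensor([1, 2, 3]), torch.tensor([4, 5])
#   pad_and_concat(3, [a, b], padding_side="right")
#   # tensor([[1, 2, 3],
#   #         [4, 5, 0]])
# With padding_side="left" the second row would instead be [0, 4, 5].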
def clear_torch_cache() -> None:
gc.collect()
torch.cuda.empty_cache()
def get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
"""Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
if isinstance(dtype, str) and dtype != "auto":
# Convert `str` args torch dtype: `float16` -> `torch.float16`
_torch_dtype = getattr(torch, dtype)
else:
_torch_dtype = dtype
return _torch_dtype
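# e.g. get_dtype("float16") -> torch.float16, get_dtype("auto") -> "auto",
# and a torch.dtype passed in is returned unchanged.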
# Multi-token stopping criteria
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
"""Criteria to stop on the specified multi-token sequence."""
def __init__(
self,
sequence: str,
tokenizer: transformers.PreTrainedTokenizer,
initial_decoder_input_length: int,
batch_size: int,
) -> None:
self.initial_decoder_input_length = initial_decoder_input_length
self.done_tracker = [False] * batch_size
self.sequence = sequence
self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
# print(sequence, self.sequence_ids)
# we look back for 2 more tokens than it takes to encode our stop sequence
# because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']`
# and we don't want to mistakenly not stop a generation because our
# (string) stop sequence was output in a different tokenization
# NOTE: there is a minor danger that this will end up looking back 2 tokens into the past, into the inputs to the model,
# and stopping generation immediately as a result. With only 2 extra tokens of lookback, this risk is minimized
# Additionally, in lookback_ids_batch we should prevent ever looking back into the inputs as described.
self.sequence_id_len = len(self.sequence_ids) + 2
self.tokenizer = tokenizer
def __call__(self, input_ids, scores, **kwargs) -> bool:
# For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :]
lookback_ids_batch = lookback_ids_batch[:, -self.sequence_id_len :]
lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
for i, done in enumerate(self.done_tracker):
if not done:
self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
return False not in self.done_tracker
def stop_sequences_criteria(
tokenizer: transformers.PreTrainedTokenizer,
stop_sequences: List[str],
initial_decoder_input_length: int,
batch_size: int,
) -> transformers.StoppingCriteriaList:
return transformers.StoppingCriteriaList(
[
*[
MultiTokenEOSCriteria(
sequence, tokenizer, initial_decoder_input_length, batch_size
)
for sequence in stop_sequences
],
]
)
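# Hedged usage sketch (variable names such as `context` are made up for illustration):
#   stopping_criteria = stop_sequences_criteria(
#       tokenizer, ["\n\n", "Question:"], context.shape[1], context.shape[0]
#   )
#   model.generate(context, stopping_criteria=stopping_criteria)
# One MultiTokenEOSCriteria is built per stop string, and each one signals stop only
# after every row in the batch has produced that string.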
# from more_itertools
def divide(iterable, n) -> List[Iterator]:
"""Divide the elements from *iterable* into *n* parts, maintaining
order.
>>> group_1, group_2 = divide([1, 2, 3, 4, 5, 6], 2)
>>> list(group_1)
[1, 2, 3]
>>> list(group_2)
[4, 5, 6]
If the length of *iterable* is not evenly divisible by *n*, then the
length of the returned iterables will not be identical:
>>> children = divide([1, 2, 3, 4, 5, 6, 7], 3)
>>> [list(c) for c in children]
[[1, 2, 3], [4, 5], [6, 7]]
If the length of the iterable is smaller than n, then the last returned
iterables will be empty:
>>> children = divide([1, 2, 3], 5)
>>> [list(c) for c in children]
[[1], [2], [3], [], []]
This function will exhaust the iterable before returning and may require
significant storage. If order is not important, see :func:`distribute`,
which does not first pull the iterable into memory.
"""
if n < 1:
raise ValueError("n must be at least 1")
try:
iterable[:0]
except TypeError:
seq = tuple(iterable)
else:
seq = iterable
q, r = divmod(len(seq), n)
ret = []
stop = 0
for i in range(1, n + 1):
start = stop
stop += q + 1 if i <= r else q
ret.append(iter(seq[start:stop]))
return ret
def retry_on_specific_exceptions(
on_exceptions: List[Type[Exception]],
max_retries: Optional[int] = None,
backoff_time: float = 3.0,
backoff_multiplier: float = 1.5,
on_exception_callback: Optional[Callable[[Exception, float], Any]] = None,
):
"""Retry on an LLM Provider's rate limit error with exponential backoff
For example, to use for OpenAI, do the following:
```
from openai import RateLimitError
# Recommend specifying max_retries to avoid infinite loops!
@retry_on_specific_exceptions([RateLimitError], max_retries=3)
def completion(...):
# Wrap OpenAI completion function here
...
```
"""
def decorator(func: Callable):
@wraps(func)
def wrapper(*args, **kwargs):
sleep_time = backoff_time
attempt = 0
while max_retries is None or attempt < max_retries:
try:
return func(*args, **kwargs)
except tuple(on_exceptions) as e:
if on_exception_callback is not None:
on_exception_callback(e, sleep_time)
time.sleep(sleep_time)
sleep_time *= backoff_multiplier
attempt += 1
return wrapper
return decorator
class Collator:
"""
A class for reordering and batching elements of an array.
This class allows for sorting an array based on a provided sorting function, grouping elements based on a grouping function, and generating batches from the sorted and grouped data.
"""
def __init__(
self,
arr: List,
sort_fn: Callable,
group_fn: Callable = lambda x: x[1],
grouping: bool = False,
) -> None:
self.grouping = grouping
self.fn = sort_fn
self.group_fn = lambda x: group_fn(x[1]) # x[0] is the enumerated index, so group on the original element x[1]
self.reorder_indices: List = []
self.size = len(arr)
self.arr_with_indices: Iterable[Any] = tuple(enumerate(arr)) # [indices, (arr)]
if self.grouping is True:
self.group_by_index()
def group_by_index(self) -> None:
self.arr_with_indices = self.group(
self.arr_with_indices, fn=self.group_fn, values=False
)
def get_batched(self, n: int = 1, batch_fn: Optional[Callable] = None) -> Iterator:
"""
Generates and yields batches from the reordered array.
Parameters:
- n (int): The size of each batch. Defaults to 1.
- batch_fn (Optional[Callable[[int, Iterable], int]]): A function to determine the size of each batch. Defaults to None.
Yields:
Iterator: An iterator over batches of reordered elements.
"""
if self.grouping:
for (
key,
values,
) in self.arr_with_indices.items(): # type: ignore
values = self._reorder(values)
batch = self.get_chunks(values, n=n, fn=batch_fn)
yield from batch
else:
values = self._reorder(self.arr_with_indices) # type: ignore
batch = self.get_chunks(values, n=n, fn=batch_fn)
yield from batch
def _reorder(self, arr: Union[List, Tuple[Tuple[int, Any], ...]]) -> List:
"""
Reorders the elements in the array based on the sorting function.
Parameters:
- arr (Union[List, Tuple[Tuple[int, Any], ...]]): The array or iterable to be reordered.
Yields:
List: Yields reordered elements one by one.
"""
arr = sorted(arr, key=lambda x: self.fn(x[1]))
self.reorder_indices.extend([x[0] for x in arr])
yield from [x[1] for x in arr]
def get_original(self, newarr: List) -> List:
"""
Restores the original order of elements from the reordered list.
Parameters:
- newarr (List): The reordered array.
Returns:
List: The array with elements restored to their original order.
"""
res = [None] * self.size
cov = [False] * self.size
for ind, v in zip(self.reorder_indices, newarr):
res[ind] = v
cov[ind] = True
assert all(cov)
return res
def __len__(self):
return self.size
@staticmethod
def group(arr: Iterable, fn: Callable, values: bool = False) -> Iterable:
"""
Groups elements of an iterable based on a provided function.
Parameters:
- arr (Iterable): The iterable to be grouped.
- fn (Callable): The function to determine the grouping.
- values (bool): If True, returns the values of the group. Defaults to False.
Returns:
Iterable: An iterable of grouped elements.
"""
res = collections.defaultdict(list)
for ob in arr:
try:
hashable_dict = tuple(
(
key,
tuple(value)
if isinstance(value, collections.abc.Iterable)
else value,
)
for key, value in sorted(fn(ob).items())
)
res[hashable_dict].append(ob)
except TypeError:
res[fn(ob)].append(ob)
if not values:
return res
return res.values()
@staticmethod
def get_chunks(_iter, n: int = 0, fn=None):
"""
Divides an iterable into chunks of specified size or based on a given function.
Useful for batching
Parameters:
- iter: The input iterable to be divided into chunks.
- n: An integer representing the size of each chunk. Default is 0.
- fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.
Returns:
An iterator that yields chunks of the input iterable.
Example usage:
```
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for chunk in chunks(data, 3):
print(chunk)
```
Output:
```
[1, 2, 3]
[4, 5, 6]
[7, 8, 9]
[10]
```
"""
arr = []
_iter = tuple(_iter)
for i, x in enumerate(_iter):
arr.append(x)
if len(arr) == (fn(i, _iter) if fn else n):
yield arr
arr = []
if arr:
yield arr
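# Hedged usage sketch for the Collator above (requests and sort key are made up):
#   reqs = ["bbbb", "a", "ccc", "dd"]
#   c = Collator(reqs, sort_fn=lambda s: -len(s))        # longest requests first
#   batches = list(c.get_batched(n=2))                   # [["bbbb", "ccc"], ["dd", "a"]]
#   flat_results = [len(x) for b in batches for x in b]  # stand-in model outputs: [4, 3, 2, 1]
#   c.get_original(flat_results)                         # [4, 1, 3, 2], back in input order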
......@@ -36,6 +36,8 @@ dependencies = [
"tqdm-multiprocess",
"transformers>=4.1",
"zstandard",
"dill",
"word2number",
]
[tool.setuptools.packages.find]
......@@ -57,7 +59,9 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
anthropic = ["anthropic"]
dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"]
gptq = ["auto-gptq[triton]>=0.6.0"]
hf_transfer = ["hf_transfer"]
ifeval = ["langdetect", "immutabledict"]
neuronx = ["optimum[neuronx]"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
......@@ -68,12 +72,13 @@ sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
vllm = ["vllm<=0.2.5"]
zeno = ["pandas", "zeno-client"]
wandb = ["wandb>=0.16.3", "pandas", "numpy"]
all = [
"lm_eval[anthropic]",
"lm_eval[dev]",
"lm_eval[gptq]",
"lm_eval[hf_transfer]",
"lm_eval[ifeval]",
"lm_eval[linting]",
"lm_eval[mamba]",
"lm_eval[math]",
"lm_eval[multilingual]",
......@@ -83,11 +88,9 @@ all = [
"lm_eval[testing]",
"lm_eval[vllm]",
"lm_eval[zeno]",
"lm_eval[wandb]",
]
[tool.ruff]
extend-exclude = ["lm_eval/evaluator.py", "lm_eval/tasks/*.py"]
[tool.ruff.lint]
extend-select = ["I"]
......@@ -96,5 +99,4 @@ lines-after-imports = 2
known-first-party = ["lm_eval"]
[tool.ruff.extend-per-file-ignores]
"__init__.py" = ["F401","F402","F403","I"]
"lm_eval/tasks/*"= ["E721"]
"__init__.py" = ["F401","F402","F403"]
......@@ -30,4 +30,7 @@ pip install pybind11
c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix)
```
If your compiler isn't linked to python, you may need to add to the above `-undefined dynamic_lookup`
MacOS users: If your compiler isn't linked to Python, you may need to add to the above `-undefined dynamic_lookup`. \
Linux users: If your compiler isn't linked to Python, you may need to follow these steps:
1. Rename the compiled code file to `janitor_util.so`.
2. Before running `import Janitor` in your code, add `sys.path.append("your/relative/path/to/janitor_util.so")` so that Python knows the location of `janitor_util.so`.
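For example, a minimal sketch of step 2, assuming the extension was built next to `janitor_util.cpp` under `lm_eval/decontamination/` (adjust the path to your build location):

```python
import sys

# sys.path entries are directories, so append the folder that
# contains the compiled janitor_util.so, not the .so file itself.
sys.path.append("lm_eval/decontamination")

import janitor_util  # noqa: E402
```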
......@@ -8,6 +8,7 @@ import scipy.stats
import torch
import lm_eval.evaluator
import lm_eval.models.utils
from lm_eval import tasks, utils
......@@ -113,7 +114,7 @@ if __name__ == "__main__":
batch_size=args.batch,
)
memory_stats()
utils.clear_torch_cache()
lm_eval.models.utils.clear_torch_cache()
eval_logger.info("Memory stats cleared")
memory_stats()
results_hf = lm_eval.evaluator.simple_evaluate(
......
"""
Usage:
python requests_caching.py --tasks=comma,separated,list,of,tasks --cache_requests=<true|refresh|delete>
"""
import argparse
import os
from typing import List
import torch
from transformers import (
pipeline as trans_pipeline,
)
from lm_eval import simple_evaluate
from lm_eval.evaluator import request_caching_arg_to_dict
from lm_eval.utils import eval_logger
MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
# Used to specify alternate cache path, useful if run in a docker container
# NOTE raw datasets will break if you try to transfer the cache from your host to a docker image
LM_HARNESS_CACHE_PATH = os.getenv("LM_HARNESS_CACHE_PATH")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL = "EleutherAI/pythia-70m"
TASK = "text-generation"
def run_model_for_task_caching(tasks: List[str], cache_requests: str):
eval_logger.info(f"Loading HF model: {MODEL}")
trans_pipe = trans_pipeline(
task=TASK, model=MODEL, device=DEVICE, trust_remote_code=True
)
model = trans_pipe.model
tokenizer = trans_pipe.tokenizer
eval_logger.info(
f"Running simple_evaluate to cache request objects for tasks: {tasks}"
)
cache_args = request_caching_arg_to_dict(cache_requests=cache_requests)
eval_logger.info(
f"The following operations will be performed on the cache: {cache_requests}"
)
eval_data = simple_evaluate(
model="hf-auto",
model_args={
"pretrained": model,
"tokenizer": tokenizer,
},
limit=1,
device=DEVICE,
tasks=tasks,
write_out=True,
**cache_args,
)
return eval_data
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tasks",
"-t",
default=None,
metavar="task1,task2",
)
parser.add_argument(
"--cache_requests",
type=str,
default=None,
choices=["true", "refresh", "delete"],
help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
)
args = parser.parse_args()
tasks = args.tasks.split(",")
eval_data = run_model_for_task_caching(
tasks=tasks, cache_requests=args.cache_requests
)
......@@ -5,7 +5,7 @@ import random
import numpy as np
from lm_eval import tasks
from lm_eval.tasks import include_path, initialize_tasks
from lm_eval.tasks import TaskManager
from lm_eval.utils import eval_logger, join_iters
......@@ -39,22 +39,21 @@ def main():
args = parse_args()
np.random.seed(args.seed)
initialize_tasks(args.verbosity)
if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}")
include_path(args.include_path)
task_manager = TaskManager(args.verbosity, include_path=args.include_path)
if args.tasks == "all_tasks":
task_names = tasks.ALL_TASKS
task_names = task_manager.all_tasks
else:
task_names = args.tasks.split(",")
task_dict = tasks.get_task_dict(task_names)
task_dict = tasks.get_task_dict(task_names, task_manager)
os.makedirs(args.output_base_path, exist_ok=True)
for task_name, task in task_dict.items():
if isinstance(task, tuple):
group_name, task = task
_, task = task
rnd = random.Random()
rnd.seed(args.seed)
......
......@@ -11,20 +11,21 @@ from lm_eval.api.instance import Instance
from lm_eval.models.huggingface import HFLM
tasks.initialize_tasks()
task_manager = tasks.TaskManager()
class Test_HFLM:
torch.use_deterministic_algorithms(True)
task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
version_minor = sys.version_info.minor
multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")() # type: ignore
multiple_choice_task = task_list["arc_easy"] # type: ignore
multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")() # type: ignore
generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
generate_until_task = task_list["gsm8k"] # type: ignore
generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
generate_until: list[Instance] = generate_until_task.instances
rolling_task = tasks.TASK_REGISTRY.get("wikitext")() # type: ignore
rolling_task = task_list["wikitext"] # type: ignore
rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
ROLLING: list[Instance] = rolling_task.instances
......@@ -73,7 +74,7 @@ class Test_HFLM:
generate_until_RES = [
" The average of $2.50 each is $",
" A robe takes 2 bolts of blue fiber and half",
" $50,000 in repairs.",
" $50,000 in repairs.\n\nQuestion",
" He runs 1 sprint 3 times a week.",
" They feed each of her chickens three cups of mixed",
" The price of the glasses is $5, but",
......
import pytest
import torch
from lm_eval.models.neuron_optimum import wrap_constant_batch_size
def test_wrap_constant_batch_size():
class Tester:
def __init__(self, batch_size):
self.batch_size = batch_size
@wrap_constant_batch_size
def test_constant_batch_size(self, inputs):
assert len(inputs) == self.batch_size
return inputs
batch_size_test = 8
for i in range(1, batch_size_test + 1):
tensor = torch.ones([i, 2, 2])
out = Tester(batch_size=batch_size_test).test_constant_batch_size(tensor)
torch.testing.assert_allclose(out, tensor)
with pytest.raises(ValueError):
Tester(batch_size=batch_size_test).test_constant_batch_size(
torch.ones([batch_size_test + 1, 2, 2])
)
......@@ -6,12 +6,9 @@ from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer
import lm_eval.evaluator as evaluator
import lm_eval.tasks as tasks
from lm_eval.api.registry import get_model
tasks.initialize_tasks()
SUPPORTED_ARCHITECTURES_TASKS = {
"facebook/opt-125m": "lambada_openai",
"hf-internal-testing/tiny-random-gpt2": "wikitext",
......
......@@ -7,6 +7,9 @@ import lm_eval.tasks as tasks
from lm_eval.api.instance import Instance
task_manager = tasks.TaskManager()
@pytest.mark.skip(reason="requires CUDA")
class TEST_VLLM:
vllm = pytest.importorskip("vllm")
......@@ -17,15 +20,15 @@ class TEST_VLLM:
except ModuleNotFoundError:
pass
torch.use_deterministic_algorithms(True)
tasks.initialize_tasks()
multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")() # type: ignore
task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
multiple_choice_task = task_list["arc_easy"] # type: ignore
multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
MULTIPLE_CH: List[Instance] = multiple_choice_task.instances
generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")() # type: ignore
generate_until_task = task_list["gsm8k"] # type: ignore
generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
generate_until: List[Instance] = generate_until_task.instances
rolling_task = tasks.TASK_REGISTRY.get("wikitext")() # type: ignore
rolling_task = task_list["wikitext"] # type: ignore
rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
ROLLING: List[Instance] = rolling_task.instances
......
......@@ -6,11 +6,9 @@ import pytest
# import lm_eval.models as models
import lm_eval.api as api
import lm_eval.evaluator as evaluator
import lm_eval.tasks as tasks
from lm_eval import tasks
tasks.initialize_tasks()
# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces
......@@ -46,7 +44,8 @@ def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str
"device": None,
},
)
task_dict = tasks.get_task_dict(task_name, num_fewshot=0)
task_manager = tasks.TaskManager()
task_dict = tasks.get_task_dict(task_name, task_manager)
e2 = evaluator.evaluate(
lm=lm,
......
# import lm_eval.base as base
import importlib
import os
import sys
from datetime import datetime
from typing import List, Tuple
import pytest
import torch
# import lm_eval.models as models
from lm_eval.caching.cache import PATH
MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
# NOTE the script this loads uses simple evaluate
# TODO potentially test both the helper script and the normal script
sys.path.append(f"{MODULE_DIR}/../scripts")
model_loader = importlib.import_module("requests_caching")
run_model_for_task_caching = model_loader.run_model_for_task_caching
DEFAULT_TASKS = ["lambada_openai", "hellaswag"]
@pytest.fixture(autouse=True)
def setup_and_teardown():
# Setup
torch.use_deterministic_algorithms(False)
clear_cache()
# Yields control back to the test function
yield
# Cleanup here
def clear_cache():
if os.path.exists(PATH):
cache_files = os.listdir(PATH)
for file in cache_files:
file_path = f"{PATH}/{file}"
os.unlink(file_path)
# leaving tasks here to allow for the option to select specific task files
def get_cache_files(tasks: List[str] = None) -> Tuple[List[str], List[str]]:
cache_files = os.listdir(PATH)
file_task_names = []
for file in cache_files:
file_without_prefix = file.split("-")[1]
file_without_prefix_and_suffix = file_without_prefix.split(".")[0]
file_task_names.append(file_without_prefix_and_suffix)
return cache_files, file_task_names
def assert_created(tasks: List[str], file_task_names: List[str]):
tasks.sort()
file_task_names.sort()
assert tasks == file_task_names
@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_true(tasks: List[str]):
run_model_for_task_caching(tasks=tasks, cache_requests="true")
cache_files, file_task_names = get_cache_files()
assert_created(tasks=tasks, file_task_names=file_task_names)
@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_refresh(tasks: List[str]):
run_model_for_task_caching(tasks=tasks, cache_requests="true")
timestamp_before_test = datetime.now().timestamp()
run_model_for_task_caching(tasks=tasks, cache_requests="refresh")
cache_files, file_task_names = get_cache_files()
for file in cache_files:
modification_time = os.path.getmtime(f"{PATH}/{file}")
assert modification_time > timestamp_before_test
tasks.sort()
file_task_names.sort()
assert tasks == file_task_names
@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_delete(tasks: List[str]):
# populate the data first, rerun this test within this test for additional confidence
test_requests_caching_true(tasks=tasks)
run_model_for_task_caching(tasks=tasks, cache_requests="delete")
cache_files, file_task_names = get_cache_files()
assert len(cache_files) == 0
# useful for locally running tests through the debugger
if __name__ == "__main__":
def run_tests():
tests = [
test_requests_caching_true,
test_requests_caching_refresh,
test_requests_caching_delete,
]
for test_func in tests:
clear_cache()
test_func(tasks=DEFAULT_TASKS)
print("Tests pass")
run_tests()
......@@ -8,7 +8,7 @@ from lm_eval.api.task import ConfigurableTask
from .utils import new_tasks
tasks.initialize_tasks()
task_manager = tasks.TaskManager()
# Default Task
TASKS = ["arc_easy"]
......@@ -19,9 +19,9 @@ def task_class():
task_classes = new_tasks()
# Check if task_classes is empty
if task_classes:
return [tasks.TASK_REGISTRY.get(x)() for x in task_classes]
return list(task_manager.load_task_or_group(task_classes).values())
else:
return [tasks.TASK_REGISTRY.get(x)() for x in TASKS]
return list(task_manager.load_task_or_group(TASKS).values())
@pytest.fixture()
......
import itertools
import numpy as np
import pytest
import torch
from lm_eval.utils import Collator, get_rolling_token_windows, make_disjoint_window
from lm_eval.api.metrics import (
aggregate_subtask_metrics,
mean,
pooled_sample_stderr,
stderr_for_metric,
)
from lm_eval.models.utils import Collator
from lm_eval.utils import (
get_rolling_token_windows,
make_disjoint_window,
)
# noinspection DuplicatedCode
......@@ -245,12 +259,20 @@ class TestCollator:
]
return samples
def make_loglikelihood_sample_group(self, end=11):
a = [(("x", "x"), [1, 2, 3, 4, 5, 6, 7, 8], [x]) for x in range(9)]
b = [
(("x", "x"), [1, 2, 3, 4, 5, 6, 7, 8], [x, y, z])
for x, y, z in zip(range(9), range(9, 18), range(18, 27))
]
return a + b
@pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 9)])
def test_generations(self, batch_size, end):
_collate_gen = lambda x: (-len(x[0]), x[0]) # noqa: E731
generation_samples = self.make_generate_sample(int(end))
gens = Collator(generation_samples, _collate_gen, grouping=True)
gens = Collator(generation_samples, _collate_gen, group_by="gen_kwargs")
chunks = gens.get_batched(n=int(batch_size), batch_fn=None)
output = []
for chunks in chunks:
......@@ -279,7 +301,10 @@ class TestCollator:
def test_loglikelihood(self, batch_size, end):
_collate_log = lambda x: (-len(x[1]), tuple(x[1])) # noqa: E731
loglikelihood_samples = self.make_loglikelihood_sample(int(end))
loglikelihoods = Collator(loglikelihood_samples, _collate_log, grouping=False)
loglikelihoods = Collator(
loglikelihood_samples,
_collate_log,
)
chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
output = []
for chunks in chunks:
......@@ -295,3 +320,81 @@ class TestCollator:
# check indices
reordered_output = loglikelihoods.get_original(output)
assert reordered_output == [x[1] for x in loglikelihood_samples]
@pytest.mark.parametrize("batch_size", [17, 8, 12, 0])
def test_context_grouping(self, batch_size):
def _collate(x):
toks = x[1] + x[2]
return -len(toks), tuple(toks)
_collate_log = _collate # noqa: E731
loglikelihood_samples = self.make_loglikelihood_sample_group()
loglikelihoods = Collator(
loglikelihood_samples,
_collate_log,
group_fn=lambda a: a[-2] + a[-1][:-1],
group_by="contexts",
)
chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
output = []
outputs_ = []
for chunks in chunks:
# check batching
if batch_size != 0:
assert len(chunks) <= batch_size
# check reorder
assert all(
len(chunks[i][1]) <= len(chunks[i - 1][1])
for i in range(1, len(chunks))
)
for x in chunks:
for request_str, cont_toks, logits in loglikelihoods.get_cache(
req_str="".join(x[0]),
cxt_toks=x[1],
cont_toks=x[2],
logits=torch.tensor([1, 2, 3, 4, 5, 6, 7, 8])
.unsqueeze(0)
.unsqueeze(0),
):
output.append(x[1])
outputs_.append(cont_toks)
assert len(output) == len(outputs_)
# check indices
reordered_output = loglikelihoods.get_original(output)
assert reordered_output == [x[1] for x in loglikelihood_samples]
def test_aggregate_mean():
# test weight_by_size is respected
assert (
aggregate_subtask_metrics([0.3, 0.2, 0.4], [20, 40, 100], weight_by_size=False)
== 0.3
)
assert (
aggregate_subtask_metrics([0.3, 0.2, 0.4], [20, 40, 100], weight_by_size=True)
== 0.3375
)
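# Sanity check on the expected values above:
#   unweighted: (0.3 + 0.2 + 0.4) / 3 = 0.3
#   weighted:   (0.3 * 20 + 0.2 * 40 + 0.4 * 100) / (20 + 40 + 100) = 54 / 160 = 0.3375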
@pytest.mark.parametrize(
"samples",
[
[40 * [1.0] + 60 * [0.0], 30 * [1.0] + 30 * [0.0], 20 * [1.0] + 60 * [0.0]],
[35 * [1.0] + 65 * [0.0], 20 * [1.0] + 20 * [0.0]],
],
)
def test_aggregate_stderrs(samples):
# check that aggregating subtasks' bootstrap stderrs with our formula
# (using weight_by_size) is ~equiv.
# to just getting bootstrap stderr of the whole set of samples
mean_stderr = stderr_for_metric(metric=mean, bootstrap_iters=100000)
stderrs = [mean_stderr(subtask) for subtask in samples]
sizes = [len(subtask) for subtask in samples]
assert np.allclose(
pooled_sample_stderr(stderrs, sizes),
mean_stderr(list(itertools.chain.from_iterable(samples))),
atol=1.0e-3,
)
import os
from pathlib import Path
from typing import List, Union
from lm_eval.utils import load_yaml_config
......@@ -20,17 +19,18 @@ def load_changed_files(file_path: str) -> List[str]:
# checks the txt file for list of changed files.
# if file ends with .yaml then check yaml for task name
# if file ends with .py then parse the folder for all yaml files
# skips benchmarks folder
# if file ends with .yaml then check yaml and load the config.
# if the config task is a string, it's a task config.
# if the config task is a list, it's a group config.
def parser(full_path: List[str]) -> List[str]:
_output = set()
for x in full_path:
if x.endswith(".yaml") and "benchmarks" not in x:
_output.add(load_yaml_config(x)["task"])
elif x.endswith(".py") and "benchmarks" not in x:
path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
_output |= {load_yaml_config(x)["task"] for x in path}
if os.path.exists(x) and x.endswith(".yaml"):
config = load_yaml_config(x, mode="simple")
if isinstance(config["task"], str):
_output.add(config["task"])
elif isinstance(config["task"], list):
_output.add(config["group"])
return list(_output)
......