Unverified Commit 0594fe2b authored by Ethan Smith's avatar Ethan Smith Committed by GitHub

Merge branch 'big-refactor' into pyproject_toml_v2

parents 40c9f9cb b65b9ca3
upper_pronouns = [
    "A",
    "An",
    "The",
    "She",
    "He",
    "It",
    "They",
    "My",
    "His",
    "Her",
    "Their",
]
def process_doc(dataset):
    def process_fn(doc):
        # The HF implementation of `wsc273` is not `partial evaluation` friendly.
        doc["text"] = doc["text"].replace("  ", " ")  # collapse double spaces
        doc["options"][0] = __normalize_option(doc, doc["options"][0])
        doc["options"][1] = __normalize_option(doc, doc["options"][1])
        return doc

    return dataset.map(process_fn)
def __normalize_option(doc, option):
    # Append `'s` to options based on possessive determiners.
    if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
        option += "'s"
    # Appropriately lowercase the pronoun in the option.
    pronoun = option.split()[0]
    start_of_sentence = doc["text"][doc["pronoun_loc"] - 2] == "."
    if not start_of_sentence and pronoun in upper_pronouns:
        return option.replace(pronoun, pronoun.lower())
    return option
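
# For illustration: a hypothetical wsc273-style doc (field names follow the
# HF `winograd_wsc` schema used above; the values, including `pronoun_loc`,
# are made up for this example).
_example_doc = {
    "text": "The trophy doesn't fit in the suitcase because it is too big.",
    "pronoun": "it",
    "pronoun_loc": 47,  # assumed character offset of "it" in `text`
    "options": ["The trophy", "The suitcase"],
}
# "it" sits mid-sentence, so the option's leading "The" gets lowercased:
assert __normalize_option(_example_doc, "The trophy") == "the trophy"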
@@ -10,7 +10,7 @@ import collections
import importlib.util
import fnmatch
from typing import List, Literal, Union
from typing import Iterator, List, Literal, Union
import gc
import torch
@@ -65,7 +65,7 @@ def join_iters(iters):
        yield from iter
def chunks(iter, n=0, fn=None):
def chunks(iter, n: int = 0, fn=None):
    arr = []
    for i, x in enumerate(iter):
        arr.append(x)
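
# The hunk cuts off before this generator's yield. A minimal sketch of how
# the elided remainder presumably behaves (assumption: emit `n`-sized
# batches, let `fn(index, iterable)` override the size, flush the remainder):
def _chunks_sketch(iterable, n: int = 0, fn=None):
    arr = []
    for i, x in enumerate(iterable):
        arr.append(x)
        if len(arr) == (fn(i, iterable) if fn else n):
            yield arr
            arr = []
    if arr:
        yield arr

# e.g. list(_chunks_sketch(range(5), n=2)) -> [[0, 1], [2, 3], [4]]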
@@ -87,11 +87,11 @@ def group(arr, fn):
class MultiChoice:
    def __init__(self, choices):
    def __init__(self, choices) -> None:
        self.choices = choices

    # Simple wildcard support (Linux filename patterns)
    def __contains__(self, values):
    def __contains__(self, values) -> bool:
        for value in values.split(","):
            if len(fnmatch.filter(self.choices, value)) == 0:
                eval_logger.info("Available tasks to choose:")
@@ -100,7 +100,7 @@ class MultiChoice:
                raise ValueError("'{}' is not in task list".format(value))
        return True

    def __iter__(self):
    def __iter__(self) -> Iterator:
        for choice in self.choices:
            yield choice
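
# For illustration (hypothetical task names; the real choices come from
# ALL_TASKS): values are comma-separated and matched fnmatch-style.
_choices = MultiChoice(["arc_easy", "arc_challenge", "hellaswag"])
assert "arc_*" in _choices
assert "arc_easy,hellaswag" in _choices
# "mmlu" in _choices would log the available tasks and raise ValueError.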
@@ -108,7 +108,6 @@ class MultiChoice:
# Returns a list containing all values of the source_list that
# match at least one of the patterns
def pattern_match(patterns, source_list):
    if isinstance(patterns, str):
        patterns = [patterns]
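
# The rest of `pattern_match` is elided by the hunk. A sketch of the likely
# remainder, assuming it collects fnmatch hits against `source_list`:
def _pattern_match_sketch(patterns, source_list):
    if isinstance(patterns, str):
        patterns = [patterns]
    task_names = set()
    for pattern in patterns:
        task_names.update(fnmatch.filter(source_list, pattern))
    return sorted(task_names)

# e.g. _pattern_match_sketch("arc_*", ["arc_easy", "boolq"]) -> ["arc_easy"]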
@@ -177,7 +176,7 @@ def make_disjoint_window(pair):
class Reorderer:
    def __init__(self, arr, fn):
    def __init__(self, arr, fn) -> None:
        self.size = len(arr)
        arr = list(enumerate(arr))
        arr = group(arr, lambda x: fn(x[1]))
@@ -212,7 +211,7 @@ class Grouper:
    objects in `arr` satisfying `key == fn(ob)`.
    """

    def __init__(self, arr, fn):
    def __init__(self, arr, fn) -> None:
        # self.orig_arr = arr
        self.size = len(arr)
        arr = list(enumerate(arr))
@@ -263,7 +262,7 @@ class Grouper:
        return res


def make_table(result_dict, column="results"):
def make_table(result_dict, column: str = "results"):
    """Generate table of results."""
    from pytablewriter import MarkdownTableWriter, LatexTableWriter
@@ -393,7 +392,6 @@ def get_git_commit_hash():
def import_function(loader, node):
    function_name = loader.construct_scalar(node)
    yaml_path = os.path.dirname(loader.name)
@@ -428,7 +426,6 @@ def load_yaml_config(yaml_path):
    include_path.reverse()
    final_yaml_config = {}
    for path in include_path:
        # Assumes that path is a full path.
        # If not found, assume the included yaml
        # is in the same dir as the original yaml
@@ -447,7 +444,7 @@ def load_yaml_config(yaml_path):
    return yaml_config


def regex_replace(string, pattern, repl, count=0):
def regex_replace(string, pattern, repl, count: int = 0):
    """Implements the `re.sub` function as a custom Jinja filter."""
    return re.sub(pattern, repl, string, count=count)
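
# A sketch of how a filter like this is registered with jinja2; this
# Environment setup is illustrative, not the harness's actual wiring.
from jinja2 import Environment

_env = Environment()
_env.filters["regex_replace"] = regex_replace
assert (
    _env.from_string("{{ 'foo123bar' | regex_replace('[0-9]+', '-') }}").render()
    == "foo-bar"
)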
@@ -521,7 +518,7 @@ def pad_and_concat(
    return torch.cat(tensors, dim=0)


def clear_torch_cache():
def clear_torch_cache() -> None:
    gc.collect()
    torch.cuda.empty_cache()
@@ -546,7 +543,7 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria):
        tokenizer: transformers.PreTrainedTokenizer,
        initial_decoder_input_length: int,
        batch_size: int,
    ):
    ) -> None:
        self.initial_decoder_input_length = initial_decoder_input_length
        self.done_tracker = [False] * batch_size
        self.sequence = sequence
......
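
# The diff truncates the class here. For context, a sketch of how such a
# multi-token stop criterion typically decides to halt; `sequence_ids` (the
# encoded stop sequence) is an assumed attribute not shown in the hunk above.
def _stops_now(criteria, input_ids) -> bool:
    # Look only at tokens generated past the prompt, and only as far back
    # as the stop sequence could plausibly span.
    lookback = input_ids[:, criteria.initial_decoder_input_length:]
    lookback = lookback[:, -(len(criteria.sequence_ids) + 2):]
    decoded = criteria.tokenizer.batch_decode(lookback)
    for i, done in enumerate(criteria.done_tracker):
        if not done:
            criteria.done_tracker[i] = criteria.sequence in decoded[i]
    # Halt once every row in the batch has produced the stop sequence.
    return False not in criteria.done_tracker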
@@ -9,23 +9,26 @@ from pathlib import Path
from lm_eval import evaluator, utils
from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger
from lm_eval.logger import eval_logger, SPACING
from lm_eval.tasks import include_task_folder
from lm_eval.benchmarks import include_benchmarks
os.environ["TOKENIZERS_PARALLELISM"] = "false"
def parse_args():
    parser = argparse.ArgumentParser()
def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("--model", required=True, help="Name of model e.g. `hf`")
    parser.add_argument(
        "--tasks",
        default=None,
        help="Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))),
    )
    parser.add_argument(
        "--model_args",
        default="",
        help="String arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
    )
    parser.add_argument(
        "--tasks", default=None  # , choices=utils.MultiChoice(sorted(ALL_TASKS))
    )
    parser.add_argument(
        "--num_fewshot",
        type=int,
@@ -98,7 +101,7 @@ def parse_args():
    return parser.parse_args()


def main():
def main() -> None:
    args = parse_args()

    if args.limit:
@@ -125,10 +128,21 @@ def main():
    else:
        tasks_list = args.tasks.split(",")
        task_names = utils.pattern_match(tasks_list, ALL_TASKS)
        task_missing = []
        for task in [task for task in tasks_list if task not in task_names]:
            if os.path.isfile(task):
                config = utils.load_yaml_config(task)
                task_names.append(config)
            else:
                task_missing.append(task)

        if task_missing:
            missing = ", ".join(task_missing)
            eval_logger.error(
                f"Tasks were not found: {missing}\n"
                f"{SPACING}Try `lm-eval -h` for list of available tasks",
            )
            raise ValueError(f"Tasks {missing} were not found.")

    if args.output_path:
        path = Path(args.output_path)
......
[mypy]
python_version = 3.9
show_traceback = True
check_untyped_defs = True
no_implicit_reexport = True
warn_unreachable = True
warn_unused_configs = True
warn_unused_ignores = True
warn_redundant_casts = True

# We ignore errors everywhere to gradually add type annotations
[mypy-lm_eval.*]
ignore_errors = True

[mypy-lm_eval.api.*]
ignore_errors = True

[mypy-lm_eval.prompts.*]
ignore_errors = True

[mypy-lm_eval.models.*]
ignore_errors = True

[mypy-scripts.*]
ignore_errors = True

[mypy-main]
ignore_errors = True
import pytest
from itertools import islice
import lm_eval.tasks as tasks
from .utilities_testing import load_changed_files, parser
from typing import List
from lm_eval.api.task import ConfigurableTask
import os
# GitHub CI
def new_tasks() -> List[str]:
    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
    if os.path.exists(FILENAME):
        # If the tasks folder has changed, read the list of changed files
        # from FILENAME and parse the YAML files to get the task names.
        return parser(load_changed_files(FILENAME))
    elif os.getenv("API") is not None:
        # Otherwise, if the API has changed, the CI workflow sets the env
        # variable API and we run a fixed set of representative tasks.
        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
    else:
        # If neither condition holds, just run arc_easy.
        return ["arc_easy"]
def get_task_class() -> List[ConfigurableTask]:
    task_name = new_tasks()
    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
    return x
@pytest.fixture()
def limit() -> int:
    return 10
# Tests
@pytest.mark.parametrize("task_class", get_task_class())
class TestNewTasks:
    def test_download(self, task_class: ConfigurableTask):
        task_class().download()
        assert task_class().dataset is not None

    def test_has_training_docs(self, task_class: ConfigurableTask):
        assert task_class().has_training_docs() in [True, False]

    def test_check_training_docs(self, task_class: ConfigurableTask):
        task = task_class()
        if task.has_training_docs():
            assert task._config["training_split"] is not None

    def test_has_validation_docs(self, task_class):
        assert task_class().has_validation_docs() in [True, False]

    def test_check_validation_docs(self, task_class):
        task = task_class()
        if task.has_validation_docs():
            assert task._config["validation_split"] is not None

    def test_has_test_docs(self, task_class):
        assert task_class().has_test_docs() in [True, False]

    def test_check_test_docs(self, task_class):
        task = task_class()
        if task.has_test_docs():
            assert task._config["test_split"] is not None

    def test_should_decontaminate(self, task_class):
        task = task_class()
        assert task.should_decontaminate() in [True, False]
        if task.should_decontaminate():
            assert task._config["doc_to_decontamination_query"] is not None

    def test_doc_to_text(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array = [task.doc_to_text(doc) for doc in arr]
        # Space convention: text must not end in a space; allow empty text for
        # perplexity-like tasks, since the model tacks an <|endoftext|> on.
        assert all(
            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
            for x in _array
        )

    def test_create_choices(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        if "multiple_choice" in task._config.output_type:
            _array = [task.doc_to_choice(doc) for doc in arr]
            # assert all(len(x) == 4 for x in _array)
            assert all(isinstance(x, list) for x in _array)
            assert all(isinstance(x[0], str) for x in _array)

    def test_doc_to_target(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array_target = [task.doc_to_target(doc) for doc in arr]
        if task._config.output_type == "multiple_choice":
            assert all(isinstance(label, int) for label in _array_target)
        # _array_text = [task.doc_to_text(doc) for doc in arr]
        # Not working
        # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))

    def test_build_all_requests(self, task_class, limit):
        task_class().build_all_requests(rank=1, limit=limit, world_size=1)
        assert task_class.instances is not None

    # TODO: Add proper testing
    def test_construct_requests(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
        # assert all(isinstance(doc, list) for doc in requests)
        assert len(requests) == limit if limit else True
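
# For reference, the `islice` pattern used throughout these tests just caps
# how many docs are drawn from a possibly lazy iterator, e.g.:
_docs = ({"id": i} for i in range(10**9))  # lazy generator; nothing materialized
_first_ten = list(islice(_docs, 10))  # consumes only the first ten items
assert len(_first_ten) == 10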
import json
from typing import List
from lm_eval.utils import load_yaml_config
from pathlib import Path
FILE_PATH = ".github/outputs/tasks_all_changed_and_modified_files.txt"
def load_changed_files(file_path: str = FILE_PATH) -> List[str]:
    with open(file_path, "r") as f:
        return [word for line in f for word in line.strip().split(" ")]
def parser(full_path: List[str]) -> List[str]:
    _output = set()
    for x in full_path:
        if x.endswith(".yaml"):
            _output.add(load_yaml_config(x)["task"])
        elif x.endswith(".py"):
            path = [str(y) for y in Path(x).parent.glob("*.yaml")]
            _output |= {load_yaml_config(p)["task"] for p in path}
    return list(_output)
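
# For illustration, with hypothetical paths (assumes each YAML carries a
# top-level `task:` key, as `load_yaml_config` implies):
#
#   parser(["lm_eval/tasks/arc/arc_easy.yaml", "lm_eval/tasks/hellaswag/utils.py"])
#   # -> e.g. ["arc_easy", "hellaswag"], deduplicated via the set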
from __future__ import annotations

import pytest
from pathlib import Path
import numpy as np
from lm_eval.models.huggingface import HFLM
from lm_eval.api.instance import Instance
import lm_eval.tasks as tasks
import sys
import torch


class Test_HFLM:
    torch.use_deterministic_algorithms(True)
    version_minor = sys.version_info.minor
    multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
@@ -90,8 +94,15 @@ class Test_HFLM:
    def test_logliklihood(self) -> None:
        res = self.LM.loglikelihood(self.MULTIPLE_CH)
        _RES, _res = self.MULTIPLE_CH_RES, [r[0] for r in res]
        # change atol in case of consistent failure
        assert np.allclose(_res, _RES, atol=1e-4)
        # log samples to CI
        dir_path = Path("test_logs")
        dir_path.mkdir(parents=True, exist_ok=True)

        file_path = dir_path / f"outputs_log_{self.version_minor}.txt"
        file_path = file_path.resolve()
        with open(file_path, "w") as f:
            f.write("\n".join(str(x) for x in _res))
        assert np.allclose(_res, _RES, atol=1e-2)
        # check indices for Multiple Choice
        argmax_RES, argmax_res = np.argmax(
            np.array(_RES).reshape(-1, 4), axis=1
......
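
# The hunk cuts off mid-call. The idea: arc_easy has four answer choices per
# question, so reshaping the flat loglikelihood list to (-1, 4) and taking a
# per-row argmax recovers each question's predicted choice. With made-up
# numbers:
_scores = [-2.1, -0.3, -4.0, -3.2,  # question 1: choice 1 wins
           -1.0, -5.5, -0.9, -2.2]  # question 2: choice 2 wins
assert np.argmax(np.array(_scores).reshape(-1, 4), axis=1).tolist() == [1, 2]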
from itertools import islice
import pytest
from typing import List
from .utils import new_tasks
import lm_eval.tasks as tasks
from lm_eval.api.task import ConfigurableTask
# Using fixtures to get the task class and limit
@pytest.fixture()
def task_class() -> ConfigurableTask:
    task_name = ["arc_easy"]
    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
    return x[0]
# Default Task
TASKS = ["arc_easy"]


def task_class():
    global TASKS
    # CI: new_tasks checks if any modifications have been made
    task_classes = new_tasks()
    # Fall back to the default TASKS if new_tasks() found nothing
    if task_classes:
        return [tasks.TASK_REGISTRY.get(x)() for x in task_classes]
    else:
        return [tasks.TASK_REGISTRY.get(x)() for x in TASKS]
@pytest.fixture()
@@ -18,109 +26,96 @@ def limit() -> int:
# Tests
def test_download(task_class: ConfigurableTask):
    task_class().download()
    assert task_class().dataset is not None


def test_has_training_docs(task_class: ConfigurableTask):
    assert task_class().has_training_docs() in [True, False]


def test_check_training_docs(task_class: ConfigurableTask):
    task = task_class()
    if task.has_training_docs():
        assert task._config["training_split"] is not None


def test_has_validation_docs(task_class):
    assert task_class().has_validation_docs() in [True, False]


def test_check_validation_docs(task_class):
    task = task_class()
    if task.has_validation_docs():
        assert task._config["validation_split"] is not None


def test_has_test_docs(task_class):
    assert task_class().has_test_docs() in [True, False]


def test_check_test_docs(task_class):
    task = task_class()
    if task.has_test_docs():
        assert task._config["test_split"] is not None


def test_should_decontaminate(task_class):
    task = task_class()
    assert task.should_decontaminate() in [True, False]
    if task.should_decontaminate():
        assert task._config["doc_to_decontamination_query"] is not None


def test_doc_to_text(task_class, limit):
    task = task_class()
    arr = (
        list(islice(task.test_docs(), limit))
        if task.has_test_docs()
        else list(islice(task.validation_docs(), limit))
    )
    _array = [task.doc_to_text(doc) for doc in arr]
    # Space convention: text must not end in a space; allow empty text for
    # perplexity-like tasks, since the model tacks an <|endoftext|> on.
    assert all(
        isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True) for x in _array
    )


def test_create_choices(task_class, limit):
    task = task_class()
    arr = (
        list(islice(task.test_docs(), limit))
        if task.has_test_docs()
        else list(islice(task.validation_docs(), limit))
    )
    if "multiple_choice" in task._config.output_type:
        _array = [task.doc_to_choice(doc) for doc in arr]
        # assert all(len(x) == 4 for x in _array)
        assert all(isinstance(x, list) for x in _array)
        assert all(isinstance(x[0], str) for x in _array)


def test_doc_to_target(task_class, limit):
    task = task_class()
    arr = (
        list(islice(task.test_docs(), limit))
        if task.has_test_docs()
        else list(islice(task.validation_docs(), limit))
    )
    _array_target = [task.doc_to_target(doc) for doc in arr]
    if task._config.output_type == "multiple_choice":
        assert all(isinstance(label, int) for label in _array_target)
    # _array_text = [task.doc_to_text(doc) for doc in arr]
    # Not working
    # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))


def test_build_all_requests(task_class, limit):
    task_class().build_all_requests(rank=1, limit=limit, world_size=1)
    assert task_class.instances is not None


# TODO: Add proper testing
def test_construct_requests(task_class, limit):
    task = task_class()
    arr = (
        list(islice(task.test_docs(), limit))
        if task.has_test_docs()
        else list(islice(task.validation_docs(), limit))
    )
    requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
    # assert all(isinstance(doc, list) for doc in requests)
    assert len(requests) == limit if limit else True
@pytest.mark.parametrize("task_class", task_class())
class TestNewTasks:
    def test_download(self, task_class: ConfigurableTask):
        task_class.download()
        assert task_class.dataset is not None

    def test_has_training_docs(self, task_class: ConfigurableTask):
        assert task_class.has_training_docs() in [True, False]

    def test_check_training_docs(self, task_class: ConfigurableTask):
        if task_class.has_training_docs():
            assert task_class._config["training_split"] is not None

    def test_has_validation_docs(self, task_class):
        assert task_class.has_validation_docs() in [True, False]

    def test_check_validation_docs(self, task_class):
        if task_class.has_validation_docs():
            assert task_class._config["validation_split"] is not None

    def test_has_test_docs(self, task_class):
        assert task_class.has_test_docs() in [True, False]

    def test_check_test_docs(self, task_class):
        task = task_class
        if task.has_test_docs():
            assert task._config["test_split"] is not None

    def test_should_decontaminate(self, task_class):
        task = task_class
        assert task.should_decontaminate() in [True, False]
        if task.should_decontaminate():
            assert task._config["doc_to_decontamination_query"] is not None

    def test_doc_to_text(self, task_class, limit):
        task = task_class
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array = [task.doc_to_text(doc) for doc in arr]
        # Space convention: text must not end in a space; allow empty text for
        # perplexity-like tasks, since the model tacks an <|endoftext|> on.
        assert all(
            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
            for x in _array
        )

    def test_create_choices(self, task_class, limit):
        task = task_class
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        if "multiple_choice" in task._config.output_type:
            _array = [task.doc_to_choice(doc) for doc in arr]
            # assert all(len(x) == 4 for x in _array)
            assert all(isinstance(x, list) for x in _array)
            assert all(isinstance(x[0], str) for x in _array)

    def test_doc_to_target(self, task_class, limit):
        task = task_class
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array_target = [task.doc_to_target(doc) for doc in arr]
        if task._config.output_type == "multiple_choice":
            assert all(isinstance(label, int) for label in _array_target)
        # _array_text = [task.doc_to_text(doc) for doc in arr]
        # Not working
        # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))

    def test_build_all_requests(self, task_class, limit):
        task_class.build_all_requests(rank=1, limit=limit, world_size=1)
        assert task_class.instances is not None

    # TODO: Add proper testing
    def test_construct_requests(self, task_class, limit):
        task = task_class
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
        # assert all(isinstance(doc, list) for doc in requests)
        assert len(requests) == limit if limit else True
# def test_create_choices(task_class):
......
import json
from typing import List
from lm_eval.utils import load_yaml_config
from pathlib import Path
import sys
from typing import Union
import os
# {{{CI}}}
# This is the path where the changed-files output for the tasks folder is stored
# FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"
# Reads a text file and returns a list of words.
# Used to read the changed-files output from tj-actions/changed-files.
def load_changed_files(file_path: str) -> List[str]:
    with open(file_path, "r") as f:
        content = f.read()
    words_list = content.split()
    sys.stdout.write(f"list of files: {words_list}")
    return words_list
@@ -30,3 +30,18 @@ def parser(full_path: List[str]) -> List[str]:
            path = [str(y) for y in Path(x).parent.glob("*.yaml")]
            _output |= {load_yaml_config(p)["task"] for p in path}
    return list(_output)
def new_tasks() -> Union[List[str], None]:
    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
    if os.path.exists(FILENAME):
        # If the tasks folder has changed, read the list of changed files
        # from FILENAME and parse the YAML files to get the task names.
        return parser(load_changed_files(FILENAME))
    elif os.getenv("API") is not None:
        # Otherwise, if the API has changed, the CI workflow sets the env
        # variable API and we run a fixed set of representative tasks.
        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
    else:
        # Neither condition holds: return None and let the caller fall back
        # to its default (arc_easy).
        return