Commit 90ad5db7 authored by lintangsutawika

merged main

parents f692caa9 b177c82c
"""
Usage:
python requests_caching.py --tasks=comma,separated,list,of,tasks --cache_requests=<true|refresh|delete>
"""
import argparse
import os
from typing import List
import torch
from transformers import (
    pipeline as trans_pipeline,
)
from lm_eval import simple_evaluate
from lm_eval.evaluator import request_caching_arg_to_dict
from lm_eval.utils import eval_logger
MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
# Used to specify alternate cache path, useful if run in a docker container
# NOTE raw datasets will break if you try to transfer the cache from your host to a docker image
LM_HARNESS_CACHE_PATH = os.getenv("LM_HARNESS_CACHE_PATH")
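# A hedged example (the path below is illustrative, not from this commit): when running in a
# container you might point the cache at a mounted volume before invoking the script, e.g.
#   LM_HARNESS_CACHE_PATH=/mnt/lm_cache python scripts/requests_caching.py ...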
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL = "EleutherAI/pythia-70m"
TASK = "text-generation"
def run_model_for_task_caching(tasks: List[str], cache_requests: str):
    eval_logger.info(f"Loading HF model: {MODEL}")

    trans_pipe = trans_pipeline(
        task=TASK, model=MODEL, device=DEVICE, trust_remote_code=True
    )

    model = trans_pipe.model
    tokenizer = trans_pipe.tokenizer

    eval_logger.info(
        f"Running simple_evaluate to cache request objects for tasks: {tasks}"
    )

    cache_args = request_caching_arg_to_dict(cache_requests=cache_requests)
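    # Assumed behavior (hedged; not verified against the lm_eval source in this commit):
    # request_caching_arg_to_dict is expected to expand the single CLI string into the boolean
    # flags that simple_evaluate understands, along the lines of
    # {"cache_requests": ..., "rewrite_requests_cache": ..., "delete_requests_cache": ...},
    # which is why it is splatted into the call below.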
    eval_logger.info(
        f"The following operations will be performed on the cache: {cache_requests}"
    )

    eval_data = simple_evaluate(
        model="hf-auto",
        model_args={
            "pretrained": model,
            "tokenizer": tokenizer,
        },
        limit=1,
        device=DEVICE,
        tasks=tasks,
        write_out=True,
        **cache_args,
    )

    return eval_data
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--tasks",
        "-t",
        default=None,
        metavar="task1,task2",
    )
    parser.add_argument(
        "--cache_requests",
        type=str,
        default=None,
        choices=["true", "refresh", "delete"],
        help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
    )
    args = parser.parse_args()
    tasks = args.tasks.split(",")

    eval_data = run_model_for_task_caching(
        tasks=tasks, cache_requests=args.cache_requests
    )
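For reference, a typical invocation (a sketch: the task names are the ones used by DEFAULT_TASKS in the test file below, and the scripts/ location is inferred from that file's sys.path setup):

    python scripts/requests_caching.py --tasks=lambada_openai,hellaswag --cache_requests=true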
@@ -22,8 +22,8 @@ class Test_HFLM:
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
    generate_until_task = task_list["gsm8k"]  # type: ignore
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until: list[Instance] = generate_until_task.instances
    rolling_task = task_list["wikitext"]  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
@@ -74,7 +74,7 @@ class Test_HFLM:
    generate_until_RES = [
        " The average of $2.50 each is $",
        " A robe takes 2 bolts of blue fiber and half",
        " $50,000 in repairs.",
        " $50,000 in repairs.\n\nQuestion",
        " He runs 1 sprint 3 times a week.",
        " They feed each of her chickens three cups of mixed",
        " The price of the glasses is $5, but",
......
# import lm_eval.base as base
import importlib
import os
import sys
from datetime import datetime
from typing import List, Tuple

import pytest
import torch

# import lm_eval.models as models
from lm_eval.caching.cache import PATH


MODULE_DIR = os.path.dirname(os.path.realpath(__file__))

# NOTE the script this loads uses simple_evaluate
# TODO potentially test both the helper script and the normal script
sys.path.append(f"{MODULE_DIR}/../scripts")
model_loader = importlib.import_module("requests_caching")
run_model_for_task_caching = model_loader.run_model_for_task_caching

DEFAULT_TASKS = ["lambada_openai", "hellaswag"]


@pytest.fixture(autouse=True)
def setup_and_teardown():
    # Setup
    torch.use_deterministic_algorithms(False)
    clear_cache()

    # Yields control back to the test function
    yield

    # Cleanup here


def clear_cache():
    if os.path.exists(PATH):
        cache_files = os.listdir(PATH)
        for file in cache_files:
            file_path = f"{PATH}/{file}"
            os.unlink(file_path)


# leaving tasks here to allow for the option to select specific task files
def get_cache_files(tasks: List[str] = None) -> Tuple[List[str], List[str]]:
    cache_files = os.listdir(PATH)
    file_task_names = []
    for file in cache_files:
        file_without_prefix = file.split("-")[1]
        file_without_prefix_and_suffix = file_without_prefix.split(".")[0]
        file_task_names.append(file_without_prefix_and_suffix)
    return cache_files, file_task_names
def assert_created(tasks: List[str], file_task_names: List[str]):
    tasks.sort()
    file_task_names.sort()
    assert tasks == file_task_names


@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_true(tasks: List[str]):
    run_model_for_task_caching(tasks=tasks, cache_requests="true")
    cache_files, file_task_names = get_cache_files()
    assert_created(tasks=tasks, file_task_names=file_task_names)


@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_refresh(tasks: List[str]):
    run_model_for_task_caching(tasks=tasks, cache_requests="true")
    timestamp_before_test = datetime.now().timestamp()
    run_model_for_task_caching(tasks=tasks, cache_requests="refresh")
    cache_files, file_task_names = get_cache_files()
    for file in cache_files:
        modification_time = os.path.getmtime(f"{PATH}/{file}")
        assert modification_time > timestamp_before_test
    tasks.sort()
    file_task_names.sort()
    assert tasks == file_task_names
@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_delete(tasks: List[str]):
    # populate the cache first by reusing test_requests_caching_true within this test for additional confidence
    test_requests_caching_true(tasks=tasks)
    run_model_for_task_caching(tasks=tasks, cache_requests="delete")
    cache_files, file_task_names = get_cache_files()
    assert len(cache_files) == 0
# useful for locally running tests through the debugger
if __name__ == "__main__":

    def run_tests():
        tests = [
            test_requests_caching_true,
            test_requests_caching_refresh,
            test_requests_caching_delete,
        ]
        for test_func in tests:
            clear_cache()
            test_func(tasks=DEFAULT_TASKS)
        print("Tests pass")

    run_tests()
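The tests above can also be run through pytest directly; a sketch, assuming the file lives under tests/ as implied by the MODULE_DIR/../scripts path above:

    python -m pytest tests/test_requests_caching.py -v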
@@ -2,6 +2,7 @@ import itertools
import numpy as np
import pytest
import torch
from lm_eval.api.metrics import (
    aggregate_subtask_metrics,
@@ -258,12 +259,20 @@ class TestCollator:
        ]
        return samples

    def make_loglikelihood_sample_group(self, end=11):
        a = [(("x", "x"), [1, 2, 3, 4, 5, 6, 7, 8], [x]) for x in range(9)]
        b = [
            (("x", "x"), [1, 2, 3, 4, 5, 6, 7, 8], [x, y, z])
            for x, y, z in zip(range(9), range(9, 18), range(18, 27))
        ]
        return a + b

    @pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 9)])
    def test_generations(self, batch_size, end):
        _collate_gen = lambda x: (-len(x[0]), x[0])  # noqa: E731
        generation_samples = self.make_generate_sample(int(end))
        gens = Collator(generation_samples, _collate_gen, grouping=True)
        gens = Collator(generation_samples, _collate_gen, group_by="gen_kwargs")
        chunks = gens.get_batched(n=int(batch_size), batch_fn=None)
        output = []
        for chunks in chunks:
@@ -292,7 +301,10 @@ class TestCollator:
    def test_loglikelihood(self, batch_size, end):
        _collate_log = lambda x: (-len(x[1]), tuple(x[1]))  # noqa: E731
        loglikelihood_samples = self.make_loglikelihood_sample(int(end))
        loglikelihoods = Collator(loglikelihood_samples, _collate_log, grouping=False)
        loglikelihoods = Collator(
            loglikelihood_samples,
            _collate_log,
        )
        chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
        output = []
        for chunks in chunks:
@@ -309,6 +321,48 @@ class TestCollator:
        reordered_output = loglikelihoods.get_original(output)
        assert reordered_output == [x[1] for x in loglikelihood_samples]

    @pytest.mark.parametrize("batch_size", [17, 8, 12, 0])
    def test_context_grouping(self, batch_size):
        def _collate(x):
            toks = x[1] + x[2]
            return -len(toks), tuple(toks)

        _collate_log = _collate  # noqa: E731
        loglikelihood_samples = self.make_loglikelihood_sample_group()
        loglikelihoods = Collator(
            loglikelihood_samples,
            _collate_log,
            group_fn=lambda a: a[-2] + a[-1][:-1],
            group_by="contexts",
        )
        chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
        output = []
        outputs_ = []
        for chunks in chunks:
            # check batching
            if batch_size != 0:
                assert len(chunks) <= batch_size
            # check reorder
            assert all(
                len(chunks[i][1]) <= len(chunks[i - 1][1])
                for i in range(1, len(chunks))
            )
            for x in chunks:
                for request_str, cont_toks, logits in loglikelihoods.get_cache(
                    req_str="".join(x[0]),
                    cxt_toks=x[1],
                    cont_toks=x[2],
                    logits=torch.tensor([1, 2, 3, 4, 5, 6, 7, 8])
                    .unsqueeze(0)
                    .unsqueeze(0),
                ):
                    output.append(x[1])
                    outputs_.append(cont_toks)

        assert len(output) == len(outputs_)
        # check indices
        reordered_output = loglikelihoods.get_original(output)
        assert reordered_output == [x[1] for x in loglikelihood_samples]
def test_aggregate_mean():
    # test weight_by_size is respected
......
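For orientation, a small self-contained sketch of the grouping key exercised by test_context_grouping above (an interpretation drawn from this test alone, not from the Collator implementation; the variable names are illustrative): samples that share their context tokens and all but the last continuation token produce the same key, which is what lets the collator serve them together.

# Hypothetical illustration of the group_fn passed to Collator in test_context_grouping.
samples = [
    (("x", "x"), [1, 2, 3, 4, 5, 6, 7, 8], [0]),         # context tokens, one continuation token
    (("x", "x"), [1, 2, 3, 4, 5, 6, 7, 8], [0, 9, 18]),  # same context, longer continuation
]
group_fn = lambda a: a[-2] + a[-1][:-1]  # context tokens + all-but-last continuation tokens
keys = [tuple(group_fn(s)) for s in samples]
print(keys[0])  # (1, 2, 3, 4, 5, 6, 7, 8)        -> context only
print(keys[1])  # (1, 2, 3, 4, 5, 6, 7, 8, 0, 9)  -> context plus the first two continuation tokens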