Commit 60c9c170 authored by haileyschoelkopf

Merge branch 'main' into inverse-scaling-tasks

parents 4b2d565b b4cd85d4
import random
import tempfile
from pathlib import Path
import pytest
from optimum.intel import OVModelForCausalLM
@@ -71,3 +72,21 @@ def test_evaluator(model_id, task):
limit=limit,
bootstrap_iters=10,
)
def test_ov_config():
"""Test that if specified, a custom OpenVINO config is loaded correctly"""
model_id = "hf-internal-testing/tiny-random-gpt2"
with tempfile.TemporaryDirectory() as tmpdirname:
config_file = str(Path(tmpdirname) / "ov_config.json")
with open(Path(config_file), "w") as f:
f.write('{"DYNAMIC_QUANTIZATION_GROUP_SIZE" : "32"}')
lm = get_model("openvino").create_from_arg_string(
f"pretrained={model_id},ov_config={config_file}"
)
assert (
lm.model.request.get_compiled_model().get_property(
"DYNAMIC_QUANTIZATION_GROUP_SIZE"
)
== 32
)
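For illustration only, a minimal sketch of how the ov_config path exercised by test_ov_config above might be driven end to end through simple_evaluate (assuming the optional OpenVINO dependencies are installed); the task name and the group-size value are assumptions made for the example, not part of this commit:

import json
import tempfile
from pathlib import Path

import lm_eval.evaluator as evaluator

# Sketch only: write a small OpenVINO config file and pass its path through the
# same comma-separated arg-string format used in test_ov_config above.
with tempfile.TemporaryDirectory() as tmpdirname:
    config_file = Path(tmpdirname) / "ov_config.json"
    # property name mirrors the test above; the value "32" is illustrative
    config_file.write_text(json.dumps({"DYNAMIC_QUANTIZATION_GROUP_SIZE": "32"}))
    results = evaluator.simple_evaluate(
        model="openvino",  # same registered model name as get_model("openvino")
        model_args=f"pretrained=hf-internal-testing/tiny-random-gpt2,ov_config={config_file}",
        tasks=["arc_easy"],  # illustrative task choice
        limit=10,
    )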
@@ -25,8 +25,8 @@ class TEST_VLLM:
multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
MULTIPLE_CH: List[Instance] = multiple_choice_task.instances
generate_until_task = task_list["gsm8k"] # type: ignore
generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
generate_until: List[Instance] = generate_until_task.instances
rolling_task = task_list["wikitext"] # type: ignore
rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
......
import argparse
import pytest
import lm_eval.__main__
def test_cli_parse_error():
"""
Assert that an error is raised if a CLI argument is defined without a type
"""
with pytest.raises(ValueError):
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument(
"--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`"
)
parser.add_argument(
"--tasks",
"-t",
default=None,
metavar="task1,task2",
help="To get full list of tasks, use the command lm-eval --tasks list",
)
lm_eval.__main__.check_argument_types(parser)
def test_cli_parse_no_error():
"""
Assert that no error is raised when all CLI arguments specify a type
"""
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument(
"--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`"
)
parser.add_argument(
"--tasks",
"-t",
type=str,
default=None,
metavar="task1,task2",
help="To get full list of tasks, use the command lm-eval --tasks list",
)
lm_eval.__main__.check_argument_types(parser)
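For context on what these two tests rely on, a rough sketch of how an argument-type check like check_argument_types could be written; this is an illustration under the assumption that the check walks the parser's registered actions, not the actual implementation in lm_eval.__main__:

import argparse

def check_argument_types_sketch(parser: argparse.ArgumentParser) -> None:
    # Raise if any argument that consumes a value was registered without a type.
    for action in parser._actions:
        if action.dest == "help":
            continue
        if action.nargs == 0:
            # store_true / store_false style flags take no value, so no type is needed
            continue
        if action.type is None:
            raise ValueError(f"Argument '{action.dest}' doesn't have a type specified.")

Applied to the parsers built above, this sketch raises for the untyped --tasks argument in test_cli_parse_error and passes once type=str is added in test_cli_parse_no_error.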
@@ -14,25 +14,33 @@ from lm_eval import tasks
@pytest.mark.parametrize(
"task_name,limit,model,model_args",
"task_name,limit,model,model_args,bootstrap_iters",
[
(
["arc_easy"],
10,
"hf",
"pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
)
0,
),
(
["mmlu_abstract_algebra"],
None,
"hf",
"pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
10000,
),
],
)
def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str):
task_name = task_name
limit = 10
def test_evaluator(
task_name: List[str], limit: int, model: str, model_args: str, bootstrap_iters: int
):
e1 = evaluator.simple_evaluate(
model=model,
tasks=task_name,
limit=limit,
model_args=model_args,
bootstrap_iters=bootstrap_iters,
)
assert e1 is not None
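The bootstrap_iters values parametrized above (0 and 10000) control how many bootstrap resampling iterations are used when estimating standard errors for the reported metrics. A rough, self-contained illustration of the idea, not lm-eval's implementation:

import random
import statistics

def bootstrap_stderr_sketch(per_example_scores, iters=1000, seed=1234):
    # Resample the per-example scores with replacement, recompute the mean each
    # time, and report the spread of those means; iters=0 yields no estimate.
    rng = random.Random(seed)
    means = [
        statistics.mean(rng.choices(per_example_scores, k=len(per_example_scores)))
        for _ in range(iters)
    ]
    return statistics.pstdev(means) if means else None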
@@ -51,13 +59,17 @@ def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str
lm=lm,
task_dict=task_dict,
limit=limit,
bootstrap_iters=bootstrap_iters,
)
assert e2 is not None
# check that caching is working
def r(x):
return x["results"]["arc_easy"]
if "arc_easy" in x["results"]:
return x["results"]["arc_easy"]
else:
return x["results"]["mmlu_abstract_algebra"]
assert all(
x == y
......
@@ -20,8 +20,8 @@ sys.path.append(f"{MODULE_DIR}/../scripts")
model_loader = importlib.import_module("requests_caching")
run_model_for_task_caching = model_loader.run_model_for_task_caching
DEFAULT_TASKS = ["lambada_openai", "hellaswag"]
os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "1"
DEFAULT_TASKS = ["lambada_openai", "sciq"]
@pytest.fixture(autouse=True)
@@ -64,16 +64,16 @@ def assert_created(tasks: List[str], file_task_names: List[str]):
@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_true(tasks: List[str]):
def requests_caching_true(tasks: List[str]):
run_model_for_task_caching(tasks=tasks, cache_requests="true")
cache_files, file_task_names = get_cache_files()
print(file_task_names)
assert_created(tasks=tasks, file_task_names=file_task_names)
@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_refresh(tasks: List[str]):
def requests_caching_refresh(tasks: List[str]):
run_model_for_task_caching(tasks=tasks, cache_requests="true")
timestamp_before_test = datetime.now().timestamp()
@@ -93,9 +93,9 @@ def test_requests_caching_refresh(tasks: List[str]):
@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_delete(tasks: List[str]):
def requests_caching_delete(tasks: List[str]):
# populate the data first, rerun this test within this test for additional confidence
test_requests_caching_true(tasks=tasks)
# test_requests_caching_true(tasks=tasks)
run_model_for_task_caching(tasks=tasks, cache_requests="delete")
@@ -109,9 +109,9 @@ if __name__ == "__main__":
def run_tests():
tests = [
test_requests_caching_true,
test_requests_caching_refresh,
test_requests_caching_delete,
# test_requests_caching_true,
# test_requests_caching_refresh,
# test_requests_caching_delete,
]
for test_func in tests:
......