Commit cb8889cc authored by lintangsutawika

merged with latest update from main

parents ec05e561 74119471
@@ -94,7 +94,11 @@ def eval_models(args, branch=None):
         ret = os.system(command)
-        results[model] = json.load(open(output_path)) if ret == 0 else {"results": {}}
+        results[model] = (
+            json.load(open(output_path, encoding="utf-8"))
+            if ret == 0
+            else {"results": {}}
+        )
     end_time = time.time()
...
@@ -5,7 +5,7 @@ import random
 import numpy as np
 from lm_eval import tasks
-from lm_eval.tasks import include_path, initialize_tasks
+from lm_eval.tasks import TaskManager
 from lm_eval.utils import eval_logger, join_iters
@@ -39,22 +39,21 @@ def main():
     args = parse_args()
     np.random.seed(args.seed)
-    initialize_tasks(args.verbosity)
     if args.include_path is not None:
         eval_logger.info(f"Including path: {args.include_path}")
-        include_path(args.include_path)
+    task_manager = TaskManager(args.verbosity, include_path=args.include_path)
     if args.tasks == "all_tasks":
-        task_names = tasks.ALL_TASKS
+        task_names = task_manager.all_tasks
     else:
         task_names = args.tasks.split(",")
-    task_dict = tasks.get_task_dict(task_names)
+    task_dict = tasks.get_task_dict(task_names, task_manager)
     os.makedirs(args.output_base_path, exist_ok=True)
     for task_name, task in task_dict.items():
-        if type(task) == tuple:
-            group_name, task = task
+        if isinstance(task, tuple):
+            _, task = task
         rnd = random.Random()
         rnd.seed(args.seed)
...
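For readers tracking the API migration in this commit, the task-loading pattern the scripts move to looks roughly like the following. This is a minimal sketch using the `TaskManager`, `all_tasks`, `load_task_or_group`, and `get_task_dict` names that appear in the hunks above; the verbosity string and task name are placeholder values, not taken from the commit:

    from lm_eval import tasks
    from lm_eval.tasks import TaskManager

    # Index available tasks once; include_path may point at extra task YAML dirs.
    task_manager = TaskManager("INFO", include_path=None)

    # Enumerate every task the manager discovered ...
    all_names = task_manager.all_tasks

    # ... materialize specific task objects keyed by name ...
    task_objects = task_manager.load_task_or_group(["arc_easy"])

    # ... or build the task dict that the evaluator consumes.
    task_dict = tasks.get_task_dict(["arc_easy"], task_manager)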
@@ -69,18 +69,20 @@ def main():
             model_args = re.sub(
                 "/|=",
                 "__",
-                json.load(open(Path(args.data_path, model, "results.json")))["config"][
-                    "model_args"
-                ],
+                json.load(
+                    open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+                )["config"]["model_args"],
             )
             with open(
-                Path(args.data_path, model, f"{model_args}_{task}.jsonl"), "r"
+                Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
+                "r",
+                encoding="utf-8",
             ) as file:
                 data = json.loads(file.read())
-            configs = json.load(open(Path(args.data_path, model, "results.json")))[
-                "configs"
-            ]
+            configs = json.load(
+                open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+            )["configs"]
             config = configs[task]
             if model_index == 0:  # Only need to assemble data for the first model
@@ -124,7 +126,9 @@ def tasks_for_model(model: str, data_path: str):
         list: A list of tasks for the model.
     """
     dir_path = Path(data_path, model)
-    config = (json.load(open(Path(dir_path, "results.json")))["configs"],)
+    config = (
+        json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
+    )
     return list(config[0].keys())
...
@@ -11,20 +11,21 @@ from lm_eval.api.instance import Instance
 from lm_eval.models.huggingface import HFLM
-tasks.initialize_tasks()
+task_manager = tasks.TaskManager()
 class Test_HFLM:
     torch.use_deterministic_algorithms(True)
+    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
     version_minor = sys.version_info.minor
-    multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
+    multiple_choice_task = task_list["arc_easy"]  # type: ignore
     multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
     MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
-    generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")()  # type: ignore
+    generate_until_task = task_list["gsm8k"]  # type: ignore
     generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
     generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
     generate_until: list[Instance] = generate_until_task.instances
-    rolling_task = tasks.TASK_REGISTRY.get("wikitext")()  # type: ignore
+    rolling_task = task_list["wikitext"]  # type: ignore
     rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
     ROLLING: list[Instance] = rolling_task.instances
...
import random
import tempfile

import pytest
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

import lm_eval.evaluator as evaluator
from lm_eval.api.registry import get_model

SUPPORTED_ARCHITECTURES_TASKS = {
    "facebook/opt-125m": "lambada_openai",
    "hf-internal-testing/tiny-random-gpt2": "wikitext",
}


@pytest.mark.parametrize("model_id,task", SUPPORTED_ARCHITECTURES_TASKS.items())
def test_evaluator(model_id, task):
    with tempfile.TemporaryDirectory() as tmpdirname:
        model = OVModelForCausalLM.from_pretrained(
            model_id, export=True, use_cache=True
        )
        model.save_pretrained(tmpdirname)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        tokenizer.save_pretrained(tmpdirname)

        lm = get_model("openvino").create_from_arg_string(
            f"pretrained={tmpdirname}",
            {
                "batch_size": 1,
                "device": "cpu",
            },
        )

        def ll_fn(reqs):
            for ctx, cont in [req.args for req in reqs]:
                if len(ctx) == 0:
                    continue
                # space convention
                assert ctx[-1] != " "
                assert cont[0] == " " or ctx[-1] == "\n"

            res = []
            random.seed(42)
            for _ in reqs:
                res.append((-random.random(), False))

            return res

        def ll_perp_fn(reqs):
            for (string,) in [req.args for req in reqs]:
                assert isinstance(string, str)

            res = []
            random.seed(42)
            for _ in reqs:
                res.append(-random.random())

            return res

        lm.loglikelihood = ll_fn
        lm.loglikelihood_rolling = ll_perp_fn

        limit = 10
        evaluator.simple_evaluate(
            model=lm,
            tasks=[task],
            num_fewshot=0,
            limit=limit,
            bootstrap_iters=10,
        )
...
@@ -7,6 +7,9 @@ import lm_eval.tasks as tasks
 from lm_eval.api.instance import Instance
+task_manager = tasks.TaskManager()
 @pytest.mark.skip(reason="requires CUDA")
 class TEST_VLLM:
     vllm = pytest.importorskip("vllm")
@@ -17,15 +20,15 @@ class TEST_VLLM:
     except ModuleNotFoundError:
         pass
     torch.use_deterministic_algorithms(True)
-    tasks.initialize_tasks()
-    multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
+    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
+    multiple_choice_task = task_list["arc_easy"]  # type: ignore
     multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
     MULTIPLE_CH: List[Instance] = multiple_choice_task.instances
-    generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")()  # type: ignore
+    generate_until_task = task_list["gsm8k"]  # type: ignore
     generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
     generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
     generate_until: List[Instance] = generate_until_task.instances
-    rolling_task = tasks.TASK_REGISTRY.get("wikitext")()  # type: ignore
+    rolling_task = task_list["wikitext"]  # type: ignore
     rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
     ROLLING: List[Instance] = rolling_task.instances
...
@@ -6,11 +6,9 @@ import pytest
 # import lm_eval.models as models
 import lm_eval.api as api
 import lm_eval.evaluator as evaluator
-import lm_eval.tasks as tasks
+from lm_eval import tasks
-tasks.initialize_tasks()
 # TODO: more fine grained unit tests rather than this big honking integration
 # test once we break evaluator into smaller, more manageable pieces
@@ -46,7 +44,8 @@ def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str
             "device": None,
         },
     )
-    task_dict = tasks.get_task_dict(task_name, num_fewshot=0)
+    task_manager = tasks.TaskManager()
+    task_dict = tasks.get_task_dict(task_name, task_manager)
     e2 = evaluator.evaluate(
         lm=lm,
...
@@ -8,7 +8,7 @@ from lm_eval.api.task import ConfigurableTask
 from .utils import new_tasks
-tasks.initialize_tasks()
+task_manager = tasks.TaskManager()
 # Default Task
 TASKS = ["arc_easy"]
@@ -19,9 +19,9 @@ def task_class():
     task_classes = new_tasks()
     # Check if task_classes is empty
     if task_classes:
-        return [tasks.TASK_REGISTRY.get(x)() for x in task_classes]
+        return list(task_manager.load_task_or_group(task_classes).values())
     else:
-        return [tasks.TASK_REGISTRY.get(x)() for x in TASKS]
+        return list(task_manager.load_task_or_group(TASKS).values())
 @pytest.fixture()
...
 import os
-from pathlib import Path
 from typing import List, Union
 from lm_eval.utils import load_yaml_config
@@ -20,17 +19,18 @@ def load_changed_files(file_path: str) -> List[str]:
 # checks the txt file for list of changed files.
-# if file ends with .yaml then check yaml for task name
-# if file ends with .py then parse the folder for all yaml files
-# skips benchmarks folder
+# if file ends with .yaml then check yaml and load the config.
+# if the config task is a string, it's a task config.
+# if the config task is a list, it's a group config.
 def parser(full_path: List[str]) -> List[str]:
     _output = set()
     for x in full_path:
-        if x.endswith(".yaml") and "benchmarks" not in x:
-            _output.add(load_yaml_config(x)["task"])
-        elif x.endswith(".py") and "benchmarks" not in x:
-            path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
-            _output |= {load_yaml_config(x)["task"] for x in path}
+        if os.path.exists(x) and x.endswith(".yaml"):
+            config = load_yaml_config(x, mode="simple")
+            if isinstance(config["task"], str):
+                _output.add(config["task"])
+            elif isinstance(config["task"], list):
+                _output.add(config["group"])
     return list(_output)
...
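For context on the branch added above: after this change, `parser` only inspects YAML configs. A config whose `task` field is a string contributes that task name, while a config whose `task` field is a list is treated as a group config and contributes its `group` name. A rough illustration of the two shapes the function distinguishes; the field values below are invented for the example, not taken from the repository:

    # single-task config, as loaded by load_yaml_config(path, mode="simple")
    task_config = {"task": "arc_easy", "output_type": "multiple_choice"}

    # group config: "task" lists member tasks, "group" names the group
    group_config = {"group": "my_suite", "task": ["arc_easy", "hellaswag"]}

    def name_for(config):
        # mirrors the branch in parser(): string task -> task name, list task -> group name
        return config["task"] if isinstance(config["task"], str) else config["group"]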