Commit b5efc813 authored by gk

Merge branch 'master' into big-refactor-merge

parents 7dec84a0 b018a7d5
......@@ -167,13 +167,22 @@ def evaluate(
    # get lists of each type of request
    for task_name, task in task_dict.items():
        versions[task_name] = task.VERSION
        configs[task_name] = dict(task.dump_config()) # TODO: don't access a private attribute here; handle this case for non-YAML tasks
        # TODO: don't access a private attribute here; handle this case for non-YAML tasks
        configs[task_name] = dict(task.dump_config())
        # deterministically shuffle docs and keep only the first `limit`, since docs sometimes come in a meaningful order
        # task_docs = list(task_doc_func())
        # rnd = random.Random()
        # rnd.seed(42)
        # rnd.shuffle(task_docs)
        if limit is not None:
            if task.has_test_docs():
                task_docs = task.test_docs()
            elif task.has_validation_docs():
                task_docs = task.validation_docs()
            else:
                raise RuntimeError("Task has neither test_docs nor validation_docs")
            limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit)
        task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)
......
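For reference, the new `limit` handling treats a value below 1.0 as a fraction of the selected doc set and anything else as an absolute count. A minimal standalone sketch of that rule (the function name and numbers are illustrative, not from the harness):

```python
# Minimal sketch of the fractional-vs-absolute limit rule above.
def resolve_limit(num_docs: int, limit: float) -> int:
    # Values below 1.0 are read as a fraction of the docs; otherwise as a count.
    return int(num_docs * limit) if limit < 1.0 else int(limit)

assert resolve_limit(1000, 0.1) == 100  # 10% of 1000 docs
assert resolve_limit(1000, 50) == 50    # first 50 docs
```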
......@@ -16,9 +16,6 @@ from lm_eval.api.registry import (
)
ALL_TASKS = sorted(list(TASK_REGISTRY.keys()) + list(GROUP_REGISTRY.keys()))
def get_task_name_from_config(task_config):
    return "{dataset_path}_{dataset_name}".format(**task_config)
......
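As a usage sketch, `get_task_name_from_config` simply interpolates two keys from the config dict (the values below are illustrative):

```python
# Illustrative only: any mapping carrying these two keys works.
task_config = {"dataset_path": "super_glue", "dataset_name": "boolq"}
name = "{dataset_path}_{dataset_name}".format(**task_config)
assert name == "super_glue_boolq"
```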
......@@ -19,7 +19,6 @@ from omegaconf import OmegaConf
from jinja2 import BaseLoader, Environment, StrictUndefined
from itertools import islice
from lm_eval import tasks
from lm_eval.logger import eval_logger
......
......@@ -16,7 +16,7 @@ def parse_args():
parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(sorted(ALL_TASKS)))
parser.add_argument("--config", default=None)
parser.add_argument("--num_fewshot", type=int, default=0)
parser.add_argument("--batch_size", type=str, default=None)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--max_batch_size", type=int, default=None,
help="Maximal batch size to try with --batch_size auto")
parser.add_argument("--device", type=str, default=None)
......
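The `--max_batch_size` help text implies an automatic search when `--batch_size auto` is used. A hedged sketch of what such a search could look like; this is an assumption about the flag's intent, not the harness's implementation (`run_batch` is a hypothetical workload callback):

```python
# Assumed behavior only: start at max_batch_size and halve on OOM until a
# batch fits. Not the harness's actual auto-batch implementation.
def find_batch_size(run_batch, max_batch_size: int) -> int:
    bs = max_batch_size
    while bs > 1:
        try:
            run_batch(bs)
            return bs
        except MemoryError:
            bs //= 2
    return 1

def run_batch(bs):  # stand-in workload: pretend anything above 16 OOMs
    if bs > 16:
        raise MemoryError

assert find_batch_size(run_batch, 64) == 16
```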
......@@ -3,7 +3,6 @@ Usage:
python make_table_tasks.py --output <markdown_filename>
"""
import logging
from lm_eval import tasks
from pytablewriter import MarkdownTableWriter, LatexTableWriter
import os
import json
......@@ -54,8 +53,6 @@ def make_table(result_dict):
if __name__ == "__main__":
    task_names = tasks.ALL_TASKS
    # loop dirs and subdirs in results dir
    # for each dir, load json files
    for dirpath, dirnames, filenames in os.walk("../results"):
......
......@@ -5,7 +5,8 @@ import subprocess
import time
from pathlib import Path
from lm_eval import tasks, utils
from lm_eval import evaluator, utils
from lm_eval.api.registry import ALL_TASKS
seq2seq_models = ["google/flan-t5-small"]
......@@ -31,7 +32,7 @@ def parse_args():
parser.add_argument("--num_fewshot", type=int, default=0)
parser.add_argument("--limit", type=float, default=None)
# TODO: implement hf-auto to pick between causal and seq2seq models so we don't need this
parser.add_argument("--model", default="hf-causal-experimental")
parser.add_argument("--model", default="hf-causal")
# Use whatever is faster here
parser.add_argument("--model_args", default="use_accelerate=True,load_in_8bit=True")
parser.add_argument("--batch_size", default="auto")
......@@ -50,14 +51,14 @@ def eval_models(args, branch=None):
    results = {}
    for model in args.models:
        model_type = "hf-causal-experimental" if model in causal_models \
        model_type = "hf-causal" if model in causal_models \
            else "hf-seq2seq" if model in seq2seq_models else args.model
        model_args = f"pretrained={model},{args.model_args}"
        # TODO: split_and_pad_windows in AutoSeq2SeqLM doesn't exist, #527
        tasks = args.tasks if model in causal_models or model_type == "hf-causal-experimental" \
        tasks = args.tasks if model in causal_models or model_type == "hf-causal" \
            else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
        # TODO: OOM with auto for seq2seq models, also can OOM with llama
        batch_size = args.batch_size if model in causal_models or model_type == "hf-causal-experimental" \
        batch_size = args.batch_size if model in causal_models or model_type == "hf-causal" \
            else 64 if args.batch_size == "auto" else args.batch_size
        output_path = f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
......@@ -83,12 +84,12 @@ def extract_value(args, results, model, task, err=False):
    if task not in results:
        return 0
    results = results[task]
    if args.acc_norm and "acc_norm" in results:
        return results["acc_norm"] if not err else results["acc_norm_stderr"]
    if "acc" in results:
        return results["acc"] if not err else results["acc_stderr"]
    if (args.perplexity or "word_perplexity") in results:
        return results[args.perplexity or "word_perplexity"] if not err else 0
    if args.acc_norm and "acc_norm,none" in results:
        return results["acc_norm,none"] if not err else results["acc_norm_stderr,none"]
    if "acc,none" in results:
        return results["acc,none"] if not err else results["acc_stderr,none"]
    if (args.perplexity or "word_perplexity") + ",none" in results:
        return results[(args.perplexity or "word_perplexity") + ",none"] if not err else 0
    return 0
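The updated lookups reflect the refactor's metric naming, where each key carries a filter-name suffix, so `acc` becomes `acc,none`. A sketch of reading metrics under that scheme (the results dict here is illustrative):

```python
# Illustrative results dict using the "<metric>,<filter>" key format.
results = {"acc,none": 0.74, "acc_stderr,none": 0.01}

def get_metric(results: dict, metric: str, filter_name: str = "none"):
    return results.get(f"{metric},{filter_name}")

assert get_metric(results, "acc") == 0.74
assert get_metric(results, "acc_stderr") == 0.01
```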
......@@ -110,8 +111,8 @@ def main():
    args.branches = args.branches.split(",") if type(args.branches) == str else args.branches
    args.models = args.models.split(",") if type(args.models) == str else args.models
    args.tasks = tasks.ALL_TASKS if args.tasks == "all_tasks" \
        else utils.pattern_match(args.tasks.split(",") if type(args.tasks) == str else args.tasks, tasks.ALL_TASKS)
    args.tasks = ALL_TASKS if args.tasks == "all_tasks" \
        else utils.pattern_match(args.tasks.split(","), ALL_TASKS) if type(args.tasks) == str else args.tasks
    global initial_branch
    initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()
......
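`utils.pattern_match` expands comma-separated task patterns against the registry, so `--tasks` can take globs. A hedged stand-in built on `fnmatch`; the real helper lives in `lm_eval.utils` and may differ, and the task names here are illustrative:

```python
import fnmatch

# Assumed behavior: glob-style patterns expanded against the task registry.
def pattern_match(patterns, all_tasks):
    matched = set()
    for pattern in patterns:
        matched.update(fnmatch.filter(all_tasks, pattern))
    return sorted(matched)

ALL_TASKS = ["arc_challenge", "arc_easy", "hellaswag"]
assert pattern_match(["arc_*"], ALL_TASKS) == ["arc_challenge", "arc_easy"]
```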