Unverified commit 318bd988 authored by Wang, Yi, committed by GitHub

Merge branch 'EleutherAI:master' into fix_ptun

parents 35f1b5a7 25dfd3f6
......@@ -10,11 +10,10 @@ high quality distant supervision for answering the questions.
Homepage: https://nlp.cs.washington.edu/triviaqa/
"""
import inspect
import lm_eval.datasets.triviaqa.triviaqa
import string
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
_CITATION = """
@InProceedings{JoshiTriviaQA2017,
author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
......@@ -29,9 +28,9 @@ _CITATION = """
class TriviaQA(Task):
VERSION = 1
DATASET_PATH = inspect.getfile(lm_eval.datasets.triviaqa.triviaqa)
DATASET_NAME = None
VERSION = 2
DATASET_PATH = "trivia_qa"
DATASET_NAME = "rc.nocontext"
def has_training_docs(self):
return True
......@@ -74,19 +73,27 @@ class TriviaQA(Task):
return ret
def construct_requests(self, doc, ctx):
ret = []
for alias in self._remove_prefixes(doc["answer"]["aliases"]):
_, is_prediction = rf.loglikelihood(ctx, " " + alias)
ret.append(is_prediction)
return ret
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
continuation = rf.greedy_until(ctx, {"until": ["\n", ".", ","]})
return continuation
def process_results(self, doc, results):
return {"acc": float(any(results))}
continuation = results[0].strip().lower().translate(str.maketrans("", "", string.punctuation))
list_of_candidates = [
    alias.lower().translate(str.maketrans("", "", string.punctuation))
    for alias in self._remove_prefixes(doc["answer"]["aliases"])
]
return {"em": float(continuation in list_of_candidates)}
def aggregation(self):
return {
"acc": mean,
"em": mean,
}
def higher_is_better(self):
return {"acc": True}
return {"em": True}
......@@ -5,8 +5,10 @@ import collections
import functools
import inspect
import sys
import fnmatch
from typing import List, Union
import gc
import torch
from omegaconf import OmegaConf
......@@ -63,11 +65,11 @@ def join_iters(iters):
yield from iter
def chunks(iter, n):
def chunks(iter, n=0, fn=None):
arr = []
for x in iter:
for i, x in enumerate(iter):
arr.append(x)
if len(arr) == n:
if len(arr) == (fn(i) if fn else n):
yield arr
arr = []
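The new optional fn parameter lets the chunk size vary with the element index rather than staying fixed at n. A hedged usage sketch, assuming the trailing `if arr: yield arr` from the original function is kept so a final partial chunk is still emitted:

list(chunks(range(7), n=3))
# [[0, 1, 2], [3, 4, 5], [6]]

list(chunks(range(6), fn=lambda i: 2 if i < 2 else 4))
# [[0, 1], [2, 3, 4, 5]] -- fn(i) gives the target length while element i is pending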
......@@ -84,6 +86,42 @@ def group(arr, fn):
return list(res.values())
def _is_json_task(task_name):
return task_name == "json" or task_name.startswith("json=")
class MultiChoice:
def __init__(self, choices):
self.choices = choices
# Simple wildcard support (linux filename patterns)
def __contains__(self, values):
for value in values.split(","):
if len(fnmatch.filter(self.choices, value)) == 0 and not _is_json_task(
value
):
return False
return True
def __iter__(self):
for choice in self.choices:
yield choice
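MultiChoice stands in for a plain choices list in argparse so that comma-separated task arguments with shell-style wildcards still validate: membership holds only if every comma-separated value matches at least one known task name (or names a json task). A small sketch with illustrative task names:

choices = MultiChoice(["lambada_openai", "hellaswag", "winogrande"])
print("lambada*,hellaswag" in choices)     # True: each value matches something
print("lambada*,no_such_task" in choices)  # False: second value matches nothing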
# Returns a list containing all values of the source_list that
# match at least one of the patterns
def pattern_match(patterns, source_list):
task_names = set()
for pattern in patterns:
if _is_json_task(pattern):
task_names.add(pattern)
for matching in fnmatch.filter(source_list, pattern):
task_names.add(matching)
return sorted(list(task_names))
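pattern_match then expands those wildcard patterns into the concrete task names to run, passing json tasks through untouched. For example (task names are illustrative):

all_tasks = ["arc_challenge", "arc_easy", "boolq"]
print(pattern_match(["arc_*"], all_tasks))
# ['arc_challenge', 'arc_easy']
print(pattern_match(["boolq", "json=./my_task.json"], all_tasks))
# ['boolq', 'json=./my_task.json']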
def general_detokenize(string):
string = string.replace(" n't", "n't")
string = string.replace(" )", ")")
......@@ -246,3 +284,8 @@ def run_task_tests(task_list: List[str]):
raise ValueError(
f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
)
def clear_torch_cache():
gc.collect()
torch.cuda.empty_cache()
import argparse
import json
import logging
import fnmatch
import os
from lm_eval import tasks, evaluator
from lm_eval import tasks, evaluator, utils
logging.getLogger("openai").setLevel(logging.WARNING)
def _is_json_task(task_name):
return task_name == "json" or task_name.startswith("json=")
class MultiChoice:
def __init__(self, choices):
self.choices = choices
# Simple wildcard support (linux filename patterns)
def __contains__(self, values):
for value in values.split(","):
if len(fnmatch.filter(self.choices, value)) == 0 and not _is_json_task(
value
):
return False
return True
def __iter__(self):
for choice in self.choices:
yield choice
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True)
parser.add_argument("--model_args", default="")
parser.add_argument("--tasks", default=None, choices=MultiChoice(tasks.ALL_TASKS))
parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS))
parser.add_argument("--provide_description", action="store_true")
parser.add_argument("--num_fewshot", type=int, default=0)
parser.add_argument("--batch_size", type=str, default=None)
parser.add_argument("--max_batch_size", type=int, default=None,
help="Maximal batch size to try with --batch_size auto")
parser.add_argument("--device", type=str, default=None)
parser.add_argument("--output_path", default=None)
parser.add_argument("--limit", type=float, default=None,
......@@ -56,19 +34,6 @@ def parse_args():
return parser.parse_args()
# Returns a list containing all values of the source_list that
# match at least one of the patterns
def pattern_match(patterns, source_list):
task_names = set()
for pattern in patterns:
if _is_json_task(pattern):
task_names.add(pattern)
for matching in fnmatch.filter(source_list, pattern):
task_names.add(matching)
return sorted(list(task_names))
def main():
args = parse_args()
......@@ -82,7 +47,7 @@ def main():
if args.tasks is None:
task_names = tasks.ALL_TASKS
else:
task_names = pattern_match(args.tasks.split(","), tasks.ALL_TASKS)
task_names = utils.pattern_match(args.tasks.split(","), tasks.ALL_TASKS)
print(f"Selected Tasks: {task_names}")
......@@ -97,6 +62,7 @@ def main():
tasks=task_names,
num_fewshot=args.num_fewshot,
batch_size=args.batch_size,
max_batch_size=args.max_batch_size,
device=args.device,
no_cache=args.no_cache,
limit=args.limit,
......@@ -115,9 +81,10 @@ def main():
with open(args.output_path, "w") as f:
f.write(dumped)
batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
print(
f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
)
print(evaluator.make_table(results))
......
import argparse
import json
import os
import subprocess
import time
from pathlib import Path
from lm_eval import tasks, utils
seq2seq_models = ["google/flan-t5-small"]
causal_models = ["gpt2", "facebook/opt-125m", "EleutherAI/gpt-neo-125m", "EleutherAI/pythia-160m"]
model_names = seq2seq_models + causal_models
completion_tasks = ["boolq", "lambada_openai", "winogrande"]
choice_tasks = ["hellaswag", "openbookqa", "piqa"]
perplexity_tasks = ["wikitext"]
generation_tasks = []
task_names = completion_tasks + choice_tasks + perplexity_tasks + generation_tasks
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--branches", default=[])
parser.add_argument("--models", default=model_names)
parser.add_argument("--tasks", default=task_names)
parser.add_argument("--acc_norm", type=bool, default=False)
parser.add_argument("--perplexity", default=None)
# TODO: implement num_fewshot and limit per task, e.g. task1:5,task2:1:100,task3::1000
parser.add_argument("--num_fewshot", type=int, default=0)
parser.add_argument("--limit", type=float, default=None)
# TODO: implement hf-auto to pick between causal and seq2seq models so we don't need this
parser.add_argument("--model", default="hf-causal-experimental")
# Use whatever is faster here
parser.add_argument("--model_args", default="use_accelerate=True,load_in_8bit=True")
parser.add_argument("--batch_size", default="auto")
return parser.parse_args()
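Assuming this script is saved as scripts/regression.py (the path is our guess; it is not named in the diff), a typical comparison of one feature branch against the current branch might be invoked as:

python3 scripts/regression.py --branches my-feature-branch --models gpt2 --tasks lambada_openai --limit 100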
def eval_models(args, branch=None):
if branch is not None:
if os.system(f"git checkout {branch}") != 0:
return {}, 0
branch = branch or initial_branch
start_time = time.time()
results = {}
for model in args.models:
model_type = "hf-causal-experimental" if model in causal_models \
else "hf-seq2seq" if model in seq2seq_models else args.model
model_args = f"pretrained={model},{args.model_args}"
# TODO: split_and_pad_windows in AutoSeq2SeqLM doesn't exist, #527
tasks = args.tasks if model in causal_models or model_type == "hf-causal-experimental" \
else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
# TODO: OOM with auto for seq2seq models, also can OOM with llama
batch_size = args.batch_size if model in causal_models or model_type == "hf-causal-experimental" \
else 64 if args.batch_size == "auto" else args.batch_size
output_path = f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
command = f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} " \
f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} " \
f"--batch_size {batch_size} --no_cache --output_path {output_path}"
print(f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}")
ret = os.system(command)
results[model] = json.load(open(output_path)) if ret == 0 else {"results": {}}
end_time = time.time()
return results, end_time - start_time
def extract_value(args, results, model, task, err=False):
if model not in results:
return 0
results = results[model]["results"]
if task not in results:
return 0
results = results[task]
if args.acc_norm and "acc_norm" in results:
return results["acc_norm"] if not err else results["acc_norm_stderr"]
if "acc" in results:
return results["acc"] if not err else results["acc_stderr"]
if (args.perplexity or "word_perplexity") in results:
return results[args.perplexity or "word_perplexity"] if not err else 0
return 0
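extract_value reduces each (model, task) cell to one number with a fixed fallback order: acc_norm when requested and present, then acc, then the chosen perplexity metric (word_perplexity by default), else 0. A toy illustration with made-up numbers:

results = {"gpt2": {"results": {"wikitext": {"word_perplexity": 30.1}}}}
args = argparse.Namespace(acc_norm=False, perplexity=None)
print(extract_value(args, results, "gpt2", "wikitext"))  # 30.1 (perplexity fallback)
print(extract_value(args, results, "gpt2", "boolq"))     # 0: task missing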
def format_value(args, results, model, task):
val = 100 * extract_value(args, results, model, task)
err = 100 * extract_value(args, results, model, task, err=True)
return f"{val:.2f}{f' ± {err:.2f}' if err != 0 else ''}"
def format_diff(args, results1, results2, model, task):
val1 = 100 * extract_value(args, results1, model, task)
val2 = 100 * extract_value(args, results2, model, task)
diff = val2 - val1
return f"**+{diff:.2f}**" if diff > 0 else f"{diff:.2f}"
def main():
args = parse_args()
args.branches = args.branches.split(",") if type(args.branches) == str else args.branches
args.models = args.models.split(",") if type(args.models) == str else args.models
args.tasks = tasks.ALL_TASKS if args.tasks == "all_tasks" \
else utils.pattern_match(args.tasks.split(",") if type(args.tasks) == str else args.tasks, tasks.ALL_TASKS)
global initial_branch
initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()
# TODO: implement proper timing for each task
# TODO: reduce IO by sharing tasks between models?
results, runtime = eval_models(args)
print(results, runtime)
runs = []
for branch in args.branches:
runs.append((branch, *eval_models(args, branch)))
os.system(f"git checkout {initial_branch}")
print("")
print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|")
print(f"|--|{'--|' * len(args.models)}")
for task in args.tasks:
print(f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|")
for branch, branch_results, branch_runtime in runs:
print(f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|")
print(f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|")
print("")
print("|branch|runtime|%|")
print("|--|--|--|")
print(f"|{initial_branch}|{runtime:.1f}s|100%|")
for branch, _, branch_runtime in runs:
print(f"|{branch}|{branch_runtime:.1f}s|{100 * branch_runtime / runtime:.2f}%|")
if __name__ == "__main__":
main()
......@@ -45,5 +45,6 @@ setuptools.setup(
"multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
"sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
"auto-gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
"anthropic": ["anthropic"],
},
)
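The new extra makes the Anthropic client an opt-in dependency; from a source checkout it can be pulled in with something like:

pip install -e ".[anthropic]"

(the extra name comes from the setup block above; the published package name may differ, so this installs from the source tree rather than PyPI)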