"vscode:/vscode.git/clone" did not exist on "cec5fdd14430455153802ee3a8abeecf31e9653d"
Unverified commit 318bd988, authored by Wang, Yi and committed by GitHub

Merge branch 'EleutherAI:master' into fix_ptun

parents 35f1b5a7 25dfd3f6
@@ -10,11 +10,10 @@ high quality distant supervision for answering the questions.

 Homepage: https://nlp.cs.washington.edu/triviaqa/
 """
 import inspect
-import lm_eval.datasets.triviaqa.triviaqa
+import string
 from lm_eval.base import Task, rf
 from lm_eval.metrics import mean

 _CITATION = """
 @InProceedings{JoshiTriviaQA2017,
     author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
@@ -29,9 +28,9 @@ _CITATION = """

 class TriviaQA(Task):
-    VERSION = 1
-    DATASET_PATH = inspect.getfile(lm_eval.datasets.triviaqa.triviaqa)
-    DATASET_NAME = None
+    VERSION = 2
+    DATASET_PATH = "trivia_qa"
+    DATASET_NAME = "rc.nocontext"

     def has_training_docs(self):
         return True
@@ -74,19 +73,27 @@ class TriviaQA(Task):
         return ret

     def construct_requests(self, doc, ctx):
-        ret = []
-        for alias in self._remove_prefixes(doc["answer"]["aliases"]):
-            _, is_prediction = rf.loglikelihood(ctx, " " + alias)
-            ret.append(is_prediction)
-        return ret
+        """Uses RequestFactory to construct Requests and returns an iterable of
+        Requests which will be sent to the LM.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param ctx: str
+            The context string, generated by fewshot_context. This includes the natural
+            language description, as well as the few shot examples, and the question
+            part of the document for `doc`.
+        """
+        continuation = rf.greedy_until(ctx, {"until": ["\n", ".", ","]})
+        return continuation

     def process_results(self, doc, results):
-        return {"acc": float(any(results))}
+        continuation = results[0].strip().lower().translate(str.maketrans('', '', string.punctuation))
+        list_of_candidates = [alias.lower().translate(str.maketrans('', '', string.punctuation)) for alias in self._remove_prefixes(doc["answer"]["aliases"])]
+        return {"em": float(continuation in list_of_candidates)}

     def aggregation(self):
         return {
-            "acc": mean,
+            "em": mean,
         }

     def higher_is_better(self):
-        return {"acc": True}
+        return {"em": True}
@@ -5,8 +5,10 @@ import collections
 import functools
 import inspect
 import sys
+import fnmatch
 from typing import List, Union
+import gc

 import torch
 from omegaconf import OmegaConf
@@ -63,11 +65,11 @@ def join_iters(iters):
         yield from iter


-def chunks(iter, n):
+def chunks(iter, n=0, fn=None):
     arr = []
-    for x in iter:
+    for i, x in enumerate(iter):
         arr.append(x)
-        if len(arr) == n:
+        if len(arr) == (fn(i) if fn else n):
             yield arr
             arr = []
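The reworked chunks() keeps the old fixed-size behaviour (n) but can also take a callable fn, so the chunk size may vary with the element index. A self-contained sketch of the new signature (with a remainder yield added so the sketch is usable on its own; the tail of the real function is outside the hunk shown above):

def chunks(iter, n=0, fn=None):
    # Group items into lists of length n, or fn(i) when a callable is supplied.
    arr = []
    for i, x in enumerate(iter):
        arr.append(x)
        if len(arr) == (fn(i) if fn else n):
            yield arr
            arr = []
    if arr:
        yield arr

print(list(chunks(range(6), n=4)))             # [[0, 1, 2, 3], [4, 5]]
print(list(chunks(range(6), fn=lambda i: 2)))  # [[0, 1], [2, 3], [4, 5]]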
@@ -84,6 +86,42 @@ def group(arr, fn):
     return list(res.values())


+def _is_json_task(task_name):
+    return task_name == "json" or task_name.startswith("json=")
+
+
+class MultiChoice:
+    def __init__(self, choices):
+        self.choices = choices
+
+    # Simple wildcard support (linux filename patterns)
+    def __contains__(self, values):
+        for value in values.split(","):
+            if len(fnmatch.filter(self.choices, value)) == 0 and not _is_json_task(
+                value
+            ):
+                return False
+        return True
+
+    def __iter__(self):
+        for choice in self.choices:
+            yield choice
+
+
+# Returns a list containing all values of the source_list that
+# match at least one of the patterns
+def pattern_match(patterns, source_list):
+    task_names = set()
+    for pattern in patterns:
+        if _is_json_task(pattern):
+            task_names.add(pattern)
+
+        for matching in fnmatch.filter(source_list, pattern):
+            task_names.add(matching)
+    return sorted(list(task_names))
+
+
 def general_detokenize(string):
     string = string.replace(" n't", "n't")
     string = string.replace(" )", ")")
@@ -246,3 +284,8 @@ def run_task_tests(task_list: List[str]):
         raise ValueError(
             f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
         )
+
+
+def clear_torch_cache():
+    gc.collect()
+    torch.cuda.empty_cache()
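clear_torch_cache pairs a full garbage-collection pass with releasing PyTorch's cached CUDA blocks, which helps between model loads. An illustrative, GPU-gated usage sketch (not part of the diff):

import torch
from lm_eval import utils  # assumes this patched lm_eval is installed

if torch.cuda.is_available():
    x = torch.empty(1024, 1024, device="cuda")
    del x
    utils.clear_torch_cache()
    print(torch.cuda.memory_reserved())  # reserved memory typically drops after empty_cache()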
 import argparse
 import json
 import logging
-import fnmatch
 import os

-from lm_eval import tasks, evaluator
+from lm_eval import tasks, evaluator, utils

 logging.getLogger("openai").setLevel(logging.WARNING)


-def _is_json_task(task_name):
-    return task_name == "json" or task_name.startswith("json=")
-
-
-class MultiChoice:
-    def __init__(self, choices):
-        self.choices = choices
-
-    # Simple wildcard support (linux filename patterns)
-    def __contains__(self, values):
-        for value in values.split(","):
-            if len(fnmatch.filter(self.choices, value)) == 0 and not _is_json_task(
-                value
-            ):
-                return False
-        return True
-
-    def __iter__(self):
-        for choice in self.choices:
-            yield choice
-
-
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", required=True)
     parser.add_argument("--model_args", default="")
-    parser.add_argument("--tasks", default=None, choices=MultiChoice(tasks.ALL_TASKS))
+    parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS))
     parser.add_argument("--provide_description", action="store_true")
     parser.add_argument("--num_fewshot", type=int, default=0)
     parser.add_argument("--batch_size", type=str, default=None)
+    parser.add_argument("--max_batch_size", type=int, default=None,
+                        help="Maximal batch size to try with --batch_size auto")
     parser.add_argument("--device", type=str, default=None)
     parser.add_argument("--output_path", default=None)
     parser.add_argument("--limit", type=float, default=None,
@@ -56,19 +34,6 @@ def parse_args():
     return parser.parse_args()


-# Returns a list containing all values of the source_list that
-# match at least one of the patterns
-def pattern_match(patterns, source_list):
-    task_names = set()
-    for pattern in patterns:
-        if _is_json_task(pattern):
-            task_names.add(pattern)
-
-        for matching in fnmatch.filter(source_list, pattern):
-            task_names.add(matching)
-    return sorted(list(task_names))
-
-
 def main():
     args = parse_args()
@@ -82,7 +47,7 @@ def main():
     if args.tasks is None:
         task_names = tasks.ALL_TASKS
     else:
-        task_names = pattern_match(args.tasks.split(","), tasks.ALL_TASKS)
+        task_names = utils.pattern_match(args.tasks.split(","), tasks.ALL_TASKS)

     print(f"Selected Tasks: {task_names}")
@@ -97,6 +62,7 @@ def main():
         tasks=task_names,
         num_fewshot=args.num_fewshot,
         batch_size=args.batch_size,
+        max_batch_size=args.max_batch_size,
         device=args.device,
         no_cache=args.no_cache,
         limit=args.limit,
@@ -115,9 +81,10 @@ def main():
         with open(args.output_path, "w") as f:
             f.write(dumped)

+    batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
+
     print(
         f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
-        f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
+        f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
     )
     print(evaluator.make_table(results))
...
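With the new flag, the automatic batch-size search started by --batch_size auto can be capped. An illustrative invocation (model and task are taken from the regression script below, not from this file's diff):

python main.py --model hf-causal-experimental \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks lambada_openai \
    --batch_size auto --max_batch_size 32 --no_cache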
import argparse
import json
import os
import subprocess
import time
from pathlib import Path
from lm_eval import tasks, utils
seq2seq_models = ["google/flan-t5-small"]
causal_models = ["gpt2", "facebook/opt-125m", "EleutherAI/gpt-neo-125m", "EleutherAI/pythia-160m"]
model_names = seq2seq_models + causal_models
completion_tasks = ["boolq", "lambada_openai", "winogrande"]
choice_tasks = ["hellaswag", "openbookqa", "piqa"]
perplexity_tasks = ["wikitext"]
generation_tasks = []
task_names = completion_tasks + choice_tasks + perplexity_tasks + generation_tasks
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--branches", default=[])
parser.add_argument("--models", default=model_names)
parser.add_argument("--tasks", default=task_names)
parser.add_argument("--acc_norm", type=bool, default=False)
parser.add_argument("--perplexity", default=None)
# TODO: implement num_fewshot and limit per task, e.g. task1:5,task2:1:100,task3::1000
parser.add_argument("--num_fewshot", type=int, default=0)
parser.add_argument("--limit", type=float, default=None)
# TODO: implement hf-auto to pick between causal and seq2seq models so we don't need this
parser.add_argument("--model", default="hf-causal-experimental")
# Use whatever is faster here
parser.add_argument("--model_args", default="use_accelerate=True,load_in_8bit=True")
parser.add_argument("--batch_size", default="auto")
return parser.parse_args()
def eval_models(args, branch=None):
if branch is not None:
if os.system(f"git checkout {branch}") != 0:
return {}, 0
branch = branch or initial_branch
start_time = time.time()
results = {}
for model in args.models:
model_type = "hf-causal-experimental" if model in causal_models \
else "hf-seq2seq" if model in seq2seq_models else args.model
model_args = f"pretrained={model},{args.model_args}"
        # TODO: split_and_pad_windows in AutoSeq2SeqLM doesn't exist, #527
tasks = args.tasks if model in causal_models or model_type == "hf-causal-experimental" \
else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
# TODO: OOM with auto for seq2seq models, also can OOM with llama
batch_size = args.batch_size if model in causal_models or model_type == "hf-causal-experimental" \
else 64 if args.batch_size == "auto" else args.batch_size
output_path = f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
command = f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} " \
f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} " \
f"--batch_size {batch_size} --no_cache --output_path {output_path}"
print(f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}")
ret = os.system(command)
results[model] = json.load(open(output_path)) if ret == 0 else {"results": {}}
end_time = time.time()
return results, end_time - start_time
def extract_value(args, results, model, task, err=False):
if model not in results:
return 0
results = results[model]["results"]
if task not in results:
return 0
results = results[task]
if args.acc_norm and "acc_norm" in results:
return results["acc_norm"] if not err else results["acc_norm_stderr"]
if "acc" in results:
return results["acc"] if not err else results["acc_stderr"]
if (args.perplexity or "word_perplexity") in results:
return results[args.perplexity or "word_perplexity"] if not err else 0
return 0
def format_value(args, results, model, task):
val = 100 * extract_value(args, results, model, task)
err = 100 * extract_value(args, results, model, task, err=True)
return f"{val:.2f}{f' ± {err:.2f}' if err != 0 else ''}"
def format_diff(args, results1, results2, model, task):
val1 = 100 * extract_value(args, results1, model, task)
val2 = 100 * extract_value(args, results2, model, task)
diff = val2 - val1
return f"**+{diff:.2f}**" if diff > 0 else f"{diff:.2f}"
def main():
args = parse_args()
args.branches = args.branches.split(",") if type(args.branches) == str else args.branches
args.models = args.models.split(",") if type(args.models) == str else args.models
args.tasks = tasks.ALL_TASKS if args.tasks == "all_tasks" \
else utils.pattern_match(args.tasks.split(",") if type(args.tasks) == str else args.tasks, tasks.ALL_TASKS)
global initial_branch
initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()
# TODO: implement proper timing for each task
# TODO: reduce IO by sharing tasks between models?
results, runtime = eval_models(args)
print(results, runtime)
runs = []
for branch in args.branches:
runs.append((branch, *eval_models(args, branch)))
os.system(f"git checkout {initial_branch}")
print("")
print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|")
print(f"|--|{'--|' * len(args.models)}")
for task in args.tasks:
print(f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|")
for branch, branch_results, branch_runtime in runs:
print(f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|")
print(f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|")
print("")
print("|branch|runtime|%|")
print("|--|--|--|")
print(f"|{initial_branch}|{runtime:.1f}s|100%|")
for branch, _, branch_runtime in runs:
print(f"|{branch}|{branch_runtime:.1f}s|{100 * branch_runtime / runtime:.2f}%|")
if __name__ == "__main__":
main()
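The script above checks out each requested branch, shells out to main.py for every model/task pair, and prints markdown tables comparing scores and runtimes against the current branch. An illustrative run (the file name is assumed here; the diff view does not show it):

python3 regression.py --models gpt2 --tasks lambada_openai,hellaswag --limit 100 --branches some-feature-branch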
@@ -45,5 +45,6 @@ setuptools.setup(
         "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
         "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
         "auto-gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
+        "anthropic": ["anthropic"],
     },
 )
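The new extra makes the Anthropic client installable through pip extras, e.g. from a checkout of the repository:

pip install -e ".[anthropic]"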