Commit b7c3580a authored by lintangsutawika

reformatted

parent 86db4a4e
@@ -8,7 +8,7 @@ We’d like your help to test it out! you can help by:
 1. Trying out your current workloads on the big-refactor branch, and seeing if anything breaks or is counterintuitive,
 2. Porting tasks supported in the previous version of the harness to the new YAML configuration format. Please check out our [task implementation guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md) for more information.
 If you choose to port a task not yet completed according to [our checklist](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/README.md), then you can contribute it by opening a PR containing [Refactor] in the name with:
 - A command of the form `python main.py --model hf-causal --model_args ..... --tasks <task name> ...` which will run the task in the `master` branch, and what the score is
 - A command of the form `python main.py --model hf-causal --model_args ..... --tasks <task name> ...` to run the task in your PR branch to `big-refactor`, and what the resulting score is, to show that we achieve equality between the two implementations.
......
@@ -88,7 +88,12 @@ def simple_evaluate(
         if model_args is None:
             model_args = ""
         lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
-            model_args, {"batch_size": batch_size, "max_batch_size": max_batch_size, "device": device}
+            model_args,
+            {
+                "batch_size": batch_size,
+                "max_batch_size": max_batch_size,
+                "device": device,
+            },
         )
     else:
         assert isinstance(model, lm_eval.api.model.LM)
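
For context, `create_from_arg_string` merges the comma-separated `--model_args` string with the explicit config dict shown above before instantiating the model class. A minimal sketch of that parsing idea (the `parse_arg_string` helper below is hypothetical and simplified, not the harness's actual implementation):

```python
# Hypothetical, simplified sketch of combining a "key=value,key=value" string
# with an additional config dict -- not the harness's create_from_arg_string.
def parse_arg_string(arg_string, additional_config=None):
    kwargs = {}
    for pair in filter(None, arg_string.split(",")):
        key, value = pair.split("=", 1)
        kwargs[key.strip()] = value.strip()
    # Explicit config values fill in anything the string did not set.
    for key, value in (additional_config or {}).items():
        kwargs.setdefault(key, value)
    return kwargs


print(parse_arg_string("pretrained=gpt2,dtype=float32", {"batch_size": 1, "device": None}))
# {'pretrained': 'gpt2', 'dtype': 'float32', 'batch_size': 1, 'device': None}
```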
@@ -112,11 +117,15 @@ def simple_evaluate(
     if lm.rank == 0:
         # add info about the model and few shot config
         results["config"] = {
-            "model": model if isinstance(model, str) else model.model.config._name_or_path,
+            "model": model
+            if isinstance(model, str)
+            else model.model.config._name_or_path,
             "model_args": model_args,
             "num_fewshot": num_fewshot,
             "batch_size": batch_size,
-            "batch_sizes": list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else [],
+            "batch_sizes": list(lm.batch_sizes.values())
+            if hasattr(lm, "batch_sizes")
+            else [],
             "device": device,
             "no_cache": no_cache,
             "limit": limit,
......
@@ -4,7 +4,9 @@ from tqdm import tqdm
 import time
 
 
-def anthropic_completion(client, model, prompt, max_tokens_to_sample, temperature, stop):
+def anthropic_completion(
+    client, model, prompt, max_tokens_to_sample, temperature, stop
+):
     """Query Anthropic API for completion.
 
     Retry with back-off until they respond
@@ -46,8 +48,9 @@ class AnthropicLM(BaseLM):
         """
        super().__init__()
         import anthropic
+
         self.model = model
-        self.client = anthropic.Client(os.environ['ANTHROPIC_API_KEY'])
+        self.client = anthropic.Client(os.environ["ANTHROPIC_API_KEY"])
 
     @property
     def eot_token_id(self):
......
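
The `anthropic_completion` docstring in the hunk above says it retries with back-off until the API responds. A generic sketch of that retry pattern (illustrative only; the exception handling and delays here are placeholders, not the harness's actual code):

```python
import time


def retry_with_backoff(fn, max_retries=5, initial_delay=1.0):
    """Call fn(), sleeping exponentially longer after each failure."""
    delay = initial_delay
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception as exc:  # in practice, catch the API's rate-limit error
            if attempt == max_retries - 1:
                raise
            print(f"Request failed ({exc}); retrying in {delay:.1f}s")
            time.sleep(delay)
            delay *= 2
```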
@@ -168,8 +168,8 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len
         window_end = predicted + window_pred_len
 
         yield (
-            token_list[window_end - max_seq_len - 1: window_end - 1],
-            token_list[window_end - window_pred_len: window_end],
+            token_list[window_end - max_seq_len - 1 : window_end - 1],
+            token_list[window_end - window_pred_len : window_end],
         )
 
         predicted += window_pred_len
......
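
The slices above come from rolling-window perplexity scoring: a long document is split into back-to-back prediction windows, while each predicted token still sees up to `max_seq_len` preceding tokens. A simplified sketch of that behavior (assuming the maximum-context case, i.e. `context_len=1`; the harness's actual `get_rolling_token_windows` also takes a configurable `context_len`):

```python
def rolling_token_windows(token_list, prefix_token, max_seq_len):
    """Simplified sketch: yield (context, target) pairs for rolling scoring."""
    first_len = min(max_seq_len, len(token_list))
    # The very first token has no real context, so a prefix token stands in.
    yield [prefix_token] + token_list[: first_len - 1], token_list[:first_len]
    predicted = first_len
    while predicted < len(token_list):
        window_pred_len = min(max_seq_len, len(token_list) - predicted)
        window_end = predicted + window_pred_len
        # Same slices as in the diff: the context ends one token before the
        # window ends and reaches back max_seq_len tokens.
        yield (
            token_list[window_end - max_seq_len - 1 : window_end - 1],
            token_list[window_end - window_pred_len : window_end],
        )
        predicted += window_pred_len


# e.g. list(rolling_token_windows(list(range(10)), prefix_token=-1, max_seq_len=4))
# -> ([-1, 0, 1, 2], [0, 1, 2, 3]), ([3, 4, 5, 6], [4, 5, 6, 7]), ([5, 6, 7, 8], [8, 9])
```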
@@ -17,17 +17,27 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", required=True)
     parser.add_argument("--model_args", default="")
-    parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(sorted(ALL_TASKS)))
+    parser.add_argument(
+        "--tasks", default=None, choices=utils.MultiChoice(sorted(ALL_TASKS))
+    )
     parser.add_argument("--config", default=None)
     parser.add_argument("--num_fewshot", type=int, default=0)
     parser.add_argument("--batch_size", type=int, default=1)
-    parser.add_argument("--max_batch_size", type=int, default=None,
-                        help="Maximal batch size to try with --batch_size auto")
+    parser.add_argument(
+        "--max_batch_size",
+        type=int,
+        default=None,
+        help="Maximal batch size to try with --batch_size auto",
+    )
     parser.add_argument("--device", type=str, default=None)
     parser.add_argument("--output_path", default=None)
-    parser.add_argument("--limit", type=float, default=None,
-                        help="Limit the number of examples per task. "
-                             "If <1, limit is a percentage of the total number of examples.")
+    parser.add_argument(
+        "--limit",
+        type=float,
+        default=None,
+        help="Limit the number of examples per task. "
+        "If <1, limit is a percentage of the total number of examples.",
+    )
     parser.add_argument("--data_sampling", type=float, default=None)
     parser.add_argument("--no_cache", action="store_true")
     parser.add_argument("--decontamination_ngrams_path", default=None)
......
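
Per the `--limit` help text above, the flag is read either as an absolute number of examples or, when below 1, as a fraction of the task's examples. A sketch of that interpretation (the `resolve_limit` helper is hypothetical, shown only to illustrate the intended semantics):

```python
# Hypothetical helper illustrating the --limit semantics described above.
def resolve_limit(limit, num_docs):
    if limit is None:
        return num_docs  # no limit: evaluate every example
    if limit < 1.0:
        return max(1, int(num_docs * limit))  # fraction of the task
    return min(num_docs, int(limit))  # absolute count


print(resolve_limit(0.1, 2500))  # 250
print(resolve_limit(100, 2500))  # 100
```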
@@ -10,7 +10,12 @@ from lm_eval.api.registry import ALL_TASKS
 seq2seq_models = ["google/flan-t5-small"]
-causal_models = ["gpt2", "facebook/opt-125m", "EleutherAI/gpt-neo-125m", "EleutherAI/pythia-160m"]
+causal_models = [
+    "gpt2",
+    "facebook/opt-125m",
+    "EleutherAI/gpt-neo-125m",
+    "EleutherAI/pythia-160m",
+]
 model_names = seq2seq_models + causal_models
@@ -51,22 +56,41 @@ def eval_models(args, branch=None):
     results = {}
 
     for model in args.models:
-        model_type = "hf-causal" if model in causal_models \
-            else "hf-seq2seq" if model in seq2seq_models else args.model
+        model_type = (
+            "hf-causal"
+            if model in causal_models
+            else "hf-seq2seq"
+            if model in seq2seq_models
+            else args.model
+        )
         model_args = f"pretrained={model},{args.model_args}"
         # TODO: split_and_pad_windows in AutoSeq2SeqLM doesn"t exist, #527
-        tasks = args.tasks if model in causal_models or model_type == "hf-causal" \
-            else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
+        tasks = (
+            args.tasks
+            if model in causal_models or model_type == "hf-causal"
+            else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
+        )
         # TODO: OOM with auto for seq2seq models, also can OOM with llama
-        batch_size = args.batch_size if model in causal_models or model_type == "hf-causal" \
-            else 64 if args.batch_size == "auto" else args.batch_size
-        output_path = f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
-        command = f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} " \
-                  f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} " \
-                  f"--batch_size {batch_size} --no_cache --output_path {output_path}"
-        print(f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}")
+        batch_size = (
+            args.batch_size
+            if model in causal_models or model_type == "hf-causal"
+            else 64
+            if args.batch_size == "auto"
+            else args.batch_size
+        )
+        output_path = (
+            f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
+        )
+        command = (
+            f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} "
+            f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} "
+            f"--batch_size {batch_size} --no_cache --output_path {output_path}"
+        )
+        print(
+            f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}"
+        )
 
         ret = os.system(command)
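
For readers unpicking the black-wrapped nested conditional expressions above, the logic is equivalent to the plain if/elif chain below (a restatement for readability; the function wrapper and parameter names are ours, not part of the commit):

```python
def select_model_type_and_batch_size(
    model, args_model, args_batch_size, causal_models, seq2seq_models
):
    """Plain if/elif restatement of the nested conditional expressions above."""
    if model in causal_models:
        model_type = "hf-causal"
    elif model in seq2seq_models:
        model_type = "hf-seq2seq"
    else:
        model_type = args_model

    if model in causal_models or model_type == "hf-causal":
        batch_size = args_batch_size
    elif args_batch_size == "auto":
        batch_size = 64  # auto batch sizing can OOM for seq2seq models (see the TODO)
    else:
        batch_size = args_batch_size
    return model_type, batch_size
```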
@@ -89,7 +113,9 @@ def extract_value(args, results, model, task, err=False):
     if "acc,none" in results:
         return results["acc,none"] if not err else results["acc_stderr,none"]
     if (args.perplexity or "word_perplexity") + ",none" in results:
-        return results[(args.perplexity or "word_perplexity") + ",none"] if not err else 0
+        return (
+            results[(args.perplexity or "word_perplexity") + ",none"] if not err else 0
+        )
     return 0
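
The `"acc,none"` lookups above reflect the refactor's `metric,filter` result keys, where `none` is the default (unfiltered) filter. An illustrative fragment of that shape (the numeric values are placeholders, not real scores):

```python
# Illustrative shape of a per-task results entry; numbers are placeholders.
example_task_results = {
    "acc,none": 0.25,
    "acc_stderr,none": 0.01,
    "word_perplexity,none": 12.3,
}
```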
@@ -109,13 +135,24 @@ def format_diff(args, results1, results2, model, task):
 def main():
     args = parse_args()
 
-    args.branches = args.branches.split(",") if type(args.branches) == str else args.branches
+    args.branches = (
+        args.branches.split(",") if type(args.branches) == str else args.branches
+    )
     args.models = args.models.split(",") if type(args.models) == str else args.models
-    args.tasks = ALL_TASKS if args.tasks == "all_tasks" \
-        else utils.pattern_match(args.tasks.split(","), ALL_TASKS) if type(args.tasks) == str else args.tasks
+    args.tasks = (
+        ALL_TASKS
+        if args.tasks == "all_tasks"
+        else utils.pattern_match(args.tasks.split(","), ALL_TASKS)
+        if type(args.tasks) == str
+        else args.tasks
+    )
 
     global initial_branch
-    initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()
+    initial_branch = (
+        subprocess.check_output("git branch --show-current", shell=True)
+        .decode("ascii")
+        .strip()
+    )
 
     # TODO: implement proper timing for each task
     # TODO: reduce IO by sharing tasks between models?
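
As a side note on the `initial_branch` snippet above: `git branch --show-current` requires Git 2.22 or newer, and the same value can be read without `shell=True` by passing the argument list directly (a minor alternative, not part of the commit):

```python
import subprocess

# Equivalent to the shell=True call above, without invoking a shell.
initial_branch = subprocess.check_output(
    ["git", "branch", "--show-current"], text=True
).strip()
```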
@@ -133,10 +170,16 @@ def main():
     print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|")
     print(f"|--|{'--|' * len(args.models)}")
     for task in args.tasks:
-        print(f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|")
+        print(
+            f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|"
+        )
         for branch, branch_results, branch_runtime in runs:
-            print(f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|")
-            print(f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|")
+            print(
+                f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|"
+            )
+            print(
+                f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|"
+            )
 
     print("")
     print("|branch|runtime|%|")
......