Commit b7c3580a authored by lintangsutawika

reformatted

parent 86db4a4e
@@ -8,7 +8,7 @@ We’d like your help to test it out! You can help by:
1. Trying out your current workloads on the big-refactor branch, and seeing if anything breaks or is counterintuitive,
2. Porting tasks supported in the previous version of the harness to the new YAML configuration format. Please check out our [task implementation guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md) for more information.
If you choose to port a task not yet marked complete on [our checklist](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/README.md), you can contribute it by opening a PR with [Refactor] in the title that includes:
- A command of the form `python main.py --model hf-causal --model_args ..... --tasks <task name> ...` that runs the task on the `master` branch, along with the resulting score
- A command of the form `python main.py --model hf-causal --model_args ..... --tasks <task name> ...` that runs the task on your PR branch targeting `big-refactor`, along with the resulting score, to show that the two implementations produce equal results
@@ -88,7 +88,12 @@ def simple_evaluate(
if model_args is None:
model_args = ""
lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
model_args, {"batch_size": batch_size, "max_batch_size": max_batch_size, "device": device}
model_args,
{
"batch_size": batch_size,
"max_batch_size": max_batch_size,
"device": device,
},
)
else:
assert isinstance(model, lm_eval.api.model.LM)
@@ -112,11 +117,15 @@ def simple_evaluate(
if lm.rank == 0:
# add info about the model and few shot config
results["config"] = {
"model": model if isinstance(model, str) else model.model.config._name_or_path,
"model": model
if isinstance(model, str)
else model.model.config._name_or_path,
"model_args": model_args,
"num_fewshot": num_fewshot,
"batch_size": batch_size,
"batch_sizes": list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else [],
"batch_sizes": list(lm.batch_sizes.values())
if hasattr(lm, "batch_sizes")
else [],
"device": device,
"no_cache": no_cache,
"limit": limit,
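As a rough illustration of the entry point shown in these hunks, here is a minimal sketch of calling `simple_evaluate` directly. The import path, the `hf-causal`/`pretrained=gpt2` model choice, and the `lambada_openai` task are illustrative assumptions; argument defaults may differ between branches.

```python
# Minimal sketch, assuming simple_evaluate is importable from lm_eval.evaluator as the
# hunks above suggest; model, task, and argument values here are illustrative only.
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf-causal",             # resolved via lm_eval.api.registry.get_model
    model_args="pretrained=gpt2",  # parsed by create_from_arg_string, as shown above
    tasks=["lambada_openai"],      # hypothetical task selection
    num_fewshot=0,
    batch_size=1,
    device="cuda:0",
    limit=10,                      # keep the smoke test small
)
print(results["config"])           # the config dict assembled on rank 0 above
```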
@@ -4,7 +4,9 @@ from tqdm import tqdm
import time
def anthropic_completion(client, model, prompt, max_tokens_to_sample, temperature, stop):
def anthropic_completion(
client, model, prompt, max_tokens_to_sample, temperature, stop
):
"""Query Anthropic API for completion.
Retry with back-off until they respond
@@ -46,8 +48,9 @@ class AnthropicLM(BaseLM):
"""
super().__init__()
import anthropic
self.model = model
self.client = anthropic.Client(os.environ['ANTHROPIC_API_KEY'])
self.client = anthropic.Client(os.environ["ANTHROPIC_API_KEY"])
@property
def eot_token_id(self):
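The docstring above says the completion call retries with back-off until the API responds. The snippet below is a generic sketch of that pattern, not the harness's actual retry code; `request_fn` is a hypothetical stand-in for the Anthropic completion call.

```python
# Generic exponential back-off sketch illustrating the docstring above; the real code
# wraps the Anthropic client call and catches that library's specific exception types.
import time


def retry_with_backoff(request_fn, max_retries=5, base_delay=1.0):
    """Call request_fn, doubling the wait after each failure."""
    for attempt in range(max_retries):
        try:
            return request_fn()
        except Exception as err:  # placeholder for the API's error types
            if attempt == max_retries - 1:
                raise
            delay = base_delay * 2**attempt
            print(f"Request failed ({err}); retrying in {delay:.1f}s")
            time.sleep(delay)
```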
@@ -168,8 +168,8 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len
window_end = predicted + window_pred_len
yield (
token_list[window_end - max_seq_len - 1: window_end - 1],
token_list[window_end - window_pred_len: window_end],
token_list[window_end - max_seq_len - 1 : window_end - 1],
token_list[window_end - window_pred_len : window_end],
)
predicted += window_pred_len
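The slices above produce rolling likelihood windows over a long token sequence. Below is a toy sketch of that windowing, assuming the first window is seeded with a prefix token and each later window predicts up to `max_seq_len - context_len + 1` new tokens; it mirrors the yield expression in the diff but is not guaranteed to match the library function exactly.

```python
# Toy sketch of the rolling-window slicing shown above; the first-window handling and
# the pred_len formula are assumptions, only the final yield mirrors the diff verbatim.
def rolling_windows(token_list, prefix_token, max_seq_len, context_len):
    pred_len = max_seq_len - context_len + 1
    first_len = min(max_seq_len, len(token_list))
    # first window: every token is predicted, conditioned on the prefix token
    yield ([prefix_token] + token_list[: first_len - 1], token_list[:first_len])
    predicted = first_len
    while predicted < len(token_list):
        window_pred_len = min(len(token_list) - predicted, pred_len)
        window_end = predicted + window_pred_len
        yield (
            token_list[window_end - max_seq_len - 1 : window_end - 1],
            token_list[window_end - window_pred_len : window_end],
        )
        predicted += window_pred_len


for inp, pred in rolling_windows(list(range(10)), prefix_token=-1, max_seq_len=4, context_len=2):
    print(inp, "->", pred)  # every token appears exactly once on the prediction side
```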
@@ -17,17 +17,27 @@ def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True)
parser.add_argument("--model_args", default="")
parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(sorted(ALL_TASKS)))
parser.add_argument(
"--tasks", default=None, choices=utils.MultiChoice(sorted(ALL_TASKS))
)
parser.add_argument("--config", default=None)
parser.add_argument("--num_fewshot", type=int, default=0)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--max_batch_size", type=int, default=None,
help="Maximal batch size to try with --batch_size auto")
parser.add_argument(
"--max_batch_size",
type=int,
default=None,
help="Maximal batch size to try with --batch_size auto",
)
parser.add_argument("--device", type=str, default=None)
parser.add_argument("--output_path", default=None)
parser.add_argument("--limit", type=float, default=None,
help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.")
parser.add_argument(
"--limit",
type=float,
default=None,
help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.",
)
parser.add_argument("--data_sampling", type=float, default=None)
parser.add_argument("--no_cache", action="store_true")
parser.add_argument("--decontamination_ngrams_path", default=None)
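Per the help text, `--limit` takes either an absolute example count or, when below 1, a fraction of the dataset. The sketch below mirrors that documented behaviour, not the harness's exact internals.

```python
# Sketch of how a --limit value could be interpreted, per the help text above
# (values below 1 act as a fraction of the dataset); illustrative only.
def resolve_limit(limit, n_examples):
    if limit is None:
        return n_examples
    return int(n_examples * limit) if limit < 1.0 else int(limit)


assert resolve_limit(None, 400) == 400   # no limit: evaluate everything
assert resolve_limit(0.25, 400) == 100   # fraction of the total examples
assert resolve_limit(50, 400) == 50      # absolute number of examples
```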
@@ -10,7 +10,12 @@ from lm_eval.api.registry import ALL_TASKS
seq2seq_models = ["google/flan-t5-small"]
causal_models = ["gpt2", "facebook/opt-125m", "EleutherAI/gpt-neo-125m", "EleutherAI/pythia-160m"]
causal_models = [
"gpt2",
"facebook/opt-125m",
"EleutherAI/gpt-neo-125m",
"EleutherAI/pythia-160m",
]
model_names = seq2seq_models + causal_models
@@ -51,22 +56,41 @@ def eval_models(args, branch=None):
results = {}
for model in args.models:
model_type = "hf-causal" if model in causal_models \
else "hf-seq2seq" if model in seq2seq_models else args.model
model_type = (
"hf-causal"
if model in causal_models
else "hf-seq2seq"
if model in seq2seq_models
else args.model
)
model_args = f"pretrained={model},{args.model_args}"
# TODO: split_and_pad_windows in AutoSeq2SeqLM doesn't exist, #527
tasks = args.tasks if model in causal_models or model_type == "hf-causal" \
tasks = (
args.tasks
if model in causal_models or model_type == "hf-causal"
else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
)
# TODO: OOM with auto for seq2seq models, also can OOM with llama
batch_size = args.batch_size if model in causal_models or model_type == "hf-causal" \
else 64 if args.batch_size == "auto" else args.batch_size
output_path = f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
command = f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} " \
f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} " \
f"--batch_size {batch_size} --no_cache --output_path {output_path}"
print(f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}")
batch_size = (
args.batch_size
if model in causal_models or model_type == "hf-causal"
else 64
if args.batch_size == "auto"
else args.batch_size
)
output_path = (
f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
)
command = (
f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} "
f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} "
f"--batch_size {batch_size} --no_cache --output_path {output_path}"
)
print(
f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}"
)
ret = os.system(command)
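The chained conditional expressions above pick a model type (and batch size) per model. Here is a worked example of the model-type selection, using the model lists defined earlier in this script and a hypothetical fallback standing in for `args.model`.

```python
# Worked example of the model_type selection above; the lists match the ones defined
# earlier in this script, and the fallback argument stands in for args.model.
seq2seq_models = ["google/flan-t5-small"]
causal_models = [
    "gpt2",
    "facebook/opt-125m",
    "EleutherAI/gpt-neo-125m",
    "EleutherAI/pythia-160m",
]


def resolve_model_type(model, fallback="hf-causal"):
    if model in causal_models:
        return "hf-causal"
    if model in seq2seq_models:
        return "hf-seq2seq"
    return fallback  # the script uses args.model here


assert resolve_model_type("gpt2") == "hf-causal"
assert resolve_model_type("google/flan-t5-small") == "hf-seq2seq"
assert resolve_model_type("some/other-model") == "hf-causal"  # falls through to the default
```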
@@ -89,7 +113,9 @@ def extract_value(args, results, model, task, err=False):
if "acc,none" in results:
return results["acc,none"] if not err else results["acc_stderr,none"]
if (args.perplexity or "word_perplexity") + ",none" in results:
return results[(args.perplexity or "word_perplexity") + ",none"] if not err else 0
return (
results[(args.perplexity or "word_perplexity") + ",none"] if not err else 0
)
return 0
@@ -109,13 +135,24 @@ def format_diff(args, results1, results2, model, task):
def main():
args = parse_args()
args.branches = args.branches.split(",") if type(args.branches) == str else args.branches
args.branches = (
args.branches.split(",") if type(args.branches) == str else args.branches
)
args.models = args.models.split(",") if type(args.models) == str else args.models
args.tasks = ALL_TASKS if args.tasks == "all_tasks" \
else utils.pattern_match(args.tasks.split(","), ALL_TASKS) if type(args.tasks) == str else args.tasks
args.tasks = (
ALL_TASKS
if args.tasks == "all_tasks"
else utils.pattern_match(args.tasks.split(","), ALL_TASKS)
if type(args.tasks) == str
else args.tasks
)
global initial_branch
initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()
initial_branch = (
subprocess.check_output("git branch --show-current", shell=True)
.decode("ascii")
.strip()
)
# TODO: implement proper timing for each task
# TODO: reduce IO by sharing tasks between models?
@@ -133,10 +170,16 @@ def main():
print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|")
print(f"|--|{'--|' * len(args.models)}")
for task in args.tasks:
print(f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|")
print(
f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|"
)
for branch, branch_results, branch_runtime in runs:
print(f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|")
print(f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|")
print(
f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|"
)
print(
f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|"
)
print("")
print("|branch|runtime|%|")
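The loop above prints one markdown row per task for each branch. A toy rendering of that row format with placeholder cell values (the real cells come from `format_value` and `format_diff`, which are not shown in this diff):

```python
# Toy rendering of the per-task markdown rows printed in main(); cell values are
# placeholders, not real scores.
from pathlib import Path

models = ["gpt2", "facebook/opt-125m"]  # illustrative model list


def cell(model, task):
    return "n/a"  # stand-in for format_value(args, results, model, task)


print(f"|task|{'|'.join(Path(m).name for m in models)}|")
print(f"|--|{'--|' * len(models)}")
for task in ["lambada_openai"]:
    print(f"|{task} (master)|{'|'.join(cell(m, task) for m in models)}|")
```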