gaoqiong / lm-evaluation-harness · Commits

Commit 6862fa7d (unverified)
Authored Jul 14, 2023 by Lintang Sutawika; committed by GitHub on Jul 14, 2023

Merge pull request #676 from EleutherAI/new-flags

[Refactor] Miscellaneous fixes

Parents: 98c85d73, f7dde0c3

Showing 10 changed files with 90 additions and 65 deletions (+90 −65)
lm_eval/api/instance.py                            +7   −7
lm_eval/api/task.py                                +12  −15
lm_eval/evaluator.py                               +34  −17
lm_eval/models/huggingface.py                      +2   −0
lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py      +4   −6
lm_eval/tasks/pubmedqa/pubmedqa.yaml               +1   −1
lm_eval/tasks/race/preprocess_race.py              +8   −4
lm_eval/tasks/swag/swag.yaml                       +1   −1
lm_eval/tasks/winogrande/preprocess_winogrande.py  +4   −1
main.py                                            +17  −13
lm_eval/api/instance.py

@@ -4,13 +4,13 @@ from typing import Literal, Tuple

 @dataclass
 class Instance:
-    request_type: str = Literal["loglikelihood", "loglikelihood_rolling", "greedy_until"]
-    doc: dict = None
-    arguments: tuple = None
-    idx: int = None
-    metadata: tuple = Tuple[str, int, int]  # TODO: better typehints here
+    request_type: Literal["loglikelihood", "loglikelihood_rolling", "greedy_until"]
+    doc: dict
+    arguments: tuple
+    idx: int
+    metadata: Tuple[str, int, int] = field(
+        default_factory=lambda: (None, None, None)
+    )  # TODO: better typehints here

     resps: list = field(default_factory=list)
     filtered_resps: dict = field(default_factory=dict)
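Read alongside the diff, a minimal construction sketch (values are illustrative; the (task_name, doc_id, repeats) reading of metadata is an assumption, not stated here): the first four fields are now required rather than defaulting to None, and metadata falls back to (None, None, None).

    from lm_eval.api.instance import Instance

    inst = Instance(
        request_type="loglikelihood",
        doc={"question": "1 + 1 = ?", "answer": "2"},  # illustrative document
        arguments=("1 + 1 = ?", " 2"),                 # (context, continuation) pair
        idx=0,
        metadata=("some_task", 0, 1),                  # assumed (task_name, doc_id, repeats)
    )
    print(inst.resps, inst.filtered_resps)             # -> [] {} until responses are recorded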
lm_eval/api/task.py

@@ -8,6 +8,7 @@ import evaluate
 import random
 import itertools
+import functools
 from tqdm import tqdm
 import datasets
 import numpy as np
@@ -217,8 +218,8 @@ class Task(abc.ABC):
             self._filters.append(filter_pipeline)

         self.sampler = samplers.Sampler(
-            list(self.fewshot_docs()), self, rnd=random.Random()
-        )
+            # TODO: pass the correct docs in here
+            list(self.fewshot_docs()), self, rnd=random.Random(1234)
+        )

     def download(self, data_dir=None, cache_dir=None, download_mode=None):
         """Downloads and returns the task dataset.
@@ -366,13 +367,18 @@ class Task(abc.ABC):
             False
         ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"

+        eval_logger.info(
+            f"Building contexts for task '{self._config.task}' on rank {rank}..."
+        )
+
         instances = []
         for doc_id, doc in utils.create_iterator(enumerate(docs), rank, world_size, limit):
             # sample fewshot context #TODO: need to offset doc_id by rank now!
             fewshot_ctx = self.fewshot_context(
-                doc, self._config.num_fewshot, rnd=random.Random()
+                doc,
+                self._config.num_fewshot,
             )

             # TODO: we should override self._config.repeats if doing greedy gen so users don't waste time+compute
@@ -453,7 +459,7 @@ class Task(abc.ABC):
         return len(re.split(r"\s+", doc))

     @utils.positional_deprecated
-    def fewshot_context(self, doc, num_fewshot, rnd=None):
+    def fewshot_context(self, doc, num_fewshot):
         """Returns a fewshot context string that is made up of a prepended description
         (if provided), the `num_fewshot` number of examples, and an appended prompt example.

@@ -461,15 +467,9 @@ class Task(abc.ABC):
             The document as returned from training_docs, validation_docs, or test_docs.
         :param num_fewshot: int
             The number of fewshot examples to provide in the returned context string.
-        :param rnd: random.Random
-            The pseudo-random number generator used to randomly sample examples.
-            WARNING: This is currently a required arg although it's optionalized with a default `None`.
         :returns: str
             The fewshot context.
         """
-        assert (
-            rnd is not None
-        ), "A `random.Random` generator argument must be provided to `rnd`"
         if num_fewshot == 0:
             # always prepend the (possibly empty) task description
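The corresponding call-site change, sketched with placeholder task and doc objects: callers drop the rnd argument, since few-shot example sampling now goes through the task's sampler, which is seeded once with random.Random(1234) at construction.

    # old signature (removed): rnd was nominally optional but asserted non-None
    # ctx = task.fewshot_context(doc, num_fewshot, rnd=random.Random(1234))

    # new signature: the sampler owns the RNG
    ctx = task.fewshot_context(doc, num_fewshot)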
@@ -625,7 +625,7 @@ class ConfigurableTask(Task):
         if self.fewshot_docs() is not None:
             self.sampler = samplers.Sampler(
-                list(self.fewshot_docs()), self, rnd=random.Random()
+                list(self.fewshot_docs()), self, rnd=random.Random(1234)
             )

     def download(self, dataset_kwargs=None):
@@ -1004,13 +1004,10 @@ class PerplexityTask(Task):
         assert k == 0
         return []

-    def fewshot_context(self, doc, num_fewshot, rnd=None):
+    def fewshot_context(self, doc, num_fewshot):
         assert (
             num_fewshot == 0
         ), "The number of fewshot examples must be 0 for perplexity tasks."
-        assert (
-            rnd is not None
-        ), "A `random.Random` generator argument must be provided to `rnd`."
         return ""
lm_eval/evaluator.py

@@ -45,6 +45,7 @@ def simple_evaluate(
     check_integrity=False,
     decontamination_ngrams_path=None,
     write_out=False,
+    log_samples=True,
 ):
     """Instantiate and evaluate a model on a list of tasks.
@@ -72,12 +73,17 @@ def simple_evaluate(
     :param check_integrity: bool
         Whether to run the relevant part of the test suite for the tasks
     :param write_out: bool
-        If True, write details about prompts and logits to json for all tasks
+        If True, write out an example document and model input for checking task integrity
+    :param log_samples: bool
+        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
     :return
         Dictionary of results
     """
-    random.seed(1234)
+    random.seed(0)
     np.random.seed(1234)
+    torch.manual_seed(
+        1234
+    )  # TODO: this may affect training runs that are run with evaluation mid-run.

     assert tasks != [], "No tasks specified"
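A hedged usage sketch of simple_evaluate with the new flag; the model name, model_args, and task below are placeholders, not taken from this commit.

    from lm_eval import evaluator

    results = evaluator.simple_evaluate(
        model="hf",                      # placeholder model registry name
        model_args="pretrained=gpt2",    # placeholder checkpoint
        tasks=["pubmedqa"],
        num_fewshot=0,
        write_out=False,   # now only emits an example document/model input for integrity checks
        log_samples=True,  # keep per-sample outputs in the returned dictionary
    )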
@@ -118,6 +124,7 @@ def simple_evaluate(
         bootstrap_iters=bootstrap_iters,
         decontamination_ngrams_path=decontamination_ngrams_path,
         write_out=write_out,
+        log_samples=log_samples,
     )

     if lm.rank == 0:
@@ -154,6 +161,7 @@ def evaluate(
     bootstrap_iters=100000,
     decontamination_ngrams_path=None,
     write_out=False,
+    log_samples=True,
 ):
     """Instantiate and evaluate a model on a list of tasks.

@@ -168,7 +176,9 @@ def evaluate(
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics
     :param write_out: bool
-        If True, write all prompts, logits and metrics to json for offline analysis
+        If True, write out an example document and model input for checking task integrity
+    :param log_samples: bool
+        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
     :return
         Dictionary of results
     """
@@ -213,7 +223,10 @@ def evaluate(
         # aggregate Instances by LM method requested to get output.
         reqtype = (
             "loglikelihood"
-            if (task.OUTPUT_TYPE == "multiple_choice" or task.OUTPUT_TYPE == "winograd_schema")
+            if (
+                task.OUTPUT_TYPE == "multiple_choice"
+                or task.OUTPUT_TYPE == "winograd_schema"
+            )
             else task.OUTPUT_TYPE
         )  # TODO: this is hacky, fix in task.py
         requests[reqtype].extend(task.instances)
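The expression above reads as: multiple_choice and winograd_schema tasks are served by loglikelihood requests, everything else by its own output type. A compact, equivalent sketch (requests is the defaultdict built earlier in evaluate()):

    reqtype = (
        "loglikelihood"
        if task.OUTPUT_TYPE in ("multiple_choice", "winograd_schema")  # equivalent membership test
        else task.OUTPUT_TYPE
    )
    requests[reqtype].extend(task.instances)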
@@ -279,17 +292,18 @@ def evaluate(
                 metrics = task.process_results(
                     doc, [req.filtered_resps[key] for req in requests]
                 )
-                target = task.doc_to_target(doc)
-                example = {
-                    "doc_id": doc_id,
-                    "doc": doc,
-                    "target": target,
-                    "arguments": requests[0].args,
-                    "resps": [req.resps for req in requests],
-                    "filtered_resps": [req.filtered_resps[key] for req in requests],
-                }
-                example.update(metrics)
-                samples[task_name].append(example)
+                if log_samples:
+                    target = task.doc_to_target(doc)
+                    example = {
+                        "doc_id": doc_id,
+                        "doc": doc,
+                        "target": target,
+                        "arguments": [req.args for req in requests],
+                        "resps": [req.resps for req in requests],
+                        "filtered_resps": [req.filtered_resps[key] for req in requests],
+                    }
+                    example.update(metrics)
+                    samples[task_name].append(example)
                 for metric, value in metrics.items():
                     vals[(task_name, key, metric)].append(value)
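For reference, a hedged sketch of one per-sample record assembled above when log_samples is enabled (values are illustrative, not from a real run; for a loglikelihood request each raw response is a (logprob, is_greedy) pair):

    example = {
        "doc_id": 0,
        "doc": {"question": "...", "final_decision": "yes"},
        "target": " yes",
        "arguments": [("Question: ...\nAnswer:", " yes")],  # one args tuple per request
        "resps": [[(-1.23, True)]],                         # raw model responses
        "filtered_resps": [(-1.23, True)],                  # responses after the filter pipeline
    }
    example.update({"acc": 1.0})  # metric values are merged into the record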
@@ -359,12 +373,15 @@ def evaluate(
                 if stderr is not None:
                     results[task_name][metric + "_stderr" + "," + key] = stderr(items)

-        return {
+        results_dict = {
             "results": dict(results),
             "configs": dict(configs),
             "versions": dict(versions),
-            "samples": samples,
         }
+        if log_samples:
+            results_dict["samples"] = dict(samples)
+
+        return results_dict
     else:
         return None
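The net effect on the return value, sketched under the assumption of a rank-0 process with lm and task_dict already built:

    results = evaluate(lm, task_dict, log_samples=True)
    sorted(results.keys())   # ['configs', 'results', 'samples', 'versions']

    results = evaluate(lm, task_dict, log_samples=False)
    "samples" in results     # False: per-sample records are simply omitted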
lm_eval/models/huggingface.py

@@ -70,6 +70,7 @@ class HFLM(LM):
         batch_size: Optional[int] = 1,
         low_cpu_mem_usage: Optional[bool] = True,
         trust_remote_code: Optional[bool] = False,
+        use_fast_tokenizer: Optional[bool] = True,
         # arguments used for splitting a model across GPUs naively.
         # only used if `parallelize=True`.
         parallelize: Optional[bool] = False,

@@ -216,6 +217,7 @@ class HFLM(LM):
             pretrained if tokenizer is None else tokenizer,
             revision=revision,
             trust_remote_code=trust_remote_code,
+            use_fast=use_fast_tokenizer,
         )
         self.vocab_size = self.tokenizer.vocab_size
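A hedged sketch of the new knob (the checkpoint is a placeholder): use_fast_tokenizer=False makes HFLM request the slow, Python-based tokenizer from transformers instead of the Rust-backed fast one.

    from lm_eval.models.huggingface import HFLM

    lm = HFLM(pretrained="gpt2", use_fast_tokenizer=False)  # forwarded as use_fast=False to AutoTokenizer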
lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py

@@ -4,13 +4,11 @@ def doc_to_text(doc):
         ctxs, doc["question"], doc["final_decision"]
     )

-def doc_to_target(doc):
-    return " {}".format(doc["final_decision"])
-
 def gold_alias(doc):
-    dict_to_label = {'yes': 0, 'no': 1, 'maybe': 2}
-    return dict_to_label[doc["final_decision"]]
\ No newline at end of file
+    dict_to_label = {"yes": 0, "no": 1, "maybe": 2}
+    return dict_to_label[doc["final_decision"]]
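A toy illustration with a fabricated document: gold_alias maps the textual PubMedQA label onto the index used for accuracy scoring.

    gold_alias({"final_decision": "maybe"})  # -> 2  ("yes" -> 0, "no" -> 1, "maybe" -> 2)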
lm_eval/tasks/pubmedqa/pubmedqa.yaml

@@ -14,4 +14,4 @@ gold_alias: !function preprocess_pubmedqa.gold_alias
 metric_list:
   - metric: acc
     aggregation: mean
-    higher_is_better: true
\ No newline at end of file
+    higher_is_better: true
lm_eval/tasks/race/preprocess_race.py

 import ast


 def process_ast(string):
     return ast.literal_eval(string)


 def last_problem(doc):
     return process_ast(doc["problems"])[-1]


 def get_answer_option(problem):
     letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3}
     answer = letter_to_num[problem["answer"]]
     return problem["options"][answer]


 def create_choices(doc):
     problem = last_problem(doc)
     choices = [problem["options"][i] for i in range(4)]
     return choices


 def doc_to_text(doc):
     text = "Article: " + doc["article"] + "\n\n"
     for problem in process_ast(doc["problems"])[:-1]:
         if problem["question"][-6:] == "  _  .":
-            text += (
-                problem["question"][-5:] + get_answer_option(problem) + "\n"
-            )
+            text += problem["question"][-5:] + get_answer_option(problem) + "\n"
         else:
             question = "Question: " + problem["question"] + "\n"
             answer = "Answer: " + get_answer_option(problem) + "\n"

@@ -30,6 +33,7 @@ def doc_to_text(doc):
     text += last_problem(doc)["question"]
     return text

+
 def doc_to_target(doc):
     letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3}
     answer = letter_to_num[last_problem(doc)["answer"]]
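A toy walk-through with a fabricated document (not from RACE), assuming the elided else-branch appends question + answer to text: earlier problems become in-context Q/A pairs and the final problem's question is left for the model.

    doc = {
        "article": "The cat sat on the mat.",
        "problems": str([
            {"question": "What did the cat sit on?",
             "options": ["the mat", "a bed", "a sofa", "a chair"], "answer": "A"},
            {"question": "Where was the cat?",
             "options": ["on the mat", "outside", "upstairs", "in a box"], "answer": "A"},
        ]),
    }
    print(doc_to_text(doc))
    # Article: The cat sat on the mat.
    #
    # Question: What did the cat sit on?
    # Answer: the mat
    # Where was the cat?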
lm_eval/tasks/swag/swag.yaml

@@ -17,4 +17,4 @@ metric_list:
     higher_is_better: true
   - metric: acc_norm
     aggregation: mean
-    higher_is_better: true
\ No newline at end of file
+    higher_is_better: true
lm_eval/tasks/winogrande/preprocess_winogrande.py

@@ -4,11 +4,13 @@ def partial_context(doc, option):
     pronoun_loc = doc["sentence"].index("_")
     return doc["sentence"][:pronoun_loc] + option


 def partial_target(doc):
+    # The target is everything after the document specified pronoun.
     pronoun_loc = doc["sentence"].index("_") + 1
     return doc["sentence"][pronoun_loc:].strip()


 def create_choices(doc):
     choices = []
     for option in [doc["option1"], doc["option2"]]:

@@ -16,6 +18,7 @@ def create_choices(doc):
         choices.append(partial_ctx)
     return choices


 def gold_alias(doc):
     answer_to_num = {"1": 0, "2": 1}
-    return answer_to_num[doc['answer']]
\ No newline at end of file
+    return answer_to_num[doc["answer"]]
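A toy illustration with a fabricated document of how these helpers split a Winogrande sentence around the "_" placeholder:

    doc = {
        "sentence": "The trophy didn't fit in the suitcase because _ was too big.",
        "option1": "the trophy",
        "option2": "the suitcase",
        "answer": "1",
    }
    partial_context(doc, doc["option1"])  # -> "The trophy didn't fit in the suitcase because the trophy"
    partial_target(doc)                   # -> "was too big."
    gold_alias(doc)                       # -> 0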
main.py

@@ -43,6 +43,7 @@ def parse_args():
     parser.add_argument("--decontamination_ngrams_path", default=None)
     parser.add_argument("--check_integrity", action="store_true")
     parser.add_argument("--write_out", action="store_true", default=False)
+    parser.add_argument("--log_samples", action="store_true", default=True)
     return parser.parse_args()

@@ -89,10 +90,12 @@ def main():
         decontamination_ngrams_path=args.decontamination_ngrams_path,
         check_integrity=args.check_integrity,
         write_out=args.write_out,
+        log_samples=args.log_samples,
     )

     if results is not None:
-        samples = results.pop("samples")
+        if args.log_samples:
+            samples = results.pop("samples")
         dumped = json.dumps(results, indent=2, default=lambda o: str(o))
         print(dumped)

@@ -104,19 +107,20 @@ def main():
         with open(args.output_path, "w") as f:
             f.write(dumped)

-        for task_name, config in results["configs"].items():
-            output_name = "{}_{}".format(re.sub("/", "__", args.model_args), task_name)
-            if os.path.isdir(args.output_path):
-                filename = f"./{args.output_path}/{output_name}.jsonl"
-            elif os.path.isfile(args.output_path):
-                filename = (
-                    f"./{os.path.dirname(args.output_path)}/{output_name}.jsonl"
-                )
-            with jsonlines.open(filename, "w") as f:
-                f.write_all(samples[task_name])
+        if args.log_samples:
+            for task_name, config in results["configs"].items():
+                output_name = "{}_{}".format(re.sub("/", "__", args.model_args), task_name)
+                if os.path.isdir(args.output_path):
+                    filename = f"./{args.output_path}/{output_name}.jsonl"
+                elif os.path.isfile(args.output_path):
+                    filename = (
+                        f"./{os.path.dirname(args.output_path)}/{output_name}.jsonl"
+                    )
+                with jsonlines.open(filename, "w") as f:
+                    f.write_all(samples[task_name])

     print(
         f"{args.model} ({args.model_args}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
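Downstream, the per-task .jsonl files written above can be re-read for post-hoc analysis; a minimal sketch (the path follows the output_name pattern and is a placeholder, and the record fields are those assembled in evaluator.py above):

    import jsonlines

    with jsonlines.open("./results/pretrained=gpt2_pubmedqa.jsonl") as reader:
        for sample in reader:
            print(sample["doc_id"], sample["target"], sample["acc"])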