Commit 66bb89e5 authored by FarzanehNakhaee's avatar FarzanehNakhaee

Merge branch 'big-refactor' into add-prost-config

parents e8bb77db 070b6b9c
group:
  - super-glue-t5-prompt
task: t5-prompt
reference: "From Raffel et al. 2019"
task: super_glue-copa-t5-prompt
dataset_path: super_glue
dataset_name: copa
training_split: train
......
def convert_choice(choice):
    return choice[0].lower() + choice[1:]


def doc_to_text(doc):
    # Drop the period
    connector = {
        "cause": "because",
        "effect": "therefore",
    }[doc["question"]]
    return doc["premise"].strip()[:-1] + f" {connector}"


def doc_to_target(doc):
    correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
    # Connect the sentences
    return " " + convert_choice(correct_choice)
group:
  - super-glue-t5-prompt
task: t5-prompt
reference: "From Raffel et al. 2019"
task: super_glue-record-t5-prompt
dataset_path: super_glue
dataset_name: record
training_split: train
......
group:
  - super-glue-lm-eval-v1
task: "wic"
dataset_path: super_glue
dataset_name: wic
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
gold_alias: "{{label}}" # this will be cast to an int.
template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
metric_list:
  - metric: acc
group:
  - super-glue-promptsource
task: "GPT-3-prompt"
dataset_path: super_glue
dataset_name: wic
training_split: train
validation_split: validation
use_prompt: "promptsource:GPT-3-prompt"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
include: promptsource-00.yaml
group:
  - super-glue-promptsource
task: "GPT-3-prompt-with-label"
use_prompt: "promptsource:GPT-3-prompt-with-label"
include: promptsource-00.yaml
group:
  - super-glue-promptsource
task: "affirmation_true_or_false"
use_prompt: "promptsource:affirmation_true_or_false"
def doc_to_text(doc):
    return (
        "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the"
        " two sentences above?\nAnswer:".format(
            doc["sentence1"],
            doc["sentence2"],
            doc["sentence1"][doc["start1"] : doc["end1"]],
        )
    )


def doc_to_target(doc):
    return " {}".format({0: "no", 1: "yes"}[doc["label"]])
group:
  - super-glue-t5-prompt
task: t5-prompt
reference: "From Raffel et al. 2019"
task: super_glue-wsc-t5-prompt
dataset_path: super_glue
dataset_name: wsc
training_split: train
......
......@@ -10,10 +10,11 @@ import collections
import importlib.util
import fnmatch
from typing import List, Union
from typing import List, Literal, Union
import gc
import torch
import transformers
from omegaconf import OmegaConf
from jinja2 import BaseLoader, Environment, StrictUndefined
......@@ -22,15 +23,6 @@ from itertools import islice
from lm_eval.logger import eval_logger
class ExitCodeError(Exception):
    pass


def sh(x):
    if os.system(x):
        raise ExitCodeError()


def escaped_split(text, sep_char, maxsplit=-1):
    """Split text into a list on occurrences of the given separation
    character `sep_char`. The separation character may be escaped by a
......@@ -180,26 +172,6 @@ def make_disjoint_window(pair):
    return a[: len(a) - (len(b) - 1)], b
def select_continuation_from_batch_left_padding(
    generations: Union[List[List[int]], torch.Tensor], max_context_size: int
):
    """Select the continuation from the batch, removing prompts of different lengths.

    Args:
        generations (Union[List[List[int]], torch.Tensor]):
            A tensor or list-of-lists of shape [batch_size, sequence length].
        max_context_size (int):
            The size of the biggest context; generations will proceed from that
            index.

    Example:
        PAD PAD Continue : The dog chased the cat [every day of the week]
        Riddle me this   : The dog chased the cat [yesterday] PAD PAD PAD PAD

    Output:
        [every day of the week]
        [yesterday] PAD PAD PAD PAD
    """
    return generations[:, max_context_size:]
class Reorderer:
    def __init__(self, arr, fn):
        self.size = len(arr)
......@@ -229,6 +201,64 @@ class Reorderer:
        return res
class Grouper:
    """
    takes an array `arr` and function `fn` and returns a dictionary
    with keys fn(ob) for each ob in `arr` and with values `self.arr[key]` a list of all
    objects in `arr` satisfying `key == fn(ob)`.
    """

    def __init__(self, arr, fn):
        # self.orig_arr = arr
        self.size = len(arr)
        arr = list(enumerate(arr))

        def group_return_dict(arr, fn):
            res = collections.defaultdict(list)

            for ob in arr:
                res[fn(ob)].append(ob)
            return res

        arr = group_return_dict(arr, lambda x: fn(x[1]))

        # self.arr has format Dict[Tuple[int, <entry from orig. arr>]]
        self.arr = arr
        self._grouped = None

    def get_grouped(self):
        # return the contents but not indices for our grouped dict.
        if self._grouped:
            return self._grouped
        grouped = {}
        for key in self.arr.keys():
            # drop the index from each element of self.arr
            grouped[key] = [y[1] for y in self.arr[key]]
        self._grouped = grouped
        return grouped

    def get_original(self, grouped_dict):
        # take in a grouped dictionary with e.g. results for each key listed
        # in the same order as the instances in `self.arr`, and
        # return the results in the same (single list) order as `self.orig_arr`.
        res = [None] * self.size
        cov = [False] * self.size
        # orig = [None] * self.size

        assert grouped_dict.keys() == self.arr.keys()

        for key in grouped_dict.keys():
            for (ind, _), v in zip(self.arr[key], grouped_dict[key]):
                res[ind] = v
                cov[ind] = True
                # orig[ind] = _

        assert all(cov)
        # assert orig == self.orig_arr

        return res
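A minimal usage sketch for Grouper (example data invented): items are grouped by a key function, processed per group, and the per-group results are restored to the original ordering.

grouper = Grouper([3, 1, 4, 1, 5, 9, 2, 6], fn=lambda x: x % 2)
grouped = grouper.get_grouped()          # {1: [3, 1, 1, 5, 9], 0: [4, 2, 6]}
doubled = {k: [v * 2 for v in vs] for k, vs in grouped.items()}
print(grouper.get_original(doubled))     # [6, 2, 8, 2, 10, 18, 4, 12]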
def make_table(result_dict):
    """Generate table of results."""
    from pytablewriter import MarkdownTableWriter, LatexTableWriter
......@@ -339,7 +369,8 @@ def get_git_commit_hash():
    try:
        git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
        git_hash = git_hash.decode()
    except subprocess.CalledProcessError:
    except (subprocess.CalledProcessError, FileNotFoundError):
        # FileNotFoundError occurs when git is not installed on the system
        git_hash = None
    return git_hash
......@@ -399,7 +430,13 @@ def load_yaml_config(yaml_path):
    return yaml_config


def regex_replace(string, pattern, repl, count=0):
    """Implements the `re.sub` function as a custom Jinja filter."""
    return re.sub(pattern, repl, string, count=count)


env = Environment(loader=BaseLoader, undefined=StrictUndefined)
env.filters["regex_replace"] = regex_replace
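A minimal sketch of how the registered filter can be used from a Jinja template (the template string and field name are invented for illustration):

template = env.from_string("{{ premise | regex_replace('\\.$', '') }} because")
print(template.render(premise="The dog chased the cat."))
# -> "The dog chased the cat because"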
def apply_template(template, doc):
......@@ -416,6 +453,116 @@ def create_iterator(raw_iterator, rank, world_size, limit=None):
    return islice(raw_iterator, rank, limit, world_size)
def pad_and_concat(
    max_length: int,
    tensors: List[torch.Tensor],
    padding_side: Literal["right", "left"] = "right",
):
    """
    Method for padding a list of tensors given the maximum tensor
    length in the batch. Used for batching inputs and continuations in
    seq2seq models.
    """
    assert (
        padding_side == "left" or padding_side == "right"
    ), f"Unrecognized padding type: '{padding_side}' not 'left' or 'right'"

    for i, tensor in enumerate(tensors):
        tensor = tensor.squeeze(0)  # squeeze, in case passed [1, seq] size
        tensor_len = tensor.shape[0]
        if tensor_len < max_length:
            if padding_side == "right":
                # right-pad
                tensors[i] = torch.cat(
                    [
                        tensor,  # [seq]
                        torch.zeros(
                            max_length - tensor_len,
                            dtype=torch.long,
                            device=tensor.device,
                        ),  # [padding_length - seq]
                    ],
                    dim=0,
                ).unsqueeze(0)
            else:
                # left-pad
                tensors[i] = torch.cat(
                    [
                        torch.zeros(
                            max_length - tensor_len,
                            dtype=torch.long,
                            device=tensor.device,
                        ),  # [padding_length - seq]
                        tensor,  # [seq]
                    ],
                    dim=0,
                ).unsqueeze(0)
        else:
            tensors[i] = tensor.unsqueeze(0)

    return torch.cat(tensors, dim=0)
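A minimal usage sketch (tensors invented): two token-id sequences of different lengths are padded to the batch maximum and stacked into a single [batch, max_length] tensor.

a = torch.tensor([1, 2, 3], dtype=torch.long)
b = torch.tensor([4, 5], dtype=torch.long)
batch = pad_and_concat(3, [a, b], padding_side="left")
print(batch)  # tensor([[1, 2, 3],
              #         [0, 4, 5]])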
def clear_torch_cache():
    gc.collect()
    torch.cuda.empty_cache()


def get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
    if isinstance(dtype, str) and dtype != "auto":
        # Convert `str` args torch dtype: `float16` -> `torch.float16`
        _torch_dtype = getattr(torch, dtype)
    else:
        _torch_dtype = dtype
    return _torch_dtype
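A brief usage sketch for get_dtype: string names resolve through getattr(torch, ...), while "auto" is passed through unchanged for Hugging Face to handle.

assert get_dtype("bfloat16") is torch.bfloat16
assert get_dtype("auto") == "auto"
assert get_dtype(torch.float16) is torch.float16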
# Multi-token stopping criteria
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
    """Criteria to stop on the specified multi-token sequence."""

    def __init__(
        self,
        sequence: str,
        tokenizer: transformers.PreTrainedTokenizer,
        initial_decoder_input_length: int,
        batch_size: int,
    ):
        self.initial_decoder_input_length = initial_decoder_input_length
        self.done_tracker = [False] * batch_size
        self.sequence = sequence
        self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
        self.sequence_id_len = len(self.sequence_ids)
        self.tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
        lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :][
            :, -self.sequence_id_len :
        ]
        lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
        for i, done in enumerate(self.done_tracker):
            if not done:
                self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
        return False not in self.done_tracker


def stop_sequences_criteria(
    tokenizer: transformers.PreTrainedTokenizer,
    stop_sequences: List[str],
    initial_decoder_input_length: int,
    batch_size: int,
) -> transformers.StoppingCriteriaList:
    return transformers.StoppingCriteriaList(
        [
            *[
                MultiTokenEOSCriteria(
                    sequence, tokenizer, initial_decoder_input_length, batch_size
                )
                for sequence in stop_sequences
            ],
        ]
    )
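A hypothetical end-to-end sketch (the model and tokenizer choices are assumptions, not part of the diff) showing how these criteria plug into Hugging Face generation so decoding stops once a stop string appears after the prompt:

tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("Q: What is 2+2?\nA:", return_tensors="pt")
criteria = stop_sequences_criteria(
    tokenizer,
    stop_sequences=["\n\n"],
    initial_decoder_input_length=inputs["input_ids"].shape[1],
    batch_size=1,
)
out = model.generate(**inputs, stopping_criteria=criteria, max_new_tokens=32)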
......@@ -39,9 +39,8 @@ def parse_args():
"If <1, limit is a percentage of the total number of examples.",
)
parser.add_argument("--data_sampling", type=float, default=None)
parser.add_argument("--no_cache", action="store_true")
parser.add_argument("--use_cache", type=str, default=None)
parser.add_argument("--decontamination_ngrams_path", default=None)
parser.add_argument("--description_dict_path", default=None)
parser.add_argument("--check_integrity", action="store_true")
parser.add_argument("--write_out", action="store_true", default=False)
parser.add_argument("--output_base_path", type=str, default=None)
......@@ -78,12 +77,6 @@ def main():
    eval_logger.info(f"Selected Tasks: {task_names}")

    # TODO: description_dict?
    # description_dict = {}
    # if args.description_dict_path:
    #     with open(args.description_dict_path, "r") as f:
    #         description_dict = json.load(f)

    results = evaluator.simple_evaluate(
        model=args.model,
        model_args=args.model_args,
......@@ -92,9 +85,8 @@
        batch_size=args.batch_size,
        max_batch_size=args.max_batch_size,
        device=args.device,
        no_cache=args.no_cache,
        use_cache=args.use_cache,
        limit=args.limit,
        # description_dict=description_dict,
        decontamination_ngrams_path=args.decontamination_ngrams_path,
        check_integrity=args.check_integrity,
        write_out=args.write_out,
......@@ -103,8 +95,7 @@
    if results is not None:
        samples = results.pop("samples")
        dumped = json.dumps(results, indent=2)
        dumped = json.dumps(results, indent=2, default=lambda o: str(o))
        print(dumped)

        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
......
......@@ -13,12 +13,10 @@ def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_base_path", required=True)
    parser.add_argument("--tasks", default="all_tasks")
    parser.add_argument("--provide_description", action="store_true")
    parser.add_argument("--sets", type=str, default="val")  # example: val,test
    parser.add_argument("--num_fewshot", type=int, default=1)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--num_examples", type=int, default=1)
    parser.add_argument("--description_dict_path", default=None)
    return parser.parse_args()
......@@ -32,11 +30,6 @@ def main():
    task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    # description_dict = {}
    # if args.description_dict_path:
    #     with open(args.description_dict_path, "r") as f:
    #         description_dict = json.load(f)

    os.makedirs(args.output_base_path, exist_ok=True)
    for task_name, task in task_dict.items():
        rnd = random.Random()
......@@ -55,12 +48,6 @@ def main():
        docs = join_iters(iters)

        # description = (
        #     description_dict[task_name]
        #     if description_dict and task_name in description_dict
        #     else ""
        # )

        with open(os.path.join(args.output_base_path, task_name), "w") as f:
            for i, doc in (
                zip(range(args.num_examples), docs)
......@@ -72,7 +59,6 @@ def main():
                    doc=doc,
                    num_fewshot=args.num_fewshot,
                    rnd=rnd,
                    # description=description,
                )
                f.write(ctx + "\n")
......@@ -28,7 +28,9 @@ setuptools.setup(
    python_requires=">=3.9",
    install_requires=[
        "accelerate>=0.18.0",
        "evaluate",
        "datasets>=2.0.0",
        "evaluate>=0.4.0",
        "jsonlines",
        "numexpr",
        "openai>=0.6.4",
......@@ -53,7 +55,7 @@ setuptools.setup(
        "promptsource": [
            "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
        ],
        "auto-gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
        "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
        "anthropic": ["anthropic"],
    },
)
......@@ -3,17 +3,21 @@ import lm_eval.tasks
import lm_eval.models


def test_description_dict():
def test_description():
    seed = 42
    num_examples = 1
    task_names = ["hellaswag", "winogrande"]
    task_names = ["arc_challenge", "lambada"]
    description_dict = {
        "hellaswag": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
        "winogrande": "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
        "arc_challenge": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
        "lambada": "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
    }

    task_dict = lm_eval.tasks.get_task_dict(task_names)

    for task_name, task in task_dict.items():
        # patch description field in task (# TODO: make this much more cleaned up)
        task._config.description = description_dict[task_name]

        rnd = random.Random()
        rnd.seed(seed)
......@@ -37,6 +41,5 @@ def test_description_dict():
                doc=doc,
                num_fewshot=1,
                rnd=rnd,
                description=description,
            )
            assert description in ctx
import os
import lm_eval.base as base
# import lm_eval.base as base
import lm_eval.api.registry as registry
import lm_eval.tasks as tasks
import lm_eval.models as models
# import lm_eval.models as models
import lm_eval.evaluator as evaluator
import random
import pytest
......@@ -15,8 +19,10 @@ import pytest
def test_evaluator(taskname, task_class):
    task_dict = tasks.get_task_dict([taskname])

    os.system("rm test_cache.db")
    lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
    # TODO: re-add cachingLM
    # os.system("rm test_cache.db")
    # lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
    lm = registry.get_model("dummy")()
    def ll_fn(reqs):
        for ctx, cont in reqs:
......@@ -55,7 +61,6 @@ def test_evaluator(taskname, task_class):
        num_fewshot=0,
        limit=limit,
        bootstrap_iters=10,
        description_dict=None,
    )
    e2 = evaluator.evaluate(
        lm=lm,
......@@ -63,7 +68,6 @@ def test_evaluator(taskname, task_class):
        num_fewshot=0,
        limit=limit,
        bootstrap_iters=10,
        description_dict=None,
    )

    # check that caching is working
......
import pytest
import lm_eval.metrics as metrics
import lm_eval.api.metrics as metrics
import random
......
import lm_eval.tasks as tasks
import lm_eval.base as base
import pytest
from itertools import islice
......@@ -100,5 +100,5 @@ def test_documents_and_requests(taskname, task_class):
        reqs = [reqs]

    # todo: mock lm after refactoring evaluator.py to not be a mess
    for req in reqs:
        assert isinstance(req, base.Request)
    # for req in reqs:
    #     assert isinstance(req, base.Request)