Commit 6e3ef5ff authored by Benjamin Fattori's avatar Benjamin Fattori

Merge remote-tracking branch 'upstream/big-refactor' into refactor-more-tasks

parents 026d2c21 070b6b9c
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "best_option"
use_prompt: "promptsource:best_option"
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "cause_effect"
use_prompt: "promptsource:cause_effect"
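Each of these task configs pulls shared defaults from promptsource-00.yaml and overrides only the task name and prompt, as sketched below. This is a hedged sketch, not the harness's actual loader; the path resolution and shallow-merge behavior are assumptions.

import yaml

def load_task_config(path):
    with open(path) as f:
        cfg = yaml.safe_load(f)
    base_path = cfg.pop("include", None)
    if base_path:
        merged = load_task_config(base_path)  # assumed: resolved relative to cwd
        merged.update(cfg)                    # task-level keys override the base
        return merged
    return cfg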
 group:
 - super-glue-t5-prompt
-task: t5-prompt
+task: super_glue-copa-t5-prompt
 reference: "From Raffel et al., 2019"
 dataset_path: super_glue
 dataset_name: copa
 training_split: train
...
def convert_choice(choice):
    # Lowercase the first character so the choice reads as a mid-sentence clause
    return choice[0].lower() + choice[1:]


def doc_to_text(doc):
    connector = {
        "cause": "because",
        "effect": "therefore",
    }[doc["question"]]
    # Drop the premise's trailing period, then append the connective
    return doc["premise"].strip()[:-1] + f" {connector}"


def doc_to_target(doc):
    correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
    # Connect the sentences with a leading space
    return " " + convert_choice(correct_choice)
 group:
 - super-glue-t5-prompt
-task: t5-prompt
+task: super_glue-record-t5-prompt
 reference: "From Raffel et al., 2019"
 dataset_path: super_glue
 dataset_name: record
 training_split: train
...
group:
- super-glue-lm-eval-v1
task: "wic"
dataset_path: super_glue
dataset_name: wic
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
gold_alias: "{{label}}" # this will be cast to an int.
template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
metric_list:
- metric: acc
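A hedged illustration of how the `template_aliases` and `gold_alias` fields above could resolve for a document with label == 1. This assumes plain Jinja2 rendering; the harness's actual alias resolution may differ.

from jinja2 import Environment

doc = {"label": 1}
template_aliases = "{% set answer_choices = ['no', 'yes'] %}"
rendered = Environment().from_string(template_aliases + "{{label}}").render(**doc)
gold = int(rendered)  # per the YAML comment, the alias is cast to an int
print(gold)  # 1 -> answer_choices[1] == "yes"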
group:
- super-glue-promptsource
task: "GPT-3-prompt"
dataset_path: super_glue
dataset_name: wic
training_split: train
validation_split: validation
use_prompt: "promptsource:GPT-3-prompt"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "GPT-3-prompt-with-label"
use_prompt: "promptsource:GPT-3-prompt-with-label"
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "affirmation_true_or_false"
use_prompt: "promptsource:affirmation_true_or_false"
def doc_to_text(doc):
    # The target word is recovered from sentence1 via its character span
    return (
        "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the"
        " two sentences above?\nAnswer:".format(
            doc["sentence1"],
            doc["sentence2"],
            doc["sentence1"][doc["start1"] : doc["end1"]],
        )
    )


def doc_to_target(doc):
    return " {}".format({0: "no", 1: "yes"}[doc["label"]])
 group:
 - super-glue-t5-prompt
-task: t5-prompt
+task: super_glue-wsc-t5-prompt
 reference: "From Raffel et al., 2019"
 dataset_path: super_glue
 dataset_name: wsc
 training_split: train
...
@@ -10,7 +10,7 @@ import collections
 import importlib.util
 import fnmatch
-from typing import List, Union
+from typing import List, Literal, Union
 import gc
 import torch
@@ -23,15 +23,6 @@ from itertools import islice
 from lm_eval.logger import eval_logger
-class ExitCodeError(Exception):
-    pass
-
-
-def sh(x):
-    if os.system(x):
-        raise ExitCodeError()
 def escaped_split(text, sep_char, maxsplit=-1):
     """Split text into a list on occurrences of the given separation
     character `sep_char`. The separation character may be escaped by a
@@ -181,26 +172,6 @@ def make_disjoint_window(pair):
     return a[: len(a) - (len(b) - 1)], b
-def select_continuation_from_batch_left_padding(
-    generations: Union[List[List[int]], torch.Tensor], max_context_size: int
-):
-    """Select the continuation from the batch, removing prompts of different lengths.
-
-    Args:
-        generations (Union[List[List[int]], torch.Tensor]):
-            A tensor or list-of-lists of shape [batch_size, sequence length].
-        max_context_size (int):
-            The size of the biggest context; generations will proceed from that
-            index.
-
-    Example:
-        PAD PAD Continue : The dog chased the cat [every day of the week]
-        Riddle me this   : The dog chased the cat [yesterday] PAD PAD PAD PAD
-
-    Output:
-        [every day of the week]
-        [yesterday] PAD PAD PAD PAD
-    """
-    return generations[:, max_context_size:]
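For reference, the removed helper relied on left-padding to make every continuation start at the same column, so a single slice recovers all of them. A quick illustration with invented token ids:

import torch

# Two left-padded contexts of length 4 (0 = PAD), followed by continuations.
gens = torch.tensor([
    [0, 0, 7, 8, 21, 22, 23],
    [5, 6, 7, 8, 31, 32, 0],
])
print(gens[:, 4:])  # -> tensor([[21, 22, 23], [31, 32,  0]])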
 class Reorderer:
     def __init__(self, arr, fn):
         self.size = len(arr)
@@ -398,7 +369,8 @@ def get_git_commit_hash():
     try:
         git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
         git_hash = git_hash.decode()
-    except subprocess.CalledProcessError:
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        # FileNotFoundError occurs when git is not installed on the system
         git_hash = None
     return git_hash
@@ -481,7 +453,11 @@ def create_iterator(raw_iterator, rank, world_size, limit=None):
     return islice(raw_iterator, rank, limit, world_size)
-def pad_and_concat(max_length: int, tensors: List[torch.Tensor], padding_side="right"):
+def pad_and_concat(
+    max_length: int,
+    tensors: List[torch.Tensor],
+    padding_side: Literal["right", "left"] = "right",
+):
     """
     Method for padding a list of tensors given the maximum tensor
     length in the batch. Used for batching inputs and continuations in
...
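The body of pad_and_concat is truncated above. As a hedged sketch under assumed behavior (pad each 1-D tensor to max_length on the chosen side, then stack), the Literal-typed padding_side could work like this; pad_and_concat_sketch is a hypothetical stand-in, not the harness's implementation.

import torch
import torch.nn.functional as F
from typing import List, Literal

def pad_and_concat_sketch(
    max_length: int,
    tensors: List[torch.Tensor],
    padding_side: Literal["right", "left"] = "right",
) -> torch.Tensor:
    padded = []
    for t in tensors:
        pad_len = max_length - t.size(0)
        # F.pad takes (left, right) amounts for a 1-D tensor
        pad = (0, pad_len) if padding_side == "right" else (pad_len, 0)
        padded.append(F.pad(t, pad, value=0))
    return torch.stack(padded)

print(pad_and_concat_sketch(4, [torch.tensor([1, 2]), torch.tensor([3, 4, 5])], "left"))
# tensor([[0, 0, 1, 2],
#         [0, 3, 4, 5]])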
@@ -55,7 +55,7 @@ setuptools.setup(
         "promptsource": [
             "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
         ],
-        "auto-gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
+        "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
         "anthropic": ["anthropic"],
     },
 )
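With the extra renamed from auto-gptq to gptq, the Triton build of AutoGPTQ is now pulled in via the gptq extra, e.g. pip install -e ".[gptq]" from a local checkout (the editable-install invocation is an assumption; any standard extras syntax applies).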
@@ -6,14 +6,18 @@ import lm_eval.models
 def test_description():
     seed = 42
     num_examples = 1
-    task_names = ["hellaswag", "winogrande"]
+    task_names = ["arc_challenge", "lambada"]
     description_dict = {
-        "hellaswag": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
-        "winogrande": "Winograd schema sentence including either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
+        "arc_challenge": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
+        "lambada": "Winograd schema sentence including either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
     }
     task_dict = lm_eval.tasks.get_task_dict(task_names)
     for task_name, task in task_dict.items():
+        # patch the description field on each task (TODO: clean this up)
+        task._config.description = description_dict[task_name]
         rnd = random.Random()
         rnd.seed(seed)
...
 import os
-import lm_eval.base as base
+# import lm_eval.base as base
+import lm_eval.api.registry as registry
 import lm_eval.tasks as tasks
-import lm_eval.models as models
+# import lm_eval.models as models
 import lm_eval.evaluator as evaluator
 import random
 import pytest
@@ -15,8 +19,10 @@ import pytest
 def test_evaluator(taskname, task_class):
     task_dict = tasks.get_task_dict([taskname])
-    os.system("rm test_cache.db")
-    lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
+    # TODO: re-add CachingLM
+    # os.system("rm test_cache.db")
+    # lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
+    lm = registry.get_model("dummy")()
     def ll_fn(reqs):
         for ctx, cont in reqs:
...
 import pytest
-import lm_eval.metrics as metrics
+import lm_eval.api.metrics as metrics
 import random
...
 import lm_eval.tasks as tasks
-import lm_eval.base as base
 import pytest
 from itertools import islice
@@ -100,5 +100,5 @@ def test_documents_and_requests(taskname, task_class):
         reqs = [reqs]
     # todo: mock lm after refactoring evaluator.py to not be a mess
-    for req in reqs:
-        assert isinstance(req, base.Request)
+    # for req in reqs:
+    #     assert isinstance(req, base.Request)