Unverified commit bd5b29eb authored by Lintang Sutawika, committed by GitHub

Merge pull request #609 from EleutherAI/misc-cleanup-refactor

[Refactor] Misc. cleanup of dead code
parents 46d3bead fbd712f7
@@ -281,7 +281,7 @@ class Task(abc.ABC):
else:
eval_logger.warning(
"has_training_docs and has_validation_docs are False"
", using test_docs but this is not recommended."
", using test_docs as fewshot_docs but this is not recommended."
)
return self.test_docs()
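For context, the warning above is the last branch of a fallback chain that decides which split supplies few-shot examples. A minimal sketch of that pattern, written as a free function over the Task API assumed from this hunk (has_training_docs, training_docs, and friends), not the harness's exact method:

def fewshot_docs(task, eval_logger):
    # Prefer the training split, then validation; only fall back to the
    # test split (with a warning) when neither is available.
    if task.has_training_docs():
        return task.training_docs()
    elif task.has_validation_docs():
        return task.validation_docs()
    else:
        eval_logger.warning(
            "has_training_docs and has_validation_docs are False"
            ", using test_docs as fewshot_docs but this is not recommended."
        )
        return task.test_docs()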
@@ -342,7 +342,8 @@ class Task(abc.ABC):
fewshot_ctx = self.fewshot_context(
doc, self._config.num_fewshot, rnd=random.Random()
)
# TODO: we should override this if doing greedy gen so users don't waste time+compute
# TODO: we should override self._config.repeats if doing greedy gen so users don't waste time+compute
inst = self.construct_requests(
doc=doc,
ctx=fewshot_ctx,
......
@@ -195,11 +195,6 @@ def evaluate(
versions[task_name] = task.VERSION
configs[task_name] = dict(task.dump_config())
# deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
# task_docs = list(task_doc_func())
# rnd = random.Random()
# rnd.seed(42)
# rnd.shuffle(task_docs)
if limit is not None:
if task.has_test_docs():
task_docs = task.test_docs()
@@ -257,13 +252,12 @@ def evaluate(
task.apply_filters()
### Collect values of metrics on all datapoints ###
# TODO: make metric configurable, add metric registry
vals = collections.defaultdict(list)
# unpack results and sort back in order and return control to Task
for task_name, task in task_dict.items():
# calculate values for each filter setup (TODO: make getting list of keys cleaner)
# TODO: make it possible to use a different metric per key
# TODO: make it possible to use a different metric per filter
# iterate over different filters used
for key in task.instances[0].filtered_resps.keys():
doc_iterator = (
itertools.islice(
......
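The hunk above iterates over each filter applied to a task's responses before scoring. A rough, self-contained sketch of that collection step, with the Instance and Task attribute names assumed rather than copied verbatim from the harness:

import collections

def collect_metric_values(task_dict):
    # Assumed API: task.instances holds model responses, each carrying
    # per-filter outputs in `filtered_resps` and a `doc_id` linking it
    # back to its source document.
    vals = collections.defaultdict(list)
    for task_name, task in task_dict.items():
        for filter_key in task.instances[0].filtered_resps.keys():
            for doc_id, doc in enumerate(task.test_docs()):
                resps = [
                    inst.filtered_resps[filter_key]
                    for inst in task.instances
                    if inst.doc_id == doc_id
                ]
                # one metric dict per document, e.g. {"acc": 1.0}
                for metric, value in task.process_results(doc, resps).items():
                    vals[(task_name, filter_key, metric)].append(value)
    return vals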
@@ -124,28 +124,6 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
get_task_name_from_object(task_element): task_element,
}
# task_name_from_registry_dict = {
# task_name: get_task(
# task_name=task_name,
# task_config=config
# )
# for group_name in task_name_list for task_name in GROUP_REGISTRY[group_name]
# if (isinstance(group_name, str)) and (group_name in GROUP_REGISTRY)
# }
# task_name_from_config_dict = {
# get_task_name_from_config(task_config): ConfigurableTask(
# config=task_config
# )
# for task_config in task_name_list
# if isinstance(task_config, dict)
# }
# # TODO: Do we still need this?
# task_name_from_object_dict = {
# get_task_name_from_object(task_object): task_object
# for task_object in task_name_list
# if isinstance(task_object, Task)
# }
assert set(task_name_from_registry_dict.keys()).isdisjoint(
set(task_name_from_object_dict.keys())
)
......
@@ -25,7 +25,7 @@ metric_list:
regexes_to_ignore:
- ","
- "\\$"
delimiter: "\n\n"
fewshot_delimiter: "\n\n"
generation_kwargs:
until:
- "Q:"
......
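For the config change above: fewshot_delimiter is the string placed between rendered few-shot examples, and the until strings act as stop sequences for generation. A toy illustration of both behaviors (the prompt text and model output below are made up, not from the harness):

fewshot_delimiter = "\n\n"
until = ["Q:"]

shots = ["Q: What is 2 + 2?\nA: 4", "Q: What is 3 + 5?\nA: 8"]
prompt = fewshot_delimiter.join(shots) + fewshot_delimiter + "Q: What is 7 + 6?\nA:"

raw = " 13\n\nQ: What is 9 + 1?\nA: 10"  # pretend model continuation
for stop in until:
    raw = raw.split(stop)[0]  # truncate at the first stop sequence
answer = raw.strip()          # -> "13"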
@@ -23,15 +23,6 @@ from itertools import islice
from lm_eval.logger import eval_logger
class ExitCodeError(Exception):
pass
def sh(x):
if os.system(x):
raise ExitCodeError()
def escaped_split(text, sep_char, maxsplit=-1):
"""Split text into a list on occurrences of the given separation
character `sep_char`. The separation character may be escaped by a
@@ -181,26 +172,6 @@ def make_disjoint_window(pair):
return a[: len(a) - (len(b) - 1)], b
def select_continuation_from_batch_left_padding(
generations: Union[List[List[int]], torch.Tensor], max_context_size: int
):
"""Select the continuation from the batch, removing prompts of different lengths.
Args:
generations (Union[List[List[int]], torch.Tensor]):
A tensor or list-of-lists of shape [batch_size, sequence length].
max_context_size (int):
The size of the biggest context; generations will proceed from that
index.
Example:
PAD PAD Continue : The dog chased the cat [every day of the week]
Riddle me this : The dog chased the cat [yesterday] PAD PAD PAD PAD
Output:
[every day of the week]
[yesterday] PAD PAD PAD PAD
"""
return generations[:, max_context_size:]
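The docstring of the helper removed above describes left-padded batches in which every continuation starts at the same column, so one slice recovers all continuations. A toy demonstration of that slice (token values are made up):

import torch

PAD = 0
max_context_size = 5  # width of the longest prompt in the batch
batch = torch.tensor([
    [PAD, PAD, 11, 12, 13, 21, 22, 23],  # short prompt, left-padded; continuation 21 22 23
    [14, 15, 16, 17, 18, 31, PAD, PAD],  # longest prompt; continuation 31, right-padded
])
continuations = batch[:, max_context_size:]
# tensor([[21, 22, 23],
#         [31,  0,  0]])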
class Reorderer:
def __init__(self, arr, fn):
self.size = len(arr)
@@ -396,9 +367,10 @@ def get_git_commit_hash():
Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
"""
try:
git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
git_hash = subprocess.check_output(["gt", "describe", "--always"]).strip()
git_hash = git_hash.decode()
except subprocess.CalledProcessError:
except (subprocess.CalledProcessError, FileNotFoundError):
# FileNotFoundError occurs when git not installed on system
git_hash = None
return git_hash
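A self-contained version of the lookup above; note that catching more than one exception type in Python requires a tuple in the except clause (a bare `A or B` would only ever catch `A`):

import subprocess

def get_git_commit_hash():
    try:
        git_hash = subprocess.check_output(["git", "describe", "--always"]).strip().decode()
    except (subprocess.CalledProcessError, FileNotFoundError):
        # CalledProcessError: not inside a git checkout; FileNotFoundError: git not installed
        git_hash = None
    return git_hash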
......
@@ -6,14 +6,18 @@ import lm_eval.models
def test_description():
seed = 42
num_examples = 1
task_names = ["hellaswag", "winogrande"]
task_names = ["arc_challenge", "lambada"]
description_dict = {
"hellaswag": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
"winogrande": "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
"arc_challenge": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
"lambada": "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
}
task_dict = lm_eval.tasks.get_task_dict(task_names)
for task_name, task in task_dict.items():
# patch description field in task (# TODO: make this much more cleaned up)
task._config.description = description_dict[task_name]
rnd = random.Random()
rnd.seed(seed)
......
import os
import lm_eval.base as base
# import lm_eval.base as base
import lm_eval.api.registry as registry
import lm_eval.tasks as tasks
import lm_eval.models as models
# import lm_eval.models as models
import lm_eval.evaluator as evaluator
import random
import pytest
@@ -15,8 +19,10 @@ import pytest
def test_evaluator(taskname, task_class):
task_dict = tasks.get_task_dict([taskname])
os.system("rm test_cache.db")
lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
# TODO: re-add cachingLM
# os.system("rm test_cache.db")
# lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
lm = registry.get_model("dummy")()
def ll_fn(reqs):
for ctx, cont in reqs:
......
import pytest
import lm_eval.metrics as metrics
import lm_eval.api.metrics as metrics
import random
......
import lm_eval.tasks as tasks
import lm_eval.base as base
import pytest
from itertools import islice
@@ -100,5 +100,5 @@ def test_documents_and_requests(taskname, task_class):
reqs = [reqs]
# todo: mock lm after refactoring evaluator.py to not be a mess
for req in reqs:
assert isinstance(req, base.Request)
# for req in reqs:
# assert isinstance(req, base.Request)