Merge pull request #501 from EleutherAI/update-config

Update config

Merge pull request #501 from EleutherAI/update-config
Update config
c5ed8cdc · Lintang Sutawika · GitHub · f6b76f5d · c17e3659 · c5ed8cdc
Unverified Commit c5ed8cdc authored May 20, 2023 by Lintang Sutawika Committed by GitHub May 20, 2023
20 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,5 @@ env
 data/
 lm_cache
 .idea
+
+*.egg-info/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -12,6 +12,7 @@ repos:
      - id: check-merge-conflict
      - id: check-symlinks
      - id: check-yaml
+        args: ['--unsafe']
      - id: destroyed-symlinks
      - id: detect-private-key
      - id: end-of-file-fixer

--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ python main.py \
    --device cuda:0
 ```

-Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partialy trained checkpoints:
+Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partially trained checkpoints:

 ```bash
 python main.py \
@@ -78,7 +78,7 @@ python main.py \
    --tasks lambada_openai,hellaswag
 ```

-While this functionality is only officially mantained for the official OpenAI API, it tends to also work for other hosting services that use the same API such as [goose.ai](goose.ai) with minor modification. We also have an implementation for the [TextSynth](https://textsynth.com/index.html) API, using `--model textsynth`.
+While this functionality is only officially maintained for the official OpenAI API, it tends to also work for other hosting services that use the same API such as [goose.ai](https://goose.ai) with minor modification. We also have an implementation for the [TextSynth](https://textsynth.com/index.html) API, using `--model textsynth`.

 To verify the data integrity of the tasks you're performing in addition to running the tasks themselves, you can use the `--check_integrity` flag:

@@ -116,7 +116,7 @@ When reporting eval harness results, please also report the version of each task

 ## Test Set Decontamination

-To address concerns about train / test contamination, we provide utilities for comparing results on a benchmark using only the data points nto found in the model trainign set. Unfortunately, outside of models trained on the Pile ans C4, its very rare that people who train models disclose the contents of the training data. However this utility can be useful to evaluate models you have trained on private data, provided you are willing to pre-compute the necessary indices. We provide computed indices for 13-gram exact match deduplication against the Pile, and plan to add additional precomputed dataset indices in the future (including C4 and min-hash LSH deduplication).
+To address concerns about train / test contamination, we provide utilities for comparing results on a benchmark using only the data points not found in the model training set. Unfortunately, outside of models trained on the Pile and C4, it is very rare that people who train models disclose the contents of the training data. However, this utility can be useful to evaluate models you have trained on private data, provided you are willing to pre-compute the necessary indices. We provide computed indices for 13-gram exact match deduplication against the Pile, and plan to add additional precomputed dataset indices in the future (including C4 and min-hash LSH deduplication).

 For details on text decontamination, see the [decontamination guide](./docs/decontamination.md).


--- a/lm_eval/api/__init__.py
+++ b/lm_eval/api/__init__.py
 from . import metrics
-
-METRIC_REGISTRY = {
-    "matthews_corrcoef": metrics.matthews_corrcoef,
-    "f1_score": metrics.f1_score,
-    "perplexity": metrics.perplexity,
-    "bleu": metrics.bleu,
-    "chrf": metrics.chrf,
-    "ter": metrics.ter,
-}
-
-AGGREGATION_REGISTRY = {
-    "mean": metrics.mean,
-    "median": metrics.median,
-    "perplexity": metrics.perplexity,
-}
-
-HIGHER_IS_BETTER_REGISTRY = {
-    "matthews_corrcoef": True,
-    "f1_score": True,
-    "perplexity": False,
-    "bleu": True,
-    "chrf": True,
-    "ter": False,
-
-    "acc": True,
-    "acc_norm": True,
-    "acc_mutual_info": True,
-    "word_perplexity": False,
-    "byte_perplexity": False,
-    "bits_per_byte": False,
-}
\ No newline at end of file
--- a/lm_eval/api/filter.py
+++ b/lm_eval/api/filter.py
@@ -3,6 +3,7 @@ from typing import List

 from lm_eval.api.instance import Instance

+
 class Filter:
    """
    Filter classes operate on a per-task level.
@@ -26,6 +27,7 @@ class Filter:
        """
        return resps

+
 @dataclass
 class FilterEnsemble:
    """
@@ -34,21 +36,23 @@ class FilterEnsemble:
    `task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each
    pipeline separately.
    """
+
    name: str
    filters: List[Filter]

    def apply(self, instances: List[Instance]):

-        resps = [inst.resps for inst in instances] # operate just on the model responses
+        resps = [
+            inst.resps for inst in instances
+        ]  # operate just on the model responses
        for f in self.filters:
            # apply filters in sequence
            out = f.apply(resps)
-            resps = out # TODO: handle the case where a filter returns multiple "buckets"
+            resps = (
+                out  # TODO: handle the case where a filter returns multiple "buckets"
+            )

        # add the end results after filtering to filtered_requests of their respective source instances.
        # has key `self.name`: each FilterEnsemble applied in a given run should use a different name.
        for inst, resp in zip(instances, resps):
            inst.filtered_resps[self.name] = resp
-
-            
-
--- a/lm_eval/api/instance.py
+++ b/lm_eval/api/instance.py
 from dataclasses import dataclass, field
 from typing import Literal, Tuple

+
 @dataclass
 class Instance:
-    request_type: str = Literal["loglikelihood", "loglikelihood_rolling", "greedy_until"]
+    request_type: str = Literal[
+        "loglikelihood", "loglikelihood_rolling", "greedy_until"
+    ]
    doc: dict = None
    arguments: tuple = None
    idx: int = None
@@ -25,4 +28,6 @@ class Instance:
        """
        Returns (string,) where `string` is the string to calculate loglikelihood over
        """
-        return self.arguments if isinstance(self.arguments, tuple) else (self.arguments,)
+        return (
+            self.arguments if isinstance(self.arguments, tuple) else (self.arguments,)
+        )
--- a/lm_eval/api/metrics.py
+++ b/lm_eval/api/metrics.py
@@ -10,6 +10,7 @@ import evaluate


 AGGREGATION_REGISTRY = {}
+
 METRIC_REGISTRY = {
    "acc": None,
    "acc_norm": None,
@@ -18,6 +19,21 @@ METRIC_REGISTRY = {
    "byte_perplexity": None,
 }

+HIGHER_IS_BETTER_REGISTRY = {
+    "matthews_corrcoef": True,
+    "f1_score": True,
+    "perplexity": False,
+    "bleu": True,
+    "chrf": True,
+    "ter": False,
+    "acc": True,
+    "acc_norm": True,
+    "acc_mutual_info": True,
+    "word_perplexity": False,
+    "byte_perplexity": False,
+    "bits_per_byte": False,
+}
+

 def register_metric(name):
    # TODO: do we want to enforce a certain interface to registered metrics?
@@ -38,12 +54,14 @@ def get_metric(name):
        return METRIC_REGISTRY[name]
    except KeyError:
        # TODO: change this print to logging?
-        print(f"Could not find registered metric '{name}' in lm-eval, \
-searching in HF Evaluate library...")
+        print(
+            f"Could not find registered metric '{name}' in lm-eval, \
+searching in HF Evaluate library..."
+        )
        try:
            metric_object = evaluate.load(name)
            return metric_object.compute
-        except:
+        except Exception:
            raise Warning(
                "{} not found in the evaluate library!".format(name),
                "Please check https://huggingface.co/evaluate-metric",

--- a/lm_eval/api/model.py
+++ b/lm_eval/api/model.py
@@ -6,14 +6,15 @@ from lm_eval import utils

 MODEL_REGISTRY = {}

+
 def register_model(*names):
    # either pass a list or a single alias.
    # function receives them as a tuple of strings

    def decorate(cls):
        for name in names:
-            assert (
-                issubclass(cls, LM)
+            assert issubclass(
+                cls, LM
            ), f"Model '{name}' ({cls.__name__}) must extend LM class"

            assert (

--- a/lm_eval/api/register.py
+++ b/lm_eval/api/register.py
+import os
+
+task_registry = {}
+group_registry = {}
+task2func_index = {}
+func2task_index = {}
+
+
+def register_task(name):
+    # Decorator factory: registers the decorated callable as a task under `name`.
+    #
+    # The callable is stored in `task_registry[name]`, and the two-way mapping
+    # between `name` and the callable's `__name__` is recorded in
+    # `task2func_index` / `func2task_index`. The callable is returned unchanged.
+    #
+    # Entries are plain dict assignments, so registering again under the same
+    # `name` (or with a callable of the same `__name__`) overwrites the earlier
+    # entry without warning.
+    def wrapper(func):
+
+        task_registry[name] = func
+        # keyed by the callable's __name__ so register_group can find the task name
+        func2task_index[func.__name__] = name
+        task2func_index[name] = func.__name__
+        return func
+
+    return wrapper
+
+
+def register_group(name):
+    # Decorator factory: adds the decorated callable's task name to group `name`.
+    #
+    # The task name is looked up in `func2task_index` by the callable's
+    # `__name__`, so the callable must already have been passed through
+    # `register_task`. Decorators are applied bottom-up, so `@register_group`
+    # has to sit above `@register_task`; otherwise the lookup raises KeyError.
+    #
+    # The callable is returned unchanged. No de-duplication is done: applying
+    # the same group to the same task twice appends its name twice.
+    def wrapper(func):
+
+        # despite the variable name, this is the registered task name, not func.__name__
+        func_name = func2task_index[func.__name__]
+
+        if name in group_registry:
+            group_registry[name].append(func_name)
+        else:
+            # first member of this group: start a new list
+            group_registry[name] = [func_name]
+        return func
+
+    return wrapper
--- a/lm_eval/api/samplers.py
+++ b/lm_eval/api/samplers.py
-
-
 class Sampler:
-
    def __init__(self, docs, task, fewshot_indices=None, rnd=None):

        self.rnd = rnd
@@ -16,11 +13,14 @@ class Sampler:
        if fewshot_indices:  # subset few-shot docs from
            self.docs = self.docs.select(fewshot_indices)

-
    def get_context(self, doc, num_fewshot):

-        # draw an extra fewshot sample if using same split as evaluting on
-        n_samples = num_fewshot + 1 if self.config.fewshot_split == self.config.test_split else num_fewshot 
+        # draw an extra fewshot sample if using same split as evaluating on
+        n_samples = (
+            num_fewshot + 1
+            if self.config.fewshot_split == self.config.test_split
+            else num_fewshot
+        )

        # draw `n_samples` docs from fewshot_docs
        fewshotex = self.sample(n_samples)
@@ -51,7 +51,6 @@ class Sampler:


 class BalancedSampler(Sampler):
-
    def sample(self, n):
        """
        TODO: this should return approximately class-balanced samples from our fewshot examples.
@@ -60,12 +59,10 @@ class BalancedSampler(Sampler):

        pass

-class ManualSampler(Sampler):

+class ManualSampler(Sampler):
    def sample(self, n):
-        """
-
-        """
+        """ """
        pass



--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
-import collections
+import random
 import itertools
+import collections
+
 import numpy as np
-import random
+
+import lm_eval.api
 import lm_eval.api.metrics
-import lm_eval.models
+
 import lm_eval.tasks
-import lm_eval.api
-from lm_eval.utils import positional_deprecated, run_task_tests, make_table, get_git_commit_hash
+import lm_eval.models
+
+from lm_eval.utils import (
+    positional_deprecated,
+    run_task_tests,
+    make_table,
+    get_git_commit_hash,
+)
+
+from lm_eval.logger import eval_logger


 @positional_deprecated
@@ -65,7 +76,7 @@ def simple_evaluate(
        assert isinstance(model, lm_eval.api.model.LM)
        lm = model

-    task_dict = lm_eval.api.task.get_task_dict(tasks, num_fewshot=num_fewshot)
+    task_dict = lm_eval.tasks.get_task_dict(tasks, num_fewshot=num_fewshot)

    if check_integrity:
        run_task_tests(task_list=tasks)
@@ -73,7 +84,6 @@ def simple_evaluate(
    results = evaluate(
        lm=lm,
        task_dict=task_dict,
-        num_fewshot=num_fewshot,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
        decontamination_ngrams_path=decontamination_ngrams_path,
@@ -102,7 +112,6 @@ decontaminate_suffix = "_decontaminate"
 def evaluate(
    lm,
    task_dict,
-    num_fewshot=0,
    limit=None,
    bootstrap_iters=100000,
    decontamination_ngrams_path=None,
@@ -123,15 +132,15 @@ def evaluate(
        Dictionary of results
    """

-    decontaminate = decontamination_ngrams_path is not None
+    # decontaminate = decontamination_ngrams_path is not None

    results = collections.defaultdict(dict)
    versions = collections.defaultdict(dict)

    requests = collections.defaultdict(list)
-    requests_origin = collections.defaultdict(list)
+    # requests_origin = collections.defaultdict(list)

-    docs = {}
+    # docs = {}

    # get lists of each type of request
    for task_name, task in task_dict.items():
@@ -146,13 +155,17 @@ def evaluate(
        # for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
        task.build_all_requests(limit=limit)
        # aggregate Instances by LM method requested to get output.
-        reqtype = "loglikelihood" if task.OUTPUT_TYPE == "multiple_choice" else task.OUTPUT_TYPE #TODO: this is hacky, fix in task.py
+        reqtype = (
+            "loglikelihood"
+            if task.OUTPUT_TYPE == "multiple_choice"
+            else task.OUTPUT_TYPE
+        )  # TODO: this is hacky, fix in task.py
        requests[reqtype].extend(task.instances)

    ### Run LM on inputs, get all outputs ###
    # execute each type of request
    for reqtype, reqs in requests.items():
-        print("Running", reqtype, "requests")
+        eval_logger.info("Running {} requests".format(reqtype))
        # create `K` copies of each request `req` based off `K = req.repeats`
        cloned_reqs = []
        for req in reqs:
@@ -170,7 +183,6 @@ def evaluate(
    for task_name, task in task_dict.items():
        task.apply_filters()

-
    ### Collect values of metrics on all datapoints ###
    # TODO: make metric configurable, add metric registry
    vals = collections.defaultdict(list)
@@ -180,21 +192,27 @@ def evaluate(
        # calculate values for each filter setup (TODO: make getting list of keys cleaner)
        # TODO: make it possible to use a different metric per key
        for key in task.instances[0].filtered_resps.keys():
-            for doc_id, doc in enumerate(itertools.islice(task.test_docs(), 0, limit) if task.has_test_docs() else task.validation_docs()):
+            for doc_id, doc in enumerate(
+                itertools.islice(task.test_docs(), 0, limit)
+                if task.has_test_docs()
+                else task.validation_docs()
+            ):
                # subset instances to only this document id ; sort by idx
                requests = list(filter(lambda x: x.doc_id == doc_id, task.instances))
                requests.sort(key=lambda x: x.idx)
-                metrics = task.process_results(doc, [req.filtered_resps[key] for req in requests])
+                metrics = task.process_results(
+                    doc, [req.filtered_resps[key] for req in requests]
+                )
                for metric, value in metrics.items():
                    vals[(task_name, key, metric)].append(value)

-
-
    ### Aggregate results over all datapoints ###
    # aggregate results ; run bootstrap CIs
    for (task_name, key, metric), items in vals.items():
        task = task_dict[task_name]
-        results[task_name][metric + " - filter=" + key] = task.aggregation()[metric](items)
+        results[task_name][metric + " - filter=" + key] = task.aggregation()[metric](
+            items
+        )

        # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
        # so we run them less iterations. still looking for a cleaner way to do this

--- a/lm_eval/filters/__init__.py
+++ b/lm_eval/filters/__init__.py
@@ -9,7 +9,7 @@ FILTER_REGISTRY = {
    # TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
    # that takes an input and returns a scalar and then should select the max reward,
    # or should implement different filters for different ways of handling a reward model's inference.
-    #"arg_max": selection.ArgMaxFilter, 
+    # "arg_max": selection.ArgMaxFilter,
 }


@@ -17,16 +17,19 @@ def get_filter(filter_name):
    return FILTER_REGISTRY[filter_name]


-def build_filter_ensemble(name, components):
+def build_filter_ensemble(filter_name, components):
    """
    Create a filtering pipeline.
    """
+
    filters = []
-    for step in components:
+    for (function, kwargs) in components:
+        if kwargs is None:
+            f = get_filter(function)()
+        else:
            # create a filter given its name in the registry
-            f = get_filter(step)() # TODO: pass kwargs to filters properly
-
+            f = get_filter(function)(**kwargs)  # TODO: pass kwargs to filters properly
        # add the filter as a pipeline step
        filters.append(f)

-    return FilterEnsemble(name=name, filters=filters)
+    return FilterEnsemble(name=filter_name, filters=filters)
--- a/lm_eval/filters/decontamination.py
+++ b/lm_eval/filters/decontamination.py
--- a/lm_eval/filters/extraction.py
+++ b/lm_eval/filters/extraction.py
@@ -4,19 +4,15 @@ from lm_eval.api.filter import Filter


 class RegexFilter(Filter):
-    """
-
+    """ """

-    """
-
-    def __init__(self, regex=r"#### (\-?[0-9\.\,]+)", fallback="[invalid]"):
+    def __init__(self, regex_pattern=r"#### (\-?[0-9\.\,]+)", fallback="[invalid]"):
        """
        pass a string `regex` to run `re.compile(r"regex")` on.
        `fallback` defines the output returned if no matches for the regex are located.
        """
-        self.regex_pattern = regex
-        self.regex = re.compile(regex)
-
+        self.regex_pattern = regex_pattern
+        self.regex = re.compile(regex_pattern)
        self.fallback = fallback

    def apply(self, resps):
@@ -30,7 +26,7 @@ class RegexFilter(Filter):
                match = self.regex.search(resp)
                if match:
                    match = match.group(1).strip()
-                    match_str.replace(",", "")
+                    match.replace(",", "")
                    # TODO: should we assume any other filtering is performed?
                else:
                    match = self.fallback

--- a/lm_eval/filters/selection.py
+++ b/lm_eval/filters/selection.py
 from lm_eval.api.filter import Filter

-class TakeFirstFilter:

+class TakeFirstFilter:
    def __init__(self):
        """
        Can define custom behavior here, if an individual instantiation of a Filter class should have state.

--- a/lm_eval/logger.py
+++ b/lm_eval/logger.py
+import logging
+
+# Configure the root logger as a side effect of importing this module:
+# timestamped records carrying level, source file and line number, at INFO level.
+# NOTE(review): per the stdlib docs, basicConfig does nothing if the root logger
+# already has handlers, so an application that configured logging earlier keeps
+# its own format.
+logging.basicConfig(
+    format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
+    datefmt="%Y-%m-%d:%H:%M:%S",
+    level=logging.INFO,
+)
+# Named logger for the package; it is imported as `eval_logger` by the evaluator,
+# model and prompt modules touched in this change.
+eval_logger = logging.getLogger("lm-eval")
--- a/lm_eval/models/gpt2.py
+++ b/lm_eval/models/gpt2.py
@@ -6,6 +6,7 @@ from tqdm import tqdm
 import torch.nn.functional as F

 from lm_eval import utils
+from lm_eval.logger import eval_logger
 from lm_eval.api.model import LM, register_model


@@ -31,10 +32,10 @@ class HFLM(LM):
            if device not in ["cuda", "cpu"]:
                device = int(device)
            self._device = torch.device(device)
-            print(f"Using device '{device}'")
+            eval_logger.info(f"Using device '{device}'")
        else:
-            print("Device not specified")
-            print(f"Cuda Available? {torch.cuda.is_available()}")
+            eval_logger.warning("Device not specified")
+            eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}")
            self._device = (
                torch.device("cuda")
                if torch.cuda.is_available()
@@ -110,7 +111,11 @@ class HFLM(LM):

    def _model_generate(self, context, max_length, eos_token_id):
        return self.gpt2.generate(
-            context, max_length=max_length, pad_token_id=eos_token_id, eos_token_id=eos_token_id, do_sample=False
+            context,
+            max_length=max_length,
+            pad_token_id=eos_token_id,
+            eos_token_id=eos_token_id,
+            do_sample=False,
        )

    def loglikelihood(self, requests):

--- a/lm_eval/models/gpt3.py
+++ b/lm_eval/models/gpt3.py
 import os
-import numpy as np
+import time
 import transformers
-from lm_eval.api.model import LM, register_model
-from lm_eval import utils
+
+import numpy as np
+
 from tqdm import tqdm
-import time
+from lm_eval import utils
+from lm_eval.api.model import LM, register_model


 def get_result(response, ctxlen):

--- a/lm_eval/prompts/__init__.py
+++ b/lm_eval/prompts/__init__.py
+from lm_eval.logger import eval_logger
+from promptsource.templates import DatasetTemplates
+
 # TODO: decide whether we want jinja2 or f-string prompts. would it be cursed to support both?
 # Prompt library.
 # Stores prompts in a dictionary indexed by 2 levels:
@@ -6,17 +9,37 @@
 PROMPT_REGISTRY = {
    "qa-basic": {
        "question-newline-answer": "Question: {{question}}\nAnswer:",
-        "q-newline-a": "Q: {question}\nA:"
+        "q-newline-a": "Q: {{question}}\nA:",
    },
 }

-def get_prompt(prompt_id: str):
+
+def get_prompt(prompt_id: str, dataset_name=None, subset_name=None):
    # unpack prompt name
-    try:
    category_name, prompt_name = prompt_id.split(":")
-    except:
+    eval_logger.info(f"Loading prompt from {category_name}")
+    if category_name == "promptsource":
+        try:
+            # prompts = DatasetTemplates(dataset_name, dataset_path)
+            if subset_name is None:
+                prompts = DatasetTemplates(dataset_name=dataset_name)
+            else:
+                prompts = DatasetTemplates(
+                    dataset_name=dataset_name, subset_name=subset_name
+                )
+        except Exception:
+            raise ValueError(f"{dataset_name} and {subset_name} not found")
+        if prompt_name in prompts.all_template_names:
+            return prompts[prompt_name]
+        else:
            raise ValueError(
-            f"expected only a single `:` as separator between \
-prompt category and name, but got `{prompt_id}` instead"
+                f"{prompt_name} not in prompt list {prompts.all_template_names}"
            )
+    else:
+        try:
            return PROMPT_REGISTRY[category_name][prompt_name]
+        except Exception:
+            raise ValueError(
+                f"expected only a single `:` as separator between \
+                prompt category and name, but got `{prompt_id}` instead"
+            )