Commit 3263c572 authored by lintangsutawika

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into squadv2

parents a27e8ed1 33d52483
......@@ -3,10 +3,10 @@ name: Tasks Modified
on:
push:
branches:
- big-refactor
- 'big-refactor*'
pull_request:
branches:
- big-refactor
- 'big-refactor*'
workflow_dispatch:
# comment/edit out the above to stop/change the triggers
jobs:
......@@ -18,7 +18,7 @@ jobs:
- name: checkout
uses: actions/checkout@v3
with:
fetch-depth: 0 # OR "2" -> To retrieve the preceding commit.
fetch-depth: 2 # OR "2" -> To retrieve the preceding commit.
# Uses the tj-actions/changed-files@v37 action to check for changes.
# Outputs provided here: https://github.com/tj-actions/changed-files#outputs
......@@ -51,6 +51,7 @@ jobs:
with:
python-version: 3.9
cache: 'pip'
cache-dependency-path: setup.py
- name: Install dependencies
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
run: |
......@@ -62,10 +63,10 @@ jobs:
- name: Test with pytest
# if new tasks are added, run tests on them
if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
run: python -m pytest tests/test_tasks.py -s -vv
# if api is modified, run tests on it
- name: Test more tasks with pytest
env:
API: true
if: steps.changed-tasks.outputs.api_any_modified == 'true'
run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
run: python -m pytest tests/test_tasks.py -s -vv
......@@ -26,7 +26,8 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: 3.9
cache: 'pip'
cache: pip
cache-dependency-path: setup.py
- name: Install dependencies
run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Pre-Commit
......@@ -46,22 +47,32 @@ jobs:
testcpu:
name: CPU Tests
runs-on: ubuntu-latest
timeout-minutes: 20
strategy:
matrix:
python-version: [ "3.9", "3.10", "3.11" ]
timeout-minutes: 30
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.9
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: 3.9
cache: 'pip'
python-version: ${{ matrix.python-version }}
cache: pip
cache-dependency-path: setup.py
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
# Install optional git dependencies
# Install optional git dependencies
# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Test with pytest
run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/tests_master --ignore=tests/extra
- name: Archive artifacts
uses: actions/upload-artifact@v3
with:
name: output_results
path: |
test_logs/*
......@@ -43,3 +43,9 @@ repos:
.*\.json|ignore.txt
)$
args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.5.1
hooks:
- id: mypy
additional_dependencies: [".[sentencepiece,multilingual,promptsource,gptq]", "types-PyYAML", "types-requests"]
exclude: ^tests/.*$
......@@ -116,8 +116,10 @@ accelerate launch main.py \
This will perform *data-parallel evaluation*: that is, placing a **single full copy** of your model onto each available GPU and *splitting batches across GPUs* to evaluate on K GPUs K times faster than on one.
However, if your model *is too large to be run on a single one of your GPUs*, then we provide an alternative method to run these large models: use of the `parallelize` argument.
If your model *is too large to be run on a single one of your GPUs*, then you can use `accelerate` with Fully Sharded Data Parallel (FSDP), which splits the model's weights across your data-parallel ranks. To enable this, select `YES` when asked `Do you want to use FullyShardedDataParallel?` while running `accelerate config`. To enable memory-efficient loading, select `YES` when asked `Do you want each individually wrapped FSDP unit to broadcast module parameters from rank 0 at the start?`. This ensures that only the rank 0 process loads the model and then broadcasts the parameters to the other ranks, instead of having each rank load all parameters, which can cause large RAM usage spikes around the start of the script and lead to errors.
We also provide a second method to run these large models: use of the `parallelize` argument.
```
python main.py \
--model hf \
......@@ -132,7 +134,7 @@ To pass even more advanced keyword arguments to `accelerate`, we allow for the f
- `max_cpu_memory`: the max amount of CPU memory to use when offloading the model weights to RAM.
- `offload_folder`: a folder where model weights will be offloaded to disk if needed.
Using this setting helps for massive models like BLOOM which require it, or to avoid exceeding your total system RAM (by default, with `accelerate launch`, one copy of the model for each GPU is initialized in RAM before being moved to GPU, resulting in large RAM usage spikes around the start of the script that may cause errors such as `Killed`). However, it naively splits models across GPUs, resulting in only a single GPU performing work at any point in time, and so is much slower than launching with `accelerate launch`, possibly by a factor of the total number of GPUs.
Note that this method naively splits models across GPUs, resulting in only a single GPU performing work at any point in time, and so is much slower than launching with `accelerate launch`, possibly by a factor of the total # of GPUs.
**Note that this option requires launching evaluation via `python main.py` rather than `accelerate launch main.py`.**
......
......@@ -4,6 +4,7 @@ Welcome to the docs for the LM Evaluation Harness!
## Table of Contents
* To learn about the public interface of the library, as well as how to evaluate via the commandline or as integrated into an external library, see the [Interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/user_guide.md)
* To learn how to add a new library, API, or model type to the library, as well as a quick explainer on the types of ways to evaluate an LM, see the [Model Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/model_guide.md).
* For a crash course on adding new tasks to the library, see our [New Task Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md).
* To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Advanced Task Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/advanced_task_guide.md).
......
# User Guide
This document details the interface exposed by `lm-eval` and the flags available to users.
## Command-line Interface
A majority of users run the library by cloning it from GitHub and running the `main.py` script.
Equivalently, running the library can be done via the `lm-eval` entrypoint at the command line.
This mode supports a number of command-line arguments, the details of which can also be seen by running with `-h` or `--help` (a Python sketch mapping these flags onto `simple_evaluate()` keyword arguments follows the list):
* `--model` : Selects which model type or provider is evaluated. Must be a string corresponding to the name of the model type/provider being used. See [the main README](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor#commercial-apis) for a full list of enabled model names and supported libraries or APIs.
* `--model_args` : Controls parameters passed to the model constructor. Accepts a string containing comma-separated keyword arguments to the model class in the format `"arg1=val1,arg2=val2,..."`, for example `--model_args pretrained=EleutherAI/pythia-160m,dtype=float32`. For a full list of supported keyword arguments, see the initialization of the `lm_eval.api.model.LM` subclass being used, e.g. [`HFLM`](https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/models/huggingface.py#L66)
* `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names. Must consist solely of valid task or group names.
* `--num_fewshot` : Sets the number of few-shot examples to place in context. Must be an integer.
* `--batch_size` : Sets the batch size used for evaluation. Can be a positive integer or `"auto"` to automatically select the largest batch size that will fit in memory, speeding up evaluation. One can pass `--batch_size auto:N` to re-select the maximum batch size `N` times during evaluation. This can help accelerate evaluation further, since `lm-eval` sorts documents in descending order of context length.
* `--max_batch_size` : Sets the maximum batch size to try to fit in memory, if `--batch_size auto` is passed.
* `--device` : Sets which device to place the model onto. Must be a string, for example, `"cuda", "cuda:0", "cpu", "mps"`. Defaults to "cuda", and can be ignored if running multi-GPU or running a non-local model type.
* `--output_path` : A string of the form `dir/file.jsonl` or `dir/`. Provides a path where high-level results will be saved, either into the file named or into the directory named. If `--log_samples` is passed as well, then per-document outputs and metrics will be saved into the directory as well.
* `--log_samples` : If this flag is passed, then the model's outputs, and the text fed into the model, will be saved at per-document granularity. Must be used with `--output_path`.
* `--limit` : Accepts an integer, or a float between 0.0 and 1.0 . If passed, will limit the number of documents to evaluate to the first X documents (if an integer) per task or first X% of documents per task. Useful for debugging, especially on costly API models.
* `--use_cache` : Should be a path where a sqlite db file can be written to. Takes a string of format `/path/to/sqlite_cache_` in order to create a cache db at `/path/to/sqlite_cache_rank{i}.db` for each process (0-NUM_GPUS). This allows results of prior runs to be cached, so that a given (model, task) pair does not need to be re-run in order to re-score it.
* `--decontamination_ngrams_path` : Deprecated, see [this commit](https://github.com/EleutherAI/lm-evaluation-harness/commit/00209e10f6e27edf5d766145afaf894079b5fe10) or older for a working decontamination-checker tool.
* `--check_integrity` : If this flag is used, the library tests for each task selected are run to confirm task integrity.
* `--write_out` : Used for diagnostic purposes to observe the format of task documents passed to a model. If this flag is used, then prints the prompt and gold target string for the first document of each task.
* `--show_config` : If used, prints the full `lm_eval.api.task.TaskConfig` contents (the non-default settings from the task's YAML file) for each task that was run, at the completion of an evaluation. Useful when modifying a task's YAML configuration locally, in order to record the exact configuration used for debugging or reproducibility purposes.
* `--include_path` : Accepts a path to a folder. If passed, then all YAML files in that folder containing `lm-eval`-compatible task configurations will be added to the task registry as available tasks. Useful when writing config files for your own tasks in a folder other than `lm_eval/tasks/`.
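The flags above map onto keyword arguments of `lm_eval.simple_evaluate()` (documented further below). As a minimal sketch, assuming the keyword names mirror the flags listed here:
```python
import lm_eval

# Roughly equivalent to:
#   lm-eval --model hf \
#           --model_args pretrained=EleutherAI/pythia-160m,dtype=float32 \
#           --tasks lambada_openai --num_fewshot 0 --batch_size auto --limit 10
results = lm_eval.simple_evaluate(
    model="hf",  # model type/provider name, as with --model
    model_args="pretrained=EleutherAI/pythia-160m,dtype=float32",
    tasks=["lambada_openai"],
    num_fewshot=0,
    batch_size="auto",
    limit=10,  # small limit for debugging only
)
```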
## External Library Usage
We also support using the library's Python API within model training loops or other scripts.
`lm_eval` supplies two functions for external import and use: `lm_eval.evaluate()` and `lm_eval.simple_evaluate()`.
`simple_evaluate()` can be used by simply creating an `lm_eval.api.model.LM` subclass that implements the methods described in the [Model Guide](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor/docs/model_guide.md), and wrapping your custom model in that class as follows:
```python
import lm_eval
...
my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code)
...
lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.greedy_until()`
results = lm_eval.simple_evaluate( # call simple_evaluate
model=lm_obj,
tasks=["taskname1", "taskname2"],
num_fewshot=0,
...
)
```
See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L35 for a full description of all arguments available. All keyword arguments to `simple_evaluate()` share the same role as the command-line flags described previously.
Additionally, the `evaluate()` function offers the core evaluation functionality provided by the library, but without some of the special handling and simplification + abstraction provided by `simple_evaluate()`.
See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L173 for more details.
As a brief example usage of `evaluate()`:
```python
import lm_eval
from my_tasks import MyTask1 # suppose you've defined a custom lm_eval.api.Task subclass in your own external codebase
...
my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code)
...
lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.greedy_until()`
results = lm_eval.evaluate(
    lm=lm_obj,
    task_dict={"mytask1": MyTask1},
    ...
)
```
from .evaluator import evaluate, simple_evaluate
......@@ -2,6 +2,7 @@ from dataclasses import dataclass
from typing import List
from lm_eval.api.instance import Instance
from datasets import Dataset
class Filter:
......@@ -13,12 +14,12 @@ class Filter:
"""
def __init__(self, *args, **kwargs):
def __init__(self, *args, **kwargs) -> None:
"""
Can define custom behavior here, if an individual instantiation of a Filter class should have state.
"""
def apply(self, resps):
def apply(self, resps, docs):
"""
Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects.
Should return the list of (filtered) response lists *in the same order as they were input*, e.g.
......@@ -40,14 +41,14 @@ class FilterEnsemble:
name: str
filters: List[Filter]
def apply(self, instances: List[Instance]):
def apply(self, instances: List[Instance], docs: List[Dataset]) -> None:
resps = [
inst.resps for inst in instances
] # operate just on the model responses
for f in self.filters:
# apply filters in sequence
resps = f.apply(resps)
resps = f.apply(resps, docs)
# add the end results after filtering to filtered_requests of their respective source instances.
# has key `self.name`: each FilterEnsemble applied in a given run should use a different name.
......
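For illustration, a hypothetical filter written against the updated interface (where `apply` now also receives the task documents) might look like the sketch below; the class name and behavior are illustrative only, and the `Filter` base class is assumed to live in `lm_eval.api.filter`.
```python
from lm_eval.api.filter import Filter  # base class location assumed


class TakeFirstLineFilter(Filter):
    """Hypothetical filter: keep only the first line of each model response."""

    def apply(self, resps, docs):
        # resps: one list of responses per document; docs is available but unused here.
        # Preserve the outer ordering and nesting, as required by FilterEnsemble.
        return [
            [r.splitlines()[0] if r.strip() else r for r in doc_resps]
            for doc_resps in resps
        ]
```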
......@@ -19,7 +19,7 @@ class Instance:
doc_id: str = None
repeats: str = None
def __post_init__(self):
def __post_init__(self) -> None:
# unpack metadata field
self.task_name, self.doc_id, self.repeats = self.metadata
......
......@@ -56,6 +56,55 @@ def matthews_corrcoef(items):
return sklearn.metrics.matthews_corrcoef(golds, preds)
@register_aggregation("bleu")
def bleu(items):
"""The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
for evaluating a generated sentence to a reference sentence. It counts matching
n-grams in the candidate translation to n-grams in the reference text, where
1-gram or unigram would be each token and a bigram comparison would be each
word pair. The comparison is made regardless of word order
Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
Paper: https://www.aclweb.org/anthology/P02-1040/
Higher is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_bleu(preds, refs).score
@register_aggregation("chrf")
def chrf(items):
"""chrF++ is a tool for automatic evaluation of machine translation output
based on character n-gram precision and recall enhanced with word n-grams.
Source: https://github.com/m-popovic/chrF
Paper: https://www.aclweb.org/anthology/W15-3049.pdf
Higher is better # TODO I think
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_chrf(preds, refs).score
@register_aggregation("ter")
def ter(items):
"""Translation Error Rate is an error metric for machine translation that
measures the number of edits required to change a system output into one
of the references
Source: http://www.cs.umd.edu/~snover/tercom/
Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
Lower is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_ter(preds, refs).score
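As a minimal sketch of how these corpus-level aggregations consume their inputs, assuming each item is a `(reference, prediction)` pair as the `zip(*items)` unpacking above implies and that `sacrebleu` is installed:
```python
import sacrebleu

# one (reference, prediction) tuple per document, as accumulated by the harness
items = [
    ("the cat sat on the mat", "the cat sat on a mat"),
    ("hello world", "hello there world"),
]

refs, preds = zip(*items)
# sacrebleu expects a list of hypotheses and a list of reference streams
bleu = sacrebleu.corpus_bleu(list(preds), [list(refs)]).score
chrf = sacrebleu.corpus_chrf(list(preds), [list(refs)]).score
ter = sacrebleu.corpus_ter(list(preds), [list(refs)]).score
print(f"BLEU={bleu:.2f} chrF={chrf:.2f} TER={ter:.2f}")
```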
@register_metric(
metric="acc",
higher_is_better=True,
......@@ -160,6 +209,36 @@ def f1_fn(items): # This is a passthrough function
return items
@register_metric(
metric="bleu",
higher_is_better=True,
output_type="greedy_until",
aggregation="bleu",
)
def bleu_fn(items): # This is a passthrough function
return items
@register_metric(
metric="chrf",
higher_is_better=True,
output_type="greedy_until",
aggregation="chrf",
)
def chrf_fn(items): # This is a passthrough function
return items
@register_metric(
metric="ter",
higher_is_better=True,
output_type="greedy_until",
aggregation="ter",
)
def ter_fn(items): # This is a passthrough function
return items
@register_metric(
metric="acc_all",
higher_is_better=True,
......@@ -217,55 +296,6 @@ def weighted_mean(items):
return sum(a) / sum(b)
@register_metric(metric="bleu", higher_is_better=True, aggregation="mean")
def bleu(items):
"""The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
for evaluating a generated sentence to a reference sentence. It counts matching
n-grams in the candidate translation to n-grams in the reference text, where
1-gram or unigram would be each token and a bigram comparison would be each
word pair. The comparison is made regardless of word order
Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
Paper: https://www.aclweb.org/anthology/P02-1040/
Higher is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_bleu(preds, refs).score
@register_metric(metric="chrf", higher_is_better=True, aggregation="mean")
def chrf(items):
"""chrF++ is a tool for automatic evaluation of machine translation output
based on character n-gram precision and recall enhanced with word n-grams.
Source: https://github.com/m-popovic/chrF
Paper: https://www.aclweb.org/anthology/W15-3049.pdf
Higher is better # TODO I think
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_chrf(preds, refs).score
@register_metric(metric="ter", higher_is_better=True, aggregation="mean")
def ter(items):
"""Translation Error Rate is an error metric for machine translation that
measures the number of edits required to change a system output into one
of the references
Source: http://www.cs.umd.edu/~snover/tercom/
Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
Lower is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_ter(preds, refs).score
def is_non_str_iterable(obj):
return isinstance(obj, Iterable) and not isinstance(obj, str)
......@@ -302,7 +332,7 @@ def _sacreformat(refs, preds):
class _bootstrap_internal:
def __init__(self, f, n):
def __init__(self, f, n) -> None:
self.f = f
self.n = n
......
import abc
import os
from typing import Union, List, Tuple
import torch
from typing import Union, List, Tuple, Optional, Type, TypeVar
from sqlitedict import SqliteDict
import json
import hashlib
......@@ -11,9 +12,11 @@ from tqdm import tqdm
from lm_eval import utils
from lm_eval.logger import eval_logger
T = TypeVar("T", bound="LM")
class LM(abc.ABC):
def __init__(self):
def __init__(self) -> None:
"""Defines the interface that should be implemented by all LM subclasses.
LMs are assumed to take text (strings) as input and yield strings as output
(inputs/outputs should be tokenization-agnostic.)
......@@ -111,11 +114,28 @@ class LM(abc.ABC):
pass
@classmethod
def create_from_arg_string(cls, arg_string, additional_config=None):
def create_from_arg_string(
cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
) -> T:
"""
Creates an instance of the LM class using the given argument string and additional config.
Parameters:
- arg_string: A string containing arguments in the format key1=value1,key2=value2.
- additional_config: Optional dictionary containing additional configuration parameters.
Returns:
- Instance of the LM class.
"""
additional_config = {} if additional_config is None else additional_config
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
if args2.get("device") == "mps" or args.get("device") == "mps":
# TODO: delete once float16 MPS is fixed in torch stable
if (
args2.get("device") in ("mps", "mps:0")
or args.get("device") in ("mps", "mps:0")
and "dev" not in torch.__version__
):
args["dtype"] = "float32"
return cls(**args, **args2)
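A brief usage sketch of this constructor path, using the `HFLM` class referenced in the user guide (the `additional_config` keys shown are illustrative):
```python
from lm_eval.models.huggingface import HFLM

# equivalent to --model hf --model_args pretrained=EleutherAI/pythia-160m,dtype=float32
lm = HFLM.create_from_arg_string(
    "pretrained=EleutherAI/pythia-160m,dtype=float32",
    additional_config={"batch_size": 8, "device": None},  # None values are dropped
)
```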
......@@ -133,7 +153,7 @@ class LM(abc.ABC):
# not support multi-device parallelism nor expect it.
return self._world_size
def set_cache_hook(self, cache_hook):
def set_cache_hook(self, cache_hook) -> None:
self.cache_hook = cache_hook
......@@ -144,14 +164,14 @@ def hash_args(attr, args):
class CacheHook:
def __init__(self, cachinglm):
def __init__(self, cachinglm) -> None:
if cachinglm is None:
self.dbdict = None
return
self.dbdict = cachinglm.dbdict
def add_partial(self, attr, req, res):
def add_partial(self, attr, req, res) -> None:
if self.dbdict is None:
return
hsh = hash_args(attr, req)
......@@ -159,7 +179,7 @@ class CacheHook:
class CachingLM:
def __init__(self, lm, cache_db):
def __init__(self, lm, cache_db) -> None:
"""LM wrapper that returns cached results if they exist, and uses the underlying LM if not.
:param lm: LM
......
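For context, `CachingLM` backs the `--use_cache` flag. A rough sketch of wrapping an already-instantiated `LM` subclass (the cache path is illustrative, and the module paths are assumptions):
```python
from lm_eval.api.model import CachingLM  # module path assumed
from lm_eval.models.huggingface import HFLM

lm = HFLM.create_from_arg_string("pretrained=EleutherAI/pythia-160m")
# repeated (model, task) runs are then served from the sqlite cache where possible
cached_lm = CachingLM(lm, "/path/to/sqlite_cache_rank0.db")
```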
class Sampler:
def __init__(self, docs, task, fewshot_indices=None, rnd=None):
def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
self.rnd = rnd
assert self.rnd, "must pass rnd to FewShotSampler!"
......@@ -19,7 +18,6 @@ class Sampler:
self.docs = self.docs.select(fewshot_indices)
def get_context(self, doc, num_fewshot):
# draw an extra fewshot sample if using same split as evaluating on
n_samples = (
num_fewshot + 1
......@@ -74,7 +72,7 @@ class Sampler:
class BalancedSampler(Sampler):
def sample(self, n):
def sample(self, n) -> None:
"""
TODO: this should return approximately class-balanced samples from our fewshot examples.
TODO: what order should they be in? maybe random?
......@@ -84,7 +82,7 @@ class BalancedSampler(Sampler):
class ManualSampler(Sampler):
def sample(self, n):
def sample(self, n) -> None:
""" """
pass
......
......@@ -78,7 +78,7 @@ class TaskConfig(dict):
# runtime configuration options
num_fewshot: int = 0
# scoring options
metric_list: str = None
metric_list: list = None
output_type: str = "greedy_until"
generation_kwargs: dict = None
repeats: int = 1
......@@ -88,7 +88,12 @@ class TaskConfig(dict):
metadata: str = None # by default, not used in the code. allows for users to pass arbitrary info to tasks
def __post_init__(self):
def __post_init__(self) -> None:
if "." in self.dataset_path:
import inspect
from importlib import import_module
self.dataset_path = inspect.getfile(import_module(self.dataset_path))
if self.generation_kwargs is not None:
if self.output_type != "greedy_until":
......@@ -171,7 +176,7 @@ class Task(abc.ABC):
cache_dir=None,
download_mode=None,
config=None,
):
) -> None:
"""
:param data_dir: str
Stores the path to a local folder containing the `Task`'s data files.
......@@ -182,7 +187,6 @@ class Task(abc.ABC):
HuggingFace `datasets` API with the default cache directory located at:
`~/.cache/huggingface/datasets`
NOTE: You can change the cache location globally for a given process
by setting the shell environment variable, `HF_DATASETS_CACHE`,
to another directory:
`export HF_DATASETS_CACHE="/path/to/another/directory"`
:param download_mode: datasets.DownloadMode
......@@ -213,7 +217,7 @@ class Task(abc.ABC):
list(self.fewshot_docs()), self, rnd=random.Random(1234)
)
def download(self, data_dir=None, cache_dir=None, download_mode=None):
def download(self, data_dir=None, cache_dir=None, download_mode=None) -> None:
"""Downloads and returns the task dataset.
Override this method to download the dataset from a custom API.
......@@ -246,6 +250,11 @@ class Task(abc.ABC):
download_mode=download_mode,
)
@property
def config(self):
"""Returns the TaskConfig associated with this class."""
return self._config
@abc.abstractmethod
def has_training_docs(self):
"""Whether the task has a training set"""
......@@ -322,7 +331,7 @@ class Task(abc.ABC):
return rnd.sample(self._training_docs, k)
def doc_to_decontamination_query(self, doc):
def doc_to_decontamination_query(self, doc) -> None:
print(
"Override doc_to_decontamination_query with document specific decontamination query."
)
......@@ -336,7 +345,7 @@ class Task(abc.ABC):
def doc_to_target(self, doc):
pass
def build_all_requests(self, limit=None, rank=None, world_size=None):
def build_all_requests(self, limit=None, rank=None, world_size=None) -> None:
"""Build a set of Instances for a task, and store them in task.instances"""
if self.has_test_docs():
docs = self.test_docs()
......@@ -348,7 +357,7 @@ class Task(abc.ABC):
), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
eval_logger.info(
f"Building contexts for task '{self._config.task}' on rank {rank}..."
f"Building contexts for task '{self.config.task}' on rank {rank}..."
)
instances = []
......@@ -358,14 +367,14 @@ class Task(abc.ABC):
# sample fewshot context #TODO: need to offset doc_id by rank now!
fewshot_ctx = self.fewshot_context(
doc,
self._config.num_fewshot,
self.config.num_fewshot,
)
# TODO: we should override self._config.repeats if doing greedy gen so users don't waste time+compute
# TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
inst = self.construct_requests(
doc=doc,
ctx=fewshot_ctx,
metadata=(self._config["task"], doc_id, self._config.repeats),
metadata=(self.config["task"], doc_id, self.config.repeats),
)
if not isinstance(inst, list):
......@@ -453,9 +462,9 @@ class Task(abc.ABC):
if num_fewshot == 0:
# always prepend the (possibly empty) task description
labeled_examples = self._config.description
labeled_examples = self.config.description
else:
labeled_examples = self._config.description + self.sampler.get_context(
labeled_examples = self.config.description + self.sampler.get_context(
doc, num_fewshot
)
......@@ -465,14 +474,13 @@ class Task(abc.ABC):
elif type(example) == list:
return [labeled_examples + ex for ex in example]
elif type(example) == int:
if self._config.doc_to_choice is not None:
if self.config.doc_to_choice is not None:
choices = self.doc_to_choice(doc)
return labeled_examples + choices[example]
else:
return labeled_examples + str(example)
def apply_filters(self):
if hasattr(self, "_filters"):
for f in self._filters:
f.apply(self._instances)
......@@ -488,7 +496,7 @@ class Task(abc.ABC):
"""
# TODO: this should only return the overrides applied to a non-YAML task's configuration.
# (num_fewshot)
return self._config.to_dict()
return self.config.to_dict()
class ConfigurableTask(Task):
......@@ -498,40 +506,40 @@ class ConfigurableTask(Task):
def __init__(
self, data_dir=None, cache_dir=None, download_mode=None, config: dict = None
): # TODO no super() call here
) -> None: # TODO no super() call here
# Get pre-configured attributes
self._config = self.CONFIG
# Use new configurations if there was no preconfiguration
if self._config is None:
if self.config is None:
self._config = TaskConfig(**config)
# Overwrite configs
else:
if config is not None:
self._config.__dict__.update(config)
if self._config is None:
if self.config is None:
raise ValueError(
"Must pass a config to ConfigurableTask, either in cls.CONFIG or `config` kwarg"
)
if self._config.output_type is not None:
assert self._config.output_type in ALL_OUTPUT_TYPES
self.OUTPUT_TYPE = self._config.output_type
if self.config.output_type is not None:
assert self.config.output_type in ALL_OUTPUT_TYPES
self.OUTPUT_TYPE = self.config.output_type
if self._config.dataset_path is not None:
self.DATASET_PATH = self._config.dataset_path
if self.config.dataset_path is not None:
self.DATASET_PATH = self.config.dataset_path
if self._config.dataset_name is not None:
self.DATASET_NAME = self._config.dataset_name
if self.config.dataset_name is not None:
self.DATASET_NAME = self.config.dataset_name
self._metric_fn_list = {}
self._metric_fn_kwargs = {}
self._aggregation_list = {}
self._higher_is_better = {}
_metric_list = DEFAULT_METRIC_REGISTRY[self._config.output_type]
if self._config.metric_list is None:
_metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type]
if self.config.metric_list is None:
# TODO: handle this in TaskConfig.__post_init__ ?
for metric_name in _metric_list:
self._metric_fn_list[metric_name] = get_metric(metric_name)
......@@ -540,7 +548,7 @@ class ConfigurableTask(Task):
)
self._higher_is_better[metric_name] = is_higher_better(metric_name)
else:
for metric_config in self._config.metric_list:
for metric_config in self.config.metric_list:
assert "metric" in metric_config
metric_name = metric_config["metric"]
kwargs = {
......@@ -549,7 +557,7 @@ class ConfigurableTask(Task):
if key not in ["metric", "aggregation", "higher_is_better"]
}
if self._config.process_results is not None:
if self.config.process_results is not None:
self._metric_fn_list[metric_name] = None
self._metric_fn_kwargs[metric_name] = {}
elif callable(metric_name):
......@@ -570,7 +578,6 @@ class ConfigurableTask(Task):
"aggregation"
]
else:
INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()}
metric_agg = get_default_aggregation(metric_name)
eval_logger.warning(
......@@ -592,13 +599,13 @@ class ConfigurableTask(Task):
)
self._higher_is_better[metric_name] = is_higher_better(metric_name)
self.download(self._config.dataset_kwargs)
self.download(self.config.dataset_kwargs)
self._training_docs = None
self._fewshot_docs = None
if self._config.filter_list is not None:
if self.config.filter_list is not None:
self._filters = []
for filter_config in self._config.filter_list:
for filter_config in self.config.filter_list:
for filter_pipeline in filter_config:
filter_name = filter_config["name"]
filter_functions = filter_config["filter"]
......@@ -613,10 +620,10 @@ class ConfigurableTask(Task):
else:
self._filters = [build_filter_ensemble("none", [["take_first", None]])]
if self._config.use_prompt is not None:
eval_logger.info(f"loading prompt {self._config.use_prompt}")
if self.config.use_prompt is not None:
eval_logger.info(f"loading prompt {self.config.use_prompt}")
self.prompt = get_prompt(
self._config.use_prompt, self.DATASET_PATH, self.DATASET_NAME
self.config.use_prompt, self.DATASET_PATH, self.DATASET_NAME
)
else:
self.prompt = None
......@@ -627,23 +634,23 @@ class ConfigurableTask(Task):
)
if self.has_test_docs():
docs = self.test_docs()
self.task_docs = self.test_docs()
elif self.has_validation_docs():
docs = self.validation_docs()
self.task_docs = self.validation_docs()
else:
assert (
False
), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
# Test One Doc
self.features = list(docs.features.keys())
self.features = list(self.task_docs.features.keys())
self.multiple_input = 0
self.multiple_target = 0
test_doc = docs[0]
test_doc = self.task_docs[0]
test_text = self.doc_to_text(test_doc)
test_target = self.doc_to_target(test_doc)
if self._config.doc_to_choice is not None:
if self.config.doc_to_choice is not None:
test_choice = self.doc_to_choice(test_doc)
if type(test_choice) is not list:
eval_logger.error("doc_to_choice must return list")
......@@ -667,24 +674,23 @@ class ConfigurableTask(Task):
check_choices = test_choice
else:
check_choices = [test_target]
for choice in check_choices:
choice_has_whitespace = True if " " in choice else False
delimiter_has_whitespace = (
True if " " in self._config.target_delimiter else False
)
if delimiter_has_whitespace and choice_has_whitespace:
eval_logger.warning(
f'Both target_delimiter and target choice: "{choice}" have whitespace'
)
elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
eval_logger.warning(
f'Both target_delimiter and target choice: "{choice}" does not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
if self.config.doc_to_choice is not None:
for choice in check_choices:
choice_has_whitespace = True if choice[0].isspace() else False
delimiter_has_whitespace = (
True if self.config.target_delimiter[-1].isspace() else False
)
def download(self, dataset_kwargs=None):
if delimiter_has_whitespace and choice_has_whitespace:
eval_logger.warning(
f'Both target_delimiter and target choice: "{choice}" have whitespace'
)
elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
eval_logger.warning(
f'Both target_delimiter and target choice: "{choice}" does not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
)
def download(self, dataset_kwargs=None) -> None:
self.dataset = datasets.load_dataset(
path=self.DATASET_PATH,
name=self.DATASET_NAME,
......@@ -692,67 +698,76 @@ class ConfigurableTask(Task):
)
def has_training_docs(self) -> bool:
if self._config.training_split is not None:
if self.config.training_split is not None:
return True
else:
return False
def has_validation_docs(self) -> bool:
if self._config.validation_split is not None:
if self.config.validation_split is not None:
return True
else:
return False
def has_test_docs(self) -> bool:
if self._config.test_split is not None:
if self.config.test_split is not None:
return True
else:
return False
def training_docs(self) -> datasets.Dataset:
if self.has_training_docs():
if self._config.process_docs is not None:
return self._config.process_docs(
self.dataset[self._config.training_split]
if self.config.process_docs is not None:
return self.config.process_docs(
self.dataset[self.config.training_split]
)
return self.dataset[self._config.training_split]
return self.dataset[self.config.training_split]
def validation_docs(self) -> datasets.Dataset:
if self.has_validation_docs():
if self._config.process_docs is not None:
return self._config.process_docs(
self.dataset[self._config.validation_split]
if self.config.process_docs is not None:
return self.config.process_docs(
self.dataset[self.config.validation_split]
)
return self.dataset[self._config.validation_split]
return self.dataset[self.config.validation_split]
def test_docs(self) -> datasets.Dataset:
if self.has_test_docs():
if self._config.process_docs is not None:
return self._config.process_docs(self.dataset[self._config.test_split])
return self.dataset[self._config.test_split]
if self.config.process_docs is not None:
return self.config.process_docs(self.dataset[self.config.test_split])
return self.dataset[self.config.test_split]
def fewshot_docs(self):
if self._config.fewshot_split is not None:
return self.dataset[self._config.fewshot_split]
if self.config.fewshot_split is not None:
return self.dataset[self.config.fewshot_split]
else:
if self._config.num_fewshot > 0:
if self.config.num_fewshot > 0:
eval_logger.warning(
f"Task '{self._config.task}': "
f"Task '{self.config.task}': "
"num_fewshot > 0 but fewshot_split is None. "
"using preconfigured rule."
)
return super().fewshot_docs()
def apply_filters(self):
if hasattr(self, "_filters"):
for f in self._filters:
f.apply(self._instances, self.task_docs)
else:
eval_logger.warning("No filter defined, passing through instances")
return self._instances
def should_decontaminate(self):
return self._config.should_decontaminate
return self.config.should_decontaminate
def doc_to_decontamination_query(self, doc):
if self._config.should_decontaminate:
if self._config.doc_to_decontamination_query in self.features:
return doc[self._config.doc_to_decontamination_query]
if self.config.should_decontaminate:
if self.config.doc_to_decontamination_query in self.features:
return doc[self.config.doc_to_decontamination_query]
else:
return ast.literal_eval(
utils.apply_template(self._config.doc_to_decontamination_query, doc)
utils.apply_template(self.config.doc_to_decontamination_query, doc)
)
def _process_doc(self, doc):
......@@ -767,23 +782,22 @@ class ConfigurableTask(Task):
return doc
def doc_to_text(self, doc):
if self.prompt is not None:
doc_to_text = self.prompt
else:
doc_to_text = self._config.doc_to_text
doc_to_text = self.config.doc_to_text
if type(doc_to_text) == int:
return doc_to_text
elif type(doc_to_text) == str:
if doc_to_text in self.features:
# if self._config.doc_to_choice is not None:
# if self.config.doc_to_choice is not None:
# return self.doc_to_choice(doc)[doc[doc_to_text]]
# else:
return doc[doc_to_text]
else:
text_string = utils.apply_template(doc_to_text, doc)
if text_string.isdigit():
if text_string.isdigit() and self._config.doc_to_choice is not None:
return ast.literal_eval(text_string)
else:
return text_string
......@@ -796,29 +810,28 @@ class ConfigurableTask(Task):
return applied_prompt[0]
else:
eval_logger.warning("Applied prompt returns empty string")
return self._config.fewshot_delimiter
return self.config.fewshot_delimiter
else:
print(type(doc_to_text))
raise TypeError
def doc_to_target(self, doc: dict) -> Union[int, str, list]:
if self.prompt is not None:
doc_to_target = self.prompt
else:
doc_to_target = self._config.doc_to_target
doc_to_target = self.config.doc_to_target
if type(doc_to_target) == int:
return doc_to_target
elif type(doc_to_target) == str:
if doc_to_target in self.features:
# if self._config.doc_to_choice is not None:
# if self.config.doc_to_choice is not None:
# return self.doc_to_choice(doc)[doc[doc_to_target]]
# else:
return doc[doc_to_target]
else:
target_string = utils.apply_template(doc_to_target, doc)
if target_string.isdigit():
if target_string.isdigit() and self._config.doc_to_choice is not None:
return ast.literal_eval(target_string)
elif (
len(target_string) >= 2
......@@ -839,18 +852,17 @@ class ConfigurableTask(Task):
return applied_prompt[1]
else:
eval_logger.warning("Applied prompt returns empty string")
return self._config.fewshot_delimiter
return self.config.fewshot_delimiter
else:
raise TypeError
def doc_to_choice(self, doc: Any) -> List[str]:
if self.prompt is not None:
doc_to_choice = self.prompt
elif self._config.doc_to_choice is None:
elif self.config.doc_to_choice is None:
eval_logger.error("doc_to_choice was called but not set in config")
else:
doc_to_choice = self._config.doc_to_choice
doc_to_choice = self.config.doc_to_choice
if type(doc_to_choice) == str:
return ast.literal_eval(utils.apply_template(doc_to_choice, doc))
......@@ -871,8 +883,8 @@ class ConfigurableTask(Task):
# in multiple_choice tasks, this should be castable to an int corresponding to the index
# within the answer choices, while doc_to_target is the string version of {{answer_choices[gold]}}.
if self._config.gold_alias is not None:
doc_to_target = self._config.gold_alias
if self.config.gold_alias is not None:
doc_to_target = self.config.gold_alias
else:
return self.doc_to_target(doc)
......@@ -888,15 +900,13 @@ class ConfigurableTask(Task):
def construct_requests(
self, doc: dict, ctx: str, **kwargs
) -> Union[List[Instance], Instance]:
if self.OUTPUT_TYPE == "loglikelihood":
arguments = (ctx, self.doc_to_target(doc))
elif self.OUTPUT_TYPE == "loglikelihood_rolling":
arguments = (self.doc_to_target(doc),)
elif self.OUTPUT_TYPE == "multiple_choice":
choices = self.doc_to_choice(doc)
target_delimiter = self._config.target_delimiter
target_delimiter = self.config.target_delimiter
if self.multiple_input:
# If there are multiple inputs, choices are placed in the ctx
cont = self.doc_to_target(doc)
......@@ -938,7 +948,7 @@ class ConfigurableTask(Task):
return request_list
elif self.OUTPUT_TYPE == "greedy_until":
arguments = (ctx, self._config.generation_kwargs)
arguments = (ctx, self.config.generation_kwargs)
return Instance(
request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs
......@@ -946,8 +956,8 @@ class ConfigurableTask(Task):
def process_results(self, doc, results):
if callable(self._config.process_results):
return self._config.process_results(doc, results)
if callable(self.config.process_results):
return self.config.process_results(doc, results)
result_dict = {}
use_metric = list(self._metric_fn_list.keys())
......@@ -980,7 +990,6 @@ class ConfigurableTask(Task):
),
}
elif self.OUTPUT_TYPE == "multiple_choice":
lls, is_greedy = zip(*results)
# retrieve choices in List[str] form, to compute choice lengths, etc.
......@@ -1005,18 +1014,36 @@ class ConfigurableTask(Task):
gold = self.doc_to_text(doc)
else:
gold = self.doc_to_target(doc)
if type(gold) is str:
gold = choices.index(gold)
gold_index_error = False
if type(gold) is list:
gold = [i if i < len(choices) else -100 for i in gold]
if -100 in gold:
gold_index_error = True
else:
if type(gold) is int:
gold = gold if gold < len(choices) else -100
elif type(gold) is str:
gold = choices.index(gold) if gold in choices else -100
if gold == -100:
gold_index_error = True
if gold_index_error:
eval_logger.warning(
f"Label index was not in within range of available choices,"
f"Sample:\n\n{doc}\n\n"
)
if self.multiple_target:
acc = 1.0 if pred in gold else 0.0
acc_norm = 1.0 if pred_norm in gold else 0.0
exact_match = int(any([is_greedy[i] for i in gold]))
exact_match = int(any([is_greedy[i] if i != -100 else 0 for i in gold]))
else:
acc = 1.0 if pred == gold else 0.0
acc_norm = 1.0 if pred_norm == gold else 0.0
# TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
exact_match = int(is_greedy[gold])
exact_match = int(is_greedy[gold]) if gold != -100 else 0
result_dict = {
**({"acc": acc} if "acc" in use_metric else {}),
......@@ -1034,13 +1061,15 @@ class ConfigurableTask(Task):
result_dict["acc_mutual_info"] = acc_mutual_info
elif self.OUTPUT_TYPE == "greedy_until":
gold = self.doc_to_target(doc)
if self._config.doc_to_choice is not None:
if self.config.doc_to_choice is not None:
# If you set doc_to_choice,
# it assumes that doc_to_target returns a number.
choices = self.doc_to_choice(doc)
gold = choices[gold]
# we expect multiple_targets to be a list.
elif self.multiple_target:
gold = list(gold)
else:
gold = str(gold)
......@@ -1051,26 +1080,38 @@ class ConfigurableTask(Task):
# return true if any are true
# TODO: this may break for multipLe_target, non zero-or-1 metrics
scores = []
if not isinstance(gold, list):
# sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
# print(gold)
gold = [gold]
for gold_option in gold:
res = self._metric_fn_list[metric](
references=[gold_option],
predictions=[result],
**self._metric_fn_kwargs[metric],
)
if isinstance(res, dict):
try:
result_score = self._metric_fn_list[metric](
references=[gold_option],
predictions=[result],
**self._metric_fn_kwargs[metric],
)
except TypeError: # TODO: this is hacky and I don't want to do it
result_score = self._metric_fn_list[metric](
[gold_option, result]
)
if isinstance(result_score, dict):
# TODO: this handles the case where HF evaluate returns a dict.
res = res[metric]
scores.append(res)
result_score = result_score[metric]
scores.append(result_score)
if any(scores):
result_score = 1.0
else:
result_score = 0.0
else:
result_score = self._metric_fn_list[metric](
references=[gold],
predictions=[result],
**self._metric_fn_kwargs[metric],
)
try:
result_score = self._metric_fn_list[metric](
references=[gold],
predictions=[result],
**self._metric_fn_kwargs[metric],
)
except TypeError: # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
result_score = self._metric_fn_list[metric]([gold, result])
if isinstance(result_score, dict):
# TODO: this handles the case where HF evaluate returns a dict.
result_score = result_score[metric]
......@@ -1164,7 +1205,7 @@ class PerplexityTask(Task):
def doc_to_decontamination_query(self, doc):
return doc
def doc_to_text(self, doc):
def doc_to_text(self, doc) -> str:
return ""
def doc_to_target(self, doc):
......
......@@ -11,8 +11,7 @@ from lm_eval.api.registry import (
)
def include_benchmarks(task_dir):
def include_benchmarks(task_dir: str) -> None:
for root, subdirs, file_list in os.walk(task_dir):
if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
for f in file_list:
......@@ -45,7 +44,7 @@ def include_benchmarks(task_dir):
task_names = utils.pattern_match(task_list, ALL_TASKS)
for task in task_names:
if task in TASK_REGISTRY:
if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
if group in GROUP_REGISTRY:
GROUP_REGISTRY[group].append(task)
else:
......
group: pythia
task:
- lambada_openai
- wikitext
- logiqa
- piqa
- sciq
- wsc
- wikitext
- winogrande
- arc
- logiqa
- wsc
- ai2_arc
- blimp
- hendrycksTest*
import os
from typing import Any
import zstandard
import json
import jsonlines
......@@ -9,7 +10,7 @@ import tqdm
from pathlib import Path
def json_serial(obj):
def json_serial(obj: Any) -> str:
"""JSON serializer for objects not serializable by default json code"""
if isinstance(obj, (datetime.datetime,)):
......@@ -19,7 +20,7 @@ def json_serial(obj):
# Modified version of lm_dataformat Archive for single file.
class Archive:
def __init__(self, file_path, compression_level=3):
def __init__(self, file_path: str, compression_level: int = 3) -> None:
self.file_path = file_path
dir_name = os.path.dirname(file_path)
if dir_name:
......@@ -28,7 +29,7 @@ class Archive:
self.cctx = zstandard.ZstdCompressor(level=compression_level)
self.compressor = self.cctx.stream_writer(self.fh)
def add_data(self, data, meta={}):
def add_data(self, data, meta={}) -> None:
self.compressor.write(
json.dumps({"text": data, "meta": meta}, default=json_serial).encode(
"UTF-8"
......@@ -36,7 +37,7 @@ class Archive:
+ b"\n"
)
def commit(self):
def commit(self) -> None:
self.compressor.flush(zstandard.FLUSH_FRAME)
self.fh.flush()
self.fh.close()
......@@ -44,10 +45,16 @@ class Archive:
# Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm.
class Reader:
def __init__(self):
def __init__(self) -> None:
pass
def read(self, file, get_meta=False, autojoin_paragraphs=True, para_joiner="\n\n"):
def read(
self,
file,
get_meta: bool = False,
autojoin_paragraphs: bool = True,
para_joiner: str = "\n\n",
):
with open(file, "rb") as fh:
self.fh = fh
cctx = zstandard.ZstdDecompressor()
......@@ -72,7 +79,7 @@ class Reader:
class TextArchive:
def __init__(self, file_path, mode="rb+"):
def __init__(self, file_path, mode: str = "rb+") -> None:
self.file_path = file_path
dir_name = os.path.dirname(file_path)
if dir_name:
......@@ -83,21 +90,21 @@ class TextArchive:
self.fh = open(self.file_path, mode)
def add_data(self, data):
def add_data(self, data) -> None:
self.fh.write(data.encode("UTF-8") + b"\n")
def commit(self):
def commit(self) -> None:
self.fh.flush()
self.fh.close()
class TextReader:
def __init__(self, file_path):
def __init__(self, file_path) -> None:
self.file_path = file_path
# Optimized mmap read with infrequent tqdm updates to maintain speed
# Tested up to 250MB/s.
def read_tqdm(self, update_frequency=10000):
def read_tqdm(self, update_frequency: int = 10000):
current_file_position = 0
line_counter = 0
with open(self.file_path, "r") as fh, tqdm.tqdm(
......@@ -149,7 +156,7 @@ class TextReader:
# Optimized for speed. Decompresses the archive in shell before
# using the mmap'd TextReader.
class ZStdTextReader:
def __init__(self, file):
def __init__(self, file) -> None:
self.file = file
def read_tqdm(self):
......
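A quick round-trip sketch of the `Archive`/`Reader` pair shown above (the module path and file names are assumptions):
```python
from lm_eval.decontamination.archiver import Archive, Reader  # module path assumed

# write zstd-compressed jsonl
archive = Archive("data/example.jsonl.zst")
archive.add_data("some training text", meta={"source": "example"})
archive.commit()

# read it back; Reader.read yields the "text" field of each record
reader = Reader()
for text in reader.read("data/example.jsonl.zst"):
    print(text)
```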
......@@ -11,7 +11,7 @@ from .archiver import ZStdTextReader
# Was used for testing the evaluator decoupled from the full logic below
def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
def get_train_overlap_stub(docs: dict, ngrams_path: str, ngrams_n_size: str):
simulated_overlap = 0.1
contaminated = int(len(docs) * simulated_overlap)
return random.sample(range(len(docs)), contaminated)
......@@ -25,6 +25,7 @@ def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
# scripts are an info.json file containing the n_gram_size (13) and a bunch of "ngrams_{x}.bkt.txt.sorted.zst"
# files. These should exist in the "ngrams_path" provided to this function.
# Algorithm:
# 1. Build lookups for each dataset {ngram: list(document_ids)}
# 2. Merge into an overall lookup {ngram: [(task_name, task_set, doc_ids),]}
......@@ -33,7 +34,7 @@ def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
# 4. Strip the task_set from the dictionary keys and return
#
# We cache the task+set lookups as well as the overlaps.
def get_train_overlap(docs_by_task_set, ngrams_path, limit):
def get_train_overlap(docs_by_task_set: dict, ngrams_path: str, limit: int) -> dict:
# return get_train_overlap_stub(docs, ngrams_path, ngrams_n_size)
info_dict_path = os.path.join(ngrams_path, "info.json")
......@@ -46,7 +47,7 @@ def get_train_overlap(docs_by_task_set, ngrams_path, limit):
print("Building Lookups...")
start = time.perf_counter()
def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit):
def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit) -> str:
return f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.overlaps"
lookups = {}
......
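A toy sketch of step 1 of the algorithm described above, building a `{ngram: [document_ids]}` lookup for one task split; the helper below inlines a simplified 13-gram splitter rather than importing the janitor utilities:
```python
from collections import defaultdict


def simple_word_ngrams(s, n=13):
    # simplified stand-in for janitor.word_ngrams: whitespace tokens, joined n-grams
    toks = s.split()
    return (" ".join(toks[i : i + n]) for i in range(max(len(toks) - n + 1, 1)))


def build_lookup(docs):
    """docs: list of document strings for one (task_name, task_set)."""
    lookup = defaultdict(list)
    for doc_id, doc in enumerate(docs):
        for ngram in simple_word_ngrams(doc):
            lookup[ngram].append(doc_id)
    return lookup  # step 2 merges these into {ngram: [(task_name, task_set, doc_ids)]}
```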
import re
import string
import timeit
import pickle
import traceback
from pprint import pprint
from typing import Iterator, Sequence, TypeVar
# This is a cpp module. Compile janitor_util.cpp with:
# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
......@@ -16,10 +16,12 @@ except Exception:
traceback.print_exc()
JANITOR_CPP = False
T = TypeVar("T")
# Implementation from nltk source
# https://www.nltk.org/_modules/nltk/util.html
def form_ngrams(sequence, n):
def form_ngrams(sequence: Iterator[T], n: int) -> Iterator[tuple[T, ...]]:
history = []
while n > 1:
# PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
......@@ -36,7 +38,7 @@ def form_ngrams(sequence, n):
del history[0]
def word_ngrams(s, n):
def word_ngrams(s: str, n: int) -> Iterator[str]:
"""Splits a string into ngram words"""
tokens = s.split() # not a generator :(
ngram_seqs = form_ngrams(iter(tokens), n)
......@@ -68,14 +70,14 @@ def word_ngrams(s, n):
# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
def split_indices(s):
def split_indices(s: str) -> Iterator[tuple[str, tuple[int, int]]]:
"""Splits a string on whitespaces and records the indices of each in the original string.
@:return generator((word, (start_idx, end_idx)), ...)
"""
return ((m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r"\S+", s))
def word_ngrams_indices(s, n):
def word_ngrams_indices(s: str, n: int) -> Iterator[tuple[str, tuple[int, int]]]:
"""Splits a string into pairs of (ngram words, their start/end indices)"""
tokens_with_indices = split_indices(s)
......@@ -104,16 +106,15 @@ def word_ngrams_indices(s, n):
class Janitor:
# FIXME delete_chars: Should anything else go here? Special chars?
def __init__(
self,
ngram_n=13,
window_to_remove=200,
too_dirty_cutoff=10,
minimum_slice_length=200,
delete_chars=string.punctuation,
):
ngram_n: int = 13,
window_to_remove: int = 200,
too_dirty_cutoff: int = 10,
minimum_slice_length: int = 200,
delete_chars: str = string.punctuation,
) -> None:
self.ngram_n = ngram_n
self.window_to_remove = window_to_remove
self.too_dirty_cutoff = too_dirty_cutoff
......@@ -135,11 +136,11 @@ class Janitor:
# I/O for saving contamination ngrams
##############
def save_contamination_ngrams(self, filename):
def save_contamination_ngrams(self, filename: str) -> None:
with open(filename, "wb") as fp:
pickle.dump(filename, fp)
def load_contamination_ngrams(self, filename):
def load_contamination_ngrams(self, filename: str) -> None:
with open(filename, "rb") as fp:
self.dirt_ngrams = pickle.load(fp)
......@@ -147,7 +148,7 @@ class Janitor:
# Call these :)
##############
def register_contaminant(self, dirt_string):
def register_contaminant(self, dirt_string: str) -> None:
"""Register a string as contamination to be removed, e.g. a test set
This breaks the dirt_string into ngrams to store for future cleaning"""
if JANITOR_CPP:
......@@ -156,7 +157,7 @@ class Janitor:
print("WARNING: Janitor running in python mode")
return self.register_contaminant_python(dirt_string)
def clean(self, dirty_string):
def clean(self, dirty_string: str) -> list[str]:
"""Clean a string (e.g. a training set) by removing all ngrams previously
registered as contaminants. Returns a list of clean chunks, or empty if
the string was too dirty"""
......@@ -166,7 +167,9 @@ class Janitor:
print("WARNING: Janitor running in python mode")
return self.clean_python(dirty_string)
def _split_chunks(self, dirty_string, dirty_parts):
def _split_chunks(
self, dirty_string: str, dirty_parts: Sequence[tuple]
) -> list[str]:
clean_chunks = []
splice_idx = 0
end = -1
......@@ -189,12 +192,12 @@ class Janitor:
# Fast C++
##############
def register_contaminant_cpp(self, dirt_string):
def register_contaminant_cpp(self, dirt_string) -> None:
self.dirt_ngrams.update(
janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n)
)
def clean_cpp(self, dirty_string):
def clean_cpp(self, dirty_string: str) -> list[str]:
contamination_indices = janitor_util.clean_ngram_with_indices(
dirty_string, self.delete_chars, self.ngram_n
)
......@@ -204,15 +207,15 @@ class Janitor:
# Slow python
##############
def normalize_string(self, s):
def normalize_string(self, s: str) -> str:
return s.translate(self.translation_table)
def register_contaminant_python(self, dirt_string):
def register_contaminant_python(self, dirt_string: str) -> None:
self.dirt_ngrams.update(
word_ngrams(self.normalize_string(dirt_string), self.ngram_n)
)
def clean_python(self, dirty_string):
def clean_python(self, dirty_string: str) -> list[str]:
contamination_indices = (
(None, *idx_pair)
for dirty_ngram, idx_pair in word_ngrams_indices(dirty_string, self.ngram_n)
......
......@@ -42,11 +42,11 @@ def simple_evaluate(
device=None,
use_cache=None,
limit=None,
bootstrap_iters=100000,
check_integrity=False,
bootstrap_iters: int = 100000,
check_integrity: bool = False,
decontamination_ngrams_path=None,
write_out=False,
log_samples=True,
write_out: bool = False,
log_samples: bool = True,
):
"""Instantiate and evaluate a model on a list of tasks.
......@@ -117,10 +117,11 @@ def simple_evaluate(
task_dict = lm_eval.tasks.get_task_dict(tasks)
for task_name in task_dict.keys():
task_obj = task_dict[task_name]
if type(task_obj) == tuple:
group, task_obj = task_obj
if task_obj is None:
continue
config = task_obj._config
if num_fewshot is not None:
......@@ -175,17 +176,17 @@ def evaluate(
lm,
task_dict,
limit=None,
bootstrap_iters=100000,
bootstrap_iters: int = 100000,
decontamination_ngrams_path=None,
write_out=False,
log_samples=True,
write_out: bool = False,
log_samples: bool = True,
):
"""Instantiate and evaluate a model on a list of tasks.
:param lm: obj
Language Model
:param task_dict: dict[str, Task]
Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
Dictionary of tasks. Tasks will be taken to have name type(task).config.task .
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
......@@ -210,24 +211,30 @@ def evaluate(
samples = collections.defaultdict(list)
# tracks all Instances/requests a model must generate output on.
requests = collections.defaultdict(list)
# Stores task scores based on task grouping.
aggregate = collections.defaultdict(dict)
# tracks if a task was chosen via user selecting a group containing it
task_groups = collections.defaultdict(dict)
# Aggregated task scores presented with groups
results_agg = collections.defaultdict(dict)
# Aggregated groups scores only
groups_agg = collections.defaultdict(dict)
# stores the amount to pad out reqs per req. type so that
# number of fwd passes per distributed rank is equal
padding_requests = collections.defaultdict(int)
# Stores group related keys and values for group-aggregation
task_groups = collections.defaultdict(dict)
# store the hierarchy to do proper ordering
task_hierarchy = collections.defaultdict(list)
# store the ordering of tasks and groups
task_order = collections.defaultdict(int)
# store the aggregation for aggregating across tasks in the same group
sample_agg_fn = collections.defaultdict(dict)
# get lists of each type of request
for task_name, task in task_dict.items():
if type(task) == tuple:
group, task = task
task_groups[task_name] = group
aggregate[task_name] = {}
group_name, task = task
task_hierarchy[group_name].append(task_name)
else:
task_hierarchy[task_name] = []
if task is None:
continue
versions[task_name] = task.VERSION
configs[task_name] = dict(task.dump_config())
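To make the new bookkeeping concrete, here is a self-contained toy run of the (group, task) unpacking above. The group and task names are invented and object() stands in for real Task instances.

# Standalone illustration of how task_hierarchy is populated; names are invented.
import collections

task_dict = {
    "arc_easy": ("ai2_arc", object()),        # grouped task -> (group, task) tuple
    "arc_challenge": ("ai2_arc", object()),
    "lambada_openai": object(),               # ungrouped task -> bare object
}

task_hierarchy = collections.defaultdict(list)
for task_name, task in task_dict.items():
    if type(task) == tuple:
        group_name, task = task
        task_hierarchy[group_name].append(task_name)
    else:
        task_hierarchy[task_name] = []
    if task is None:
        continue  # group placeholder entries carry no task object

print(dict(task_hierarchy))
# {'ai2_arc': ['arc_easy', 'arc_challenge'], 'lambada_openai': []}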
@@ -252,7 +259,8 @@ def evaluate(
# print the prompt for the first few documents
if inst.doc_id < 1:
eval_logger.info(
f"Task: {task_name}; document {inst.doc_id}; context prompt (starting on next line):\n{inst.args[0]}\n(end of prompt on previous line)"
f"Task: {task_name}; document {inst.doc_id}; context prompt (starting on next line):\
\n{inst.args[0]}\n(end of prompt on previous line)\ntarget string or answer choice index (starting on next line):\n{task.doc_to_target(inst.doc)}\n(end of target on previous line)"
)
eval_logger.info(f"Request: {str(inst)}")
@@ -302,6 +310,8 @@ def evaluate(
for task_name, task in task_dict.items():
if type(task) == tuple:
group, task = task
if task is None:
continue
task.apply_filters()
### Collect values of metrics on all datapoints ###
@@ -311,6 +321,8 @@ def evaluate(
for task_name, task in task_dict.items():
if type(task) == tuple:
group, task = task
if task is None:
continue
# TODO: make it possible to use a different metric per filter
# iterate over different filters used
for key in task.instances[0].filtered_resps.keys():
@@ -349,7 +361,6 @@ def evaluate(
# if multigpu, then gather data across all ranks
# first gather logged samples across all ranks
for task_name, task_samples in list(samples.items()):
full_samples = [None] * lm.world_size
torch.distributed.all_gather_object(full_samples, task_samples)
@@ -358,33 +369,39 @@ def evaluate(
# then collect metrics across all ranks
vals_torch = collections.defaultdict(list)
for (task_name, key, metric), items in vals.items():
numitem = 0
if type(items[0]) == tuple:
numitem = len(items[0])
# distributed gather requires all ranks to have same dimensions
# so we pad out with float32 min value
pad_value = torch.finfo(torch.float32).min
metrics_tensor = torch.tensor(items, device=lm.device)
original_dtype = metrics_tensor.dtype # store original dtype
torch_device_tensor = lm.accelerator.pad_across_processes(
metrics_tensor.to(torch.float32), pad_index=pad_value
)
gathered_item = lm.accelerator.gather(torch_device_tensor)
if isinstance(items[0], (str, list)):
# handle the string case
gathered_items = [None] * lm.accelerator.num_processes
torch.distributed.all_gather_object(gathered_items, items)
if numitem > 0:
gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
gathered_item = list(itertools.chain.from_iterable(gathered_items))
else:
gathered_filtered = gathered_item[gathered_item != pad_value]
# distributed gather requires all ranks to have same dimensions
# so we pad out with float32 min value
pad_value = torch.finfo(torch.float32).min
metrics_tensor = torch.tensor(items, device=lm.device)
original_dtype = metrics_tensor.dtype # store original dtype
torch_device_tensor = lm.accelerator.pad_across_processes(
metrics_tensor.to(torch.float32), pad_index=pad_value
)
gathered_item = lm.accelerator.gather(torch_device_tensor)
gathered_item = (
gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
)
# reconvert if we were passed a tuple of values
if numitem > 0:
gathered_item = [tuple(g) for g in gathered_item]
if numitem > 0:
gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
else:
gathered_filtered = gathered_item[gathered_item != pad_value]
gathered_item = (
gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
)
# reconvert if we were passed a tuple of values
if numitem > 0:
gathered_item = [tuple(g) for g in gathered_item]
if lm.rank == 0:
vals_torch[(task_name, key, metric)] = gathered_item
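The gather path above pads per-rank metric tensors to a common length with the float32 minimum and strips that padding again after gathering. Below is a single-process emulation of just the pad-and-filter step; plain torch ops stand in for accelerator.pad_across_processes and accelerator.gather, so it runs standalone.

# Single-process emulation of the padding/filtering around the gather; the
# per-rank tensors are invented and plain torch ops replace the accelerate calls.
import torch

pad_value = torch.finfo(torch.float32).min

rank0 = torch.tensor([1.0, 0.0, 1.0])   # pretend rank 0 scored 3 samples
rank1 = torch.tensor([1.0, 1.0])        # pretend rank 1 scored 2 samples

# pad_across_processes pads every rank to the longest length; emulate it here
max_len = max(rank0.numel(), rank1.numel())
padded = torch.stack(
    [torch.nn.functional.pad(t, (0, max_len - t.numel()), value=pad_value)
     for t in (rank0, rank1)]
)

# gather concatenates the per-rank tensors; emulate it with a flatten
gathered = padded.reshape(-1)

# strip the padding, as done above after gathering
filtered = gathered[gathered != pad_value]
print(filtered.tolist())  # [1.0, 0.0, 1.0, 1.0, 1.0]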
@@ -392,31 +409,68 @@ def evaluate(
vals = vals_torch
if lm.rank == 0:
### Get task ordering for correct sample-wide aggregation
group_to_task = {}
for group in task_hierarchy.keys():
if group not in task_order:
task_order[group] = 0
if len(task_hierarchy[group]) > 0:
group_to_task[group] = task_hierarchy[group].copy()
for task in task_hierarchy[group]:
if task in task_order:
task_order[task] += 1
else:
task_order[task] = 1 + task_order[group]
if task in task_hierarchy:
group_to_task[group].remove(task)
group_to_task[group].extend(task_hierarchy[task])
task_to_group = {}
for group in group_to_task:
for task in group_to_task[group]:
if task in task_to_group:
task_to_group[task].append(group)
else:
task_to_group[task] = [group]
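A toy, self-contained rerun of the flattening above with an invented nested grouping, showing how a sub-group is replaced by its member tasks and how task_to_group inverts the result.

# Invented hierarchy: "multilingual" contains one task and one nested group.
# Only groups appear as keys, mirroring how task_hierarchy is built earlier.
task_hierarchy = {
    "multilingual": ["xnli", "lambada_multilingual"],
    "lambada_multilingual": ["lambada_de", "lambada_fr"],
}

group_to_task = {}
for group, children in task_hierarchy.items():
    group_to_task[group] = children.copy()
    for task in children:
        if task in task_hierarchy:  # nested group: splice in its member tasks
            group_to_task[group].remove(task)
            group_to_task[group].extend(task_hierarchy[task])

task_to_group = {}
for group, tasks in group_to_task.items():
    for task in tasks:
        task_to_group.setdefault(task, []).append(group)

print(group_to_task)
# {'multilingual': ['xnli', 'lambada_de', 'lambada_fr'],
#  'lambada_multilingual': ['lambada_de', 'lambada_fr']}
print(task_to_group)
# {'xnli': ['multilingual'],
#  'lambada_de': ['multilingual', 'lambada_multilingual'],
#  'lambada_fr': ['multilingual', 'lambada_multilingual']}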
### Aggregate results over all datapoints ###
# aggregate results ; run bootstrap CIs
for (task_name, key, metric), items in vals.items():
task = task_dict[task_name]
metric_key = metric + "," + key
if type(task) == tuple:
group, task = task
task_score = task.aggregation()[metric](items)
results[task_name][metric + "," + key] = task_score
# Need to put back in results
# pythia | acc
# | perplexity
# | word_perplexity
# | byte_perplexity
# | bits_per_byte
if task_name in task_groups:
group_name = task_groups[task_name]
if metric in list(aggregate[group_name].keys()):
aggregate[group_name][metric].append(task_score)
else:
aggregate[group_name][metric] = [task_score]
group_name, task = task
else:
group_name = None
agg_fn = task.aggregation()[metric]
task_score = agg_fn(items)
if group_name is not None:
sample_metric_key = metric + "(sample agg)," + key
for grouping in task_to_group[task_name]:
if metric_key in results[grouping]:
results[grouping][metric_key].append(task_score)
else:
results[grouping][metric_key] = [task_score]
if sample_metric_key in results[grouping]:
results[grouping][sample_metric_key] += items
else:
results[grouping][sample_metric_key] = items.copy()
sample_agg_fn[grouping][sample_metric_key] = agg_fn
results[task_name][metric_key] = task_score
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them for fewer iterations. Still looking for a cleaner way to do this
if bootstrap_iters > 0:
if False: # bootstrap_iters > 0:
stderr = lm_eval.api.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
bootstrap_iters=min(bootstrap_iters, 1000)
@@ -427,19 +481,38 @@ def evaluate(
if stderr is not None:
results[task_name][metric + "_stderr" + "," + key] = stderr(items)
if bool(aggregate):
for group in aggregate.keys():
for metric in aggregate[group].keys():
aggregate[group][metric] = np.average(aggregate[group][metric])
versions[group] = "N/A"
if bool(results):
for task_or_group in results.keys():
for metric in results[task_or_group].keys():
if type(results[task_or_group][metric]) == list:
if "(sample agg)" in metric:
results[task_or_group][metric] = sample_agg_fn[
task_or_group
][metric](results[task_or_group][metric])
else:
results[task_or_group][metric] = np.average(
results[task_or_group][metric]
)
versions[task_or_group] = "N/A"
for task_name, task in task_dict.items():
if type(task) == tuple:
group_name, task = task
order = task_order[group_name]
tabbed_name = "-" * order + group_name
results_agg[tabbed_name] = results[group_name]
versions[tabbed_name] = versions[group_name]
if order == 0:
groups_agg[group_name] = results[group_name]
order = task_order[task_name]
tabbed_name = "-" * order + task_name
results_agg[tabbed_name] = results[task_name]
versions[tabbed_name] = versions[task_name]
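The "-" prefixes are purely presentational: sub-tasks are indented one level per unit of task_order when the results table is printed. A tiny illustration with invented names and depths:

# Invented task_order depths; deeper entries get one more "-" of indentation.
task_order = {"ai2_arc": 0, "arc_easy": 1, "arc_challenge": 1}
for name, order in task_order.items():
    print("-" * order + name)
# ai2_arc
# -arc_easy
# -arc_challenge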
results_dict = {
"results": dict(sorted(results.items())),
**(
{"aggregate": dict(sorted(aggregate.items()))}
if bool(aggregate)
else {}
),
"results": dict(results_agg.items()),
**({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
"configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())),
}
......
@@ -17,14 +17,16 @@ FILTER_REGISTRY = {
def get_filter(filter_name):
return FILTER_REGISTRY[filter_name]
if filter_name in FILTER_REGISTRY:
return FILTER_REGISTRY[filter_name]
else:
return filter_name
def build_filter_ensemble(filter_name, components):
"""
Create a filtering pipeline.
"""
filters = []
for (function, kwargs) in components:
if kwargs is None:
......
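Finally, a hedged, standalone sketch of the new get_filter behaviour: registered names resolve through FILTER_REGISTRY, and anything unknown, such as a user-supplied callable, is now passed through unchanged. The registry contents below are stand-ins, not the harness's real entries.

# Stand-in registry; the real FILTER_REGISTRY maps filter names to filter classes.
FILTER_REGISTRY = {"lowercase": str.lower}

def get_filter(filter_name):
    if filter_name in FILTER_REGISTRY:
        return FILTER_REGISTRY[filter_name]
    else:
        return filter_name  # unknown objects (e.g. callables) pass straight through

print(get_filter("lowercase"))   # <method 'lower' of 'str' objects>
print(get_filter(len))           # <built-in function len>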