Merge branch 'big-refactor' of...

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into flan-benchmark

Merge branch 'big-refactor' of...
Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into flan-benchmark
337492ad · lintangsutawika · 3d2ee4d4 · 4824a832 · 337492ad · 337492ad
Commit 337492ad authored Oct 05, 2023 by lintangsutawika
20 changed files
--- a/.github/workflows/new_tasks.yml
+++ b/.github/workflows/new_tasks.yml
-# name: Tasks Modified
+name: Tasks Modified
-# on:
+on:
-#   push:
+  push:
-#     branches:
+    branches:
-#       - 'big-refactor*'
+      - 'big-refactor*'
-#   pull_request:
+  pull_request:
-#     branches:
+    branches:
-#       - 'big-refactor*'
+      - 'big-refactor*'
-#   workflow_dispatch:
+  workflow_dispatch:
-# # comment/edit out the above to stop/change the triggers
+# comment/edit out the above to stop/change the triggers
-# jobs:
+jobs:
-#   changed_files:
+  changed_files:
-#     runs-on: ubuntu-latest  # windows-latest || macos-latest
+    runs-on: ubuntu-latest  # windows-latest || macos-latest
-#     timeout-minutes: 120
+    timeout-minutes: 120
-#     name: Scan for changed tasks
+    name: Scan for changed tasks
-#     steps:
+    steps:
-#       - name: checkout
+      - name: checkout
-#         uses: actions/checkout@v3
+        uses: actions/checkout@v3
-#         with:
+        with:
-#           fetch-depth: 2  # OR "2" -> To retrieve the preceding commit.
+          fetch-depth: 2  # OR "2" -> To retrieve the preceding commit.
-#       # Uses the tj-actions/changed-files@v37 action to check for changes.
+      # Uses the tj-actions/changed-files@v37 action to check for changes.
-#       # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
+      # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
-#       # The `files_yaml` input optionally takes a yaml string to specify filters,
+      # The `files_yaml` input optionally takes a yaml string to specify filters,
-#       # and prepends the filter name to the standard output names.
+      # and prepends the filter name to the standard output names.
-#       - name: Check task folders
+      - name: Check task folders
-#         id: changed-tasks
+        id: changed-tasks
-#         uses: tj-actions/changed-files@v37.1.2
+        uses: tj-actions/changed-files@v37.1.2
-#         with:
+        with:
-#           # tasks checks the tasks folder and api checks the api folder for changes
+          # tasks checks the tasks folder and api checks the api folder for changes
-#           files_yaml: |
+          files_yaml: |
-#             tasks:
+            tasks:
-#               - lm_eval/tasks/**
+              - lm_eval/tasks/**
-#             api:
+            api:
-#               - lm_eval/api/**
+              - lm_eval/api/**
-#           write_output_files: true
+          write_output_files: true
-#     # The next step is optional; the files are written to the workspace by default (above).
+    # The next step is optional; the files are written to the workspace by default (above).
-#     # so it's just for debugging
+    # so it's just for debugging
-#       - name: Run Tests
+      - name: Run Tests
-#         if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
+        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
-#         run: |
+        run: |
-#           echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV'
+          echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV'
-#           echo "One or more test file(s) has changed."
+          echo "One or more test file(s) has changed."
-#           echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
+          echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
-#       - name: Set up Python 3.9
+      - name: Set up Python 3.9
-#         if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
+        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
-#         uses: actions/setup-python@v4
+        uses: actions/setup-python@v4
-#         with:
+        with:
-#           python-version: 3.9
+          python-version: 3.9
-#           cache: 'pip'
+          cache: 'pip'
-#           cache-dependency-path: setup.py
+          cache-dependency-path: setup.py
-#       - name: Install dependencies
+      - name: Install dependencies
-#         if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
+        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
-#         run: |
+        run: |
-#             python -m pip install --upgrade pip
+            python -m pip install --upgrade pip
-#             pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
+            pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
-#     #   Install optional git dependencies
+    #   Install optional git dependencies
-#     #       pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
+    #       pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
-#     #       if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    #       if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-#       - name: Test with pytest
+      - name: Test with pytest
-#         # if new tasks are added, run tests on them
+        # if new tasks are added, run tests on them
-#         if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
+        if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
-#         run: python -m pytest tests/test_tasks.py -s -vv
+        run: python -m pytest tests/test_tasks.py -s -vv
-#         # if api is modified, run tests on it
+        # if api is modified, run tests on it
-#       - name: Test more tasks with pytest
+      - name: Test more tasks with pytest
-#         env:
+        env:
-#           API: true
+          API: true
-#         if: steps.changed-tasks.outputs.api_any_modified == 'true'
+        if: steps.changed-tasks.outputs.api_any_modified == 'true'
-#         run: python -m pytest tests/test_tasks.py -s -vv
+        run: python -m pytest tests/test_tasks.py -s -vv
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -22,10 +22,10 @@ jobs:
    steps:
    - name: Checkout Code
      uses: actions/checkout@v3
-    - name: Set up Python 3.9
+    - name: Set up Python 3.8
      uses: actions/setup-python@v4
      with:
-        python-version: 3.9
+        python-version: 3.8
        cache: pip
        cache-dependency-path: setup.py
    - name: Install dependencies
@@ -43,35 +43,35 @@ jobs:
 #       # mypy turned off for now
 #    - name: Lint with mypy
 #      run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
-# Job 2
+# Job 2
-#   testcpu:
+  testcpu:
-#     name: CPU Tests
+    name: CPU Tests
-#     runs-on: ubuntu-latest
+    runs-on: ubuntu-latest
-#     strategy:
+    strategy:
-#       matrix:
+      matrix:
-#         python-version: [ "3.8", "3.9", "3.10", "3.11" ]
+        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
-#     timeout-minutes: 30
+    timeout-minutes: 30
-#     steps:
+    steps:
-#     - name: Checkout Code
+    - name: Checkout Code
-#       uses: actions/checkout@v3
+      uses: actions/checkout@v3
-#     - name: Set up Python ${{ matrix.python-version }}
+    - name: Set up Python ${{ matrix.python-version }}
-#       uses: actions/setup-python@v4
+      uses: actions/setup-python@v4
-#       with:
+      with:
-#         python-version: ${{ matrix.python-version }}
+        python-version: ${{ matrix.python-version }}
-#         cache: pip
+        cache: pip
-#         cache-dependency-path: setup.py
+        cache-dependency-path: setup.py
-#     - name: Install dependencies
+    - name: Install dependencies
-#       run: |
+      run: |
-#         python -m pip install --upgrade pip
+        python -m pip install --upgrade pip
-#         pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
-# #         Install optional git dependencies
+#         Install optional git dependencies
-# #                pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
+#                pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
-# #        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+#        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-#     - name: Test with pytest
+    - name: Test with pytest
-#       run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/tests_master --ignore=tests/extra
+      run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/tests_master --ignore=tests/extra
-#     - name: Archive artifacts
+    - name: Archive artifacts
-#       uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v3
-#       with:
+      with:
-#         name: output_results
+        name: output_results
-#         path: |
+        path: |
-#           test_logs/*
+          test_logs/*
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -33,7 +33,7 @@ repos:
    rev: 22.3.0
    hooks:
      - id: black
-        language_version: python3.9
+        language_version: python3.8
  - repo: https://github.com/codespell-project/codespell
    rev: v2.1.0
    hooks:

--- a/README.md
+++ b/README.md
@@ -9,8 +9,8 @@ We’d like your help to test it out! you can help by:
 2. Porting tasks supported in the previous version of the harness to the new YAML configuration format. Please check out our [task implementation guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md) for more information.
 If you choose to port a task not yet completed according to [our checklist](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/README.md), then you can contribute it by opening a PR containing [Refactor] in the name with:
- A command of the form `python main.py --model hf --model_args ..... --tasks <task name> ...` which will run the task in the `master` branch, and what the score is
+- A command of the form `python -m lm_eval --model hf --model_args ..... --tasks <task name> ...` which will run the task in the `master` branch, and what the score is
- A command of the form `python main.py --model hf --model_args ..... --tasks <task name> ...` to run the task in your PR branch to `big-refactor`, and what the resulting score is, to show that we achieve equality between the two implementations.
+- A command of the form `python -m lm_eval --model hf --model_args ..... --tasks <task name> ...` to run the task in your PR branch to `big-refactor`, and what the resulting score is, to show that we achieve equality between the two implementations.
 Lastly, we'll no longer be accepting new feature requests beyond those that are already open to the master branch as we carry out this switch to the new version over the next week, though we will be accepting bugfixes to `master` branch and PRs to `big-refactor`. Feel free to reach out in the #lm-thunderdome channel of the EAI discord for more information.
@@ -67,7 +67,7 @@ To evaluate a model hosted on the [HuggingFace Hub](https://huggingface.co/model
 ```bash
-python main.py \
+python -m lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/gpt-j-6B \
    --tasks hellaswag \
@@ -78,7 +78,7 @@ python main.py \
 Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partially trained checkpoints, or to specify the datatype for running a model:
 ```bash
-python main.py \
+python -m lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \
    --tasks lambada_openai,hellaswag \
@@ -91,7 +91,7 @@ Models that are loaded via either `transformers.AutoModelForCausalLM` (autoregre
 Batch size selection can be automated by setting the  ```--batch_size``` flag to ```auto```. This will perform automatic detection of the largest batch size that will fit on your device. On tasks where there is a large difference between the longest and shortest example, it can be helpful to periodically recompute the largest batch size, to gain a further speedup. To do this, append ```:N``` to above flag to automatically recompute the largest batch size ```N``` times. For example, to recompute the batch size 4 times, the command would be:
 ```bash
-python main.py \
+python -m lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \
    --tasks lambada_openai,hellaswag \
@@ -99,7 +99,7 @@ python main.py \
    --batch_size auto:4
 ```
-Alternatively, you can use `lm-eval` instead of `python main.py` to call lm eval from anywhere.
+Alternatively, you can use `lm-eval` or `lm_eval` instead of `python -m lm_eval` to call lm eval from anywhere.
 ### Multi-GPU Evaluation with Hugging Face `accelerate`
@@ -108,7 +108,7 @@ To parallelize evaluation of HuggingFace models across multiple GPUs, we allow f
 The first is performed by launching evaluation via the `accelerate` library as follows:
 ```
-accelerate launch main.py \
+accelerate launch -m lm_eval \
    --model hf \
    --tasks lambada_openai,arc_easy \
    --batch_size 16 \
@@ -121,7 +121,7 @@ If your model *is too large to be run on a single one of your GPUs* then you
 We also provide a second method to run these large models: use of the `parallelize` argument.
 ```
-python main.py \
+python -m lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-12b,parallelize=True
    --tasks lambada_openai,arc_easy \
@@ -136,7 +136,7 @@ To pass even more advanced keyword arguments to `accelerate`, we allow for the f
 Note that this method naively splits models across GPUs, resulting in only a single GPU performing work at any point in time, and so is much slower than launching with `accelerate launch`, possibly by a factor of the total # of GPUs.
-**Note that this option requires launching evaluation via `python main.py` rather than `accelerate launch main.py`.**
+**Note that this option requires launching evaluation via `python -m lm_eval` rather than `accelerate launch -m lm_eval`.**
 To use `accelerate` with the `lm-eval` command, use
 ```
@@ -167,7 +167,7 @@ Our library supports language models served via the OpenAI Completions API as fo
 ```bash
 export OPENAI_API_SECRET_KEY=YOUR_KEY_HERE
-python main.py \
+python -m lm_eval \
    --model openai-completions \
    --model_args engine=davinci \
    --tasks lambada_openai,hellaswag
@@ -198,7 +198,7 @@ This will write out one text file for each task.
 To verify the data integrity of the tasks you're performing in addition to running the tasks themselves, you can use the `--check_integrity` flag:
 ```bash
-python main.py \
+python -m lm_eval \
    --model openai \
    --model_args engine=davinci \
    --tasks lambada_openai,hellaswag \
@@ -209,7 +209,7 @@ python main.py \
 For models loaded with the HuggingFace  `transformers` library, any arguments provided via `--model_args` get passed to the relevant constructor directly. This means that anything you can do with `AutoModel` can be done with our library. For example, you can pass a local path via `pretrained=` or use models finetuned with [PEFT](https://github.com/huggingface/peft) by taking the call you would run to evaluate the base model and add `,peft=PATH` to the `model_args` argument:
 ```bash
-python main.py \
+python -m lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/gpt-j-6b,parallelize=True,load_in_4bit=True,peft=nomic-ai/gpt4all-j-lora \
    --tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \
@@ -219,7 +219,7 @@ python main.py \
 [GPTQ](https://github.com/PanQiWei/AutoGPTQ) quantized models can be loaded by specifying their file names in `,gptq=NAME` (or `,gptq=True` for default names) in the `model_args` argument:
 ```bash
-python main.py \
+python -m lm_eval \
    --model hf \
    --model_args pretrained=model-name-or-path,gptq=model.safetensors,gptq_use_triton=True \
    --tasks hellaswag

--- a/docs/decontamination.md
+++ b/docs/decontamination.md
@@ -2,11 +2,11 @@
 ## Usage
-Simply add a "--decontamination_ngrams_path" when running main.py. The provided directory should contain
+Simply add a "--decontamination_ngrams_path" argument when running `__main__.py`. The provided directory should contain
 the ngram files and info.json produced in "Pile Ngram Generation" further down.
 ```bash
-python main.py \
+python -m lm_eval \
    --model gpt2 \
    --device 0 \
    --tasks sciq \

--- a/docs/interface.md
+++ b/docs/interface.md
@@ -4,7 +4,7 @@ This document details the interface exposed by `lm-eval` and provides details on
 ## Command-line Interface
-A majority of users run the library by cloning it from Github and running the `main.py` script.
+A majority of users run the library by cloning it from GitHub, installing the package as editable, and running `python -m lm_eval`.
 Equivalently, running the library can be done via the `lm-eval` entrypoint at the command line.

--- a/docs/model_guide.md
+++ b/docs/model_guide.md
@@ -70,9 +70,9 @@ smth smth tokenizer-agnostic
 Congrats on implementing your model! Now it's time to test it out.
-To make your model usable via the command line interface to `lm-eval` using `main.py`, you'll need to tell `lm-eval` what your model's name is.
+To make your model usable via the command line interface to `lm-eval` using `python -m lm_eval`, you'll need to tell `lm-eval` what your model's name is.
-This is done via a *decorator*, `lm_eval.api.registry.register_model`. Using `register_model()`, one can both tell the package what the model's name(s) to be used are when invoking it with `python main.py --model <name>` and alert `lm-eval` to the model's existence.
+This is done via a *decorator*, `lm_eval.api.registry.register_model`. Using `register_model()`, one can both tell the package what the model's name(s) to be used are when invoking it with `python -m lm_eval --model <name>` and alert `lm-eval` to the model's existence.
 ```python
 from lm_eval.api.registry import register_model

--- a/docs/new_task_guide.md
+++ b/docs/new_task_guide.md
@@ -214,7 +214,7 @@ metric_list:
 ```
 `aggregation` and `higher_is_better` can optionally be left out to default to the manually-set defaults if using a natively supported metric, otherwise it must be defined explicitly (for example, when using a custom metric implemented as a function).
-For a full list of natively supported metrics and aggregation functions see `docs/advanced_task_guide.md`. All metrics supported in [HuggingFace Evaluate](https://github.com/huggingface/evaluate/tree/main/metrics) can also be used, and will be loaded if a given metric name is not one natively supported in `lm-eval`.
+For a full list of natively supported metrics and aggregation functions see `docs/advanced_task_guide.md`. All metrics supported in [HuggingFace Evaluate](https://github.com/huggingface/evaluate/tree/main/metrics) can also be used: a metric is loaded from HuggingFace Evaluate when its name is not natively supported in `lm-eval`, or when `hf_evaluate` is set to `true` for that metric.
 ### Optional, More Advanced Setup
@@ -258,7 +258,7 @@ You can do this via adding the Python snippet
 from lm_eval.tasks import include_task_folder
 include_task_folder("/path/to/yaml/parent/folder")
 ```
-to the top of any Python file that is run or imported when performing evaluation, such as `main.py`.
+to the top of any Python file that is run or imported when performing evaluation, such as `__main__.py`.
 Passing `--tasks /path/to/yaml/file` is also accepted.

--- a/main.py
+++ b/main.py
@@ -12,10 +12,10 @@ from lm_eval.api.registry import ALL_TASKS
 from lm_eval.logger import eval_logger, SPACING
 from lm_eval.tasks import include_path
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
+from typing import Union
-def parse_args() -> argparse.Namespace:
+def parse_eval_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("--model", required=True, help="Name of model e.g. `hf`")
    parser.add_argument(
@@ -100,8 +100,13 @@ def parse_args() -> argparse.Namespace:
    return parser.parse_args()
-def main() -> None:
+def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
-    args = parse_args()
+    if not args:
+        # we allow for args to be passed externally, else we parse them ourselves
+        args = parse_eval_args()
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    if args.limit:
        eval_logger.warning(
@@ -132,8 +137,6 @@ def main() -> None:
                if os.path.isfile(task):
                    config = utils.load_yaml_config(task)
                    task_names.append(config)
-                else:
-                    task_missing.append(task)
        if task_missing != []:
            missing = ", ".join(task_missing)
@@ -213,4 +216,4 @@ def main() -> None:
 if __name__ == "__main__":
-    main()
+    cli_evaluate()
--- a/lm_eval/api/registry.py
+++ b/lm_eval/api/registry.py
@@ -117,24 +117,23 @@ def register_metric(**args):
    return decorate
-def get_metric(name):
+def get_metric(name, hf_evaluate_metric=False):
+    if not hf_evaluate_metric:
+        if name in METRIC_REGISTRY:
+            return METRIC_REGISTRY[name]
+        else:
+            eval_logger.warning(
+                f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library..."
+            )
    try:
-        return METRIC_REGISTRY[name]
+        metric_object = evaluate.load(name)
-    except KeyError:
+        return metric_object.compute
-        # TODO: change this print to logging?
+    except Exception:
-        print(
+        eval_logger.error(
-            f"Could not find registered metric '{name}' in lm-eval, \
+            f"{name} not found in the evaluate library! Please check https://huggingface.co/evaluate-metric",
-searching in HF Evaluate library..."
        )
-        try:
-            metric_object = evaluate.load(name)
-            return metric_object.compute
-        except Exception:
-            eval_logger.error(
-                "{} not found in the evaluate library!".format(name),
-                "Please check https://huggingface.co/evaluate-metric",
-            )
 def register_aggregation(name):

--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -554,8 +554,13 @@ class ConfigurableTask(Task):
                kwargs = {
                    key: metric_config[key]
                    for key in metric_config
-                    if key not in ["metric", "aggregation", "higher_is_better"]
+                    if key
+                    not in ["metric", "aggregation", "higher_is_better", "hf_evaluate"]
                }
+                hf_evaluate_metric = (
+                    "hf_evaluate" in metric_config
+                    and metric_config["hf_evaluate"] is True
+                )
                if self.config.process_results is not None:
                    self._metric_fn_list[metric_name] = None
@@ -566,7 +571,9 @@ class ConfigurableTask(Task):
                    self._metric_fn_list[metric_name] = metric_fn
                    self._metric_fn_kwargs[metric_name] = kwargs
                else:
-                    self._metric_fn_list[metric_name] = get_metric(metric_name)
+                    self._metric_fn_list[metric_name] = get_metric(
+                        metric_name, hf_evaluate_metric
+                    )
                    self._metric_fn_kwargs[metric_name] = kwargs
                if "aggregation" in metric_config:
@@ -1067,6 +1074,7 @@ class ConfigurableTask(Task):
        elif self.OUTPUT_TYPE == "greedy_until":
            gold = self.doc_to_target(doc)
+            result = results[0]
            if self.config.doc_to_choice is not None:
                # If you set doc_to_choice,
                # it assumes that doc_to_target returns a number.
@@ -1075,10 +1083,10 @@ class ConfigurableTask(Task):
            # we expect multiple_targets to be a list.
            elif self.multiple_target:
                gold = list(gold)
-            else:
+            elif type(gold) != type(result):
-                gold = str(gold)
+                # cast gold to the same type as result
+                gold = type(result)(gold)
-            result = results[0]
            for metric in self._metric_fn_list.keys():
                if self.multiple_target:
                    # in the case where we have multiple targets,

--- a/lm_eval/filters/__init__.py
+++ b/lm_eval/filters/__init__.py
 from lm_eval.api.filter import FilterEnsemble
 from . import selection
 from . import extraction
+from . import transformation
 FILTER_REGISTRY = {
@@ -9,6 +10,9 @@ FILTER_REGISTRY = {
    "majority_vote": selection.MajorityVoteFilter,
    "take_first_k": selection.TakeKFilter,
    "remove_whitespace": extraction.WhitespaceFilter,
+    "lowercase": transformation.LowercaseFilter,
+    "uppercase": transformation.UppercaseFilter,
+    "map": transformation.MapFilter,
    # TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
    # that takes an input and returns a scalar and then should select the max reward,
    # or should implement different filters for different ways of handling a reward model's inference.

--- a/lm_eval/filters/transformation.py
+++ b/lm_eval/filters/transformation.py
+from lm_eval.api.filter import Filter
+class LowercaseFilter(Filter):
+    def __init__(self) -> None:
+        pass
+    def apply(self, resps, docs):
+        def filter_set(inst):
+            return [resp.lower() for resp in inst]
+        return [filter_set(resp) for resp in resps]
+class UppercaseFilter(Filter):
+    def __init__(self) -> None:
+        pass
+    def apply(self, resps, docs):
+        def filter_set(inst):
+            return [resp.upper() for resp in inst]
+        return [filter_set(resp) for resp in resps]
+class MapFilter(Filter):
+    def __init__(self, mapping_dict: dict = {}, default_value=None) -> None:
+        """
+        Initializes the MapFilter with a given mapping dictionary and default value.
+        Args:
+        - mapping_dict (dict): A dictionary containing the key-value mappings.
+                               Default is an empty dictionary.
+        - default_value (Any): The value to be returned when a key is not found in the mapping_dict.
+                               Default is None.
+        Example:
+        mapper = MapFilter({'A': 1, 'B': 2}, default_value=0)
+        """
+        assert isinstance(
+            mapping_dict, dict
+        ), "Provided mapping_dict is not a dictionary"
+        self.mapping_dict = mapping_dict
+        self.default_value = default_value
+    def apply(self, resps, docs):
+        def filter_set(inst):
+            return [self.mapping_dict.get(resp, self.default_value) for resp in inst]
+        return [filter_set(resp) for resp in resps]
--- a/lm_eval/tasks/bigbench/README.md
+++ b/lm_eval/tasks/bigbench/README.md
+# BigBench
+### Paper
+Title: `Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models`
+Abstract: https://arxiv.org/abs/2206.04615
+The Beyond the Imitation Game Benchmark (BIG-bench) is a collaborative benchmark intended to probe large language models and extrapolate their future capabilities. 
+Homepage: https://github.com/google/BIG-bench
+### Citation
+```
+@misc{srivastava2022imitation,
+      title={Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models},
+      author={Aarohi Srivastava and Abhinav Rastogi and Abhishek Rao and Abu Awal Md Shoeb and Abubakar Abid and Adam Fisch and Adam R. Brown and Adam Santoro and Aditya Gupta and Adrià Garriga-Alonso and Agnieszka Kluska and Aitor Lewkowycz and Akshat Agarwal and Alethea Power and Alex Ray and Alex Warstadt and Alexander W. Kocurek and Ali Safaya and Ali Tazarv and Alice Xiang and Alicia Parrish and Allen Nie and Aman Hussain and Amanda Askell and Amanda Dsouza and Ambrose Slone and Ameet Rahane and Anantharaman S. Iyer and Anders Andreassen and Andrea Madotto and Andrea Santilli and Andreas Stuhlmüller and Andrew Dai and Andrew La and Andrew Lampinen and Andy Zou and Angela Jiang and Angelica Chen and Anh Vuong and Animesh Gupta and Anna Gottardi and Antonio Norelli and Anu Venkatesh and Arash Gholamidavoodi and Arfa Tabassum and Arul Menezes and Arun Kirubarajan and Asher Mullokandov and Ashish Sabharwal and Austin Herrick and Avia Efrat and Aykut Erdem and Ayla Karakaş and B. Ryan Roberts and Bao Sheng Loe and Barret Zoph and Bartłomiej Bojanowski and Batuhan Özyurt and Behnam Hedayatnia and Behnam Neyshabur and Benjamin Inden and Benno Stein and Berk Ekmekci and Bill Yuchen Lin and Blake Howald and Cameron Diao and Cameron Dour and Catherine Stinson and Cedrick Argueta and César Ferri Ramírez and Chandan Singh and Charles Rathkopf and Chenlin Meng and Chitta Baral and Chiyu Wu and Chris Callison-Burch and Chris Waites and Christian Voigt and Christopher D. Manning and Christopher Potts and Cindy Ramirez and Clara E. 
Rivera and Clemencia Siro and Colin Raffel and Courtney Ashcraft and Cristina Garbacea and Damien Sileo and Dan Garrette and Dan Hendrycks and Dan Kilman and Dan Roth and Daniel Freeman and Daniel Khashabi and Daniel Levy and Daniel Moseguí González and Danielle Perszyk and Danny Hernandez and Danqi Chen and Daphne Ippolito and Dar Gilboa and David Dohan and David Drakard and David Jurgens and Debajyoti Datta and Deep Ganguli and Denis Emelin and Denis Kleyko and Deniz Yuret and Derek Chen and Derek Tam and Dieuwke Hupkes and Diganta Misra and Dilyar Buzan and Dimitri Coelho Mollo and Diyi Yang and Dong-Ho Lee and Ekaterina Shutova and Ekin Dogus Cubuk and Elad Segal and Eleanor Hagerman and Elizabeth Barnes and Elizabeth Donoway and Ellie Pavlick and Emanuele Rodola and Emma Lam and Eric Chu and Eric Tang and Erkut Erdem and Ernie Chang and Ethan A. Chi and Ethan Dyer and Ethan Jerzak and Ethan Kim and Eunice Engefu Manyasi and Evgenii Zheltonozhskii and Fanyue Xia and Fatemeh Siar and Fernando Martínez-Plumed and Francesca Happé and Francois Chollet and Frieda Rong and Gaurav Mishra and Genta Indra Winata and Gerard de Melo and Germán Kruszewski and Giambattista Parascandolo and Giorgio Mariani and Gloria Wang and Gonzalo Jaimovitch-López and Gregor Betz and Guy Gur-Ari and Hana Galijasevic and Hannah Kim and Hannah Rashkin and Hannaneh Hajishirzi and Harsh Mehta and Hayden Bogar and Henry Shevlin and Hinrich Schütze and Hiromu Yakura and Hongming Zhang and Hugh Mee Wong and Ian Ng and Isaac Noble and Jaap Jumelet and Jack Geissinger and Jackson Kernion and Jacob Hilton and Jaehoon Lee and Jaime Fernández Fisac and James B. 
Simon and James Koppel and James Zheng and James Zou and Jan Kocoń and Jana Thompson and Jared Kaplan and Jarema Radom and Jascha Sohl-Dickstein and Jason Phang and Jason Wei and Jason Yosinski and Jekaterina Novikova and Jelle Bosscher and Jennifer Marsh and Jeremy Kim and Jeroen Taal and Jesse Engel and Jesujoba Alabi and Jiacheng Xu and Jiaming Song and Jillian Tang and Joan Waweru and John Burden and John Miller and John U. Balis and Jonathan Berant and Jörg Frohberg and Jos Rozen and Jose Hernandez-Orallo and Joseph Boudeman and Joseph Jones and Joshua B. Tenenbaum and Joshua S. Rule and Joyce Chua and Kamil Kanclerz and Karen Livescu and Karl Krauth and Karthik Gopalakrishnan and Katerina Ignatyeva and Katja Markert and Kaustubh D. Dhole and Kevin Gimpel and Kevin Omondi and Kory Mathewson and Kristen Chiafullo and Ksenia Shkaruta and Kumar Shridhar and Kyle McDonell and Kyle Richardson and Laria Reynolds and Leo Gao and Li Zhang and Liam Dugan and Lianhui Qin and Lidia Contreras-Ochando and Louis-Philippe Morency and Luca Moschella and Lucas Lam and Lucy Noble and Ludwig Schmidt and Luheng He and Luis Oliveros Colón and Luke Metz and Lütfi Kerem Şenel and Maarten Bosma and Maarten Sap and Maartje ter Hoeve and Maheen Farooqi and Manaal Faruqui and Mantas Mazeika and Marco Baturan and Marco Marelli and Marco Maru and Maria Jose Ramírez Quintana and Marie Tolkiehn and Mario Giulianelli and Martha Lewis and Martin Potthast and Matthew L. Leavitt and Matthias Hagen and Mátyás Schubert and Medina Orduna Baitemirova and Melody Arnaud and Melvin McElrath and Michael A. 
Yee and Michael Cohen and Michael Gu and Michael Ivanitskiy and Michael Starritt and Michael Strube and Michał Swędrowski and Michele Bevilacqua and Michihiro Yasunaga and Mihir Kale and Mike Cain and Mimee Xu and Mirac Suzgun and Mo Tiwari and Mohit Bansal and Moin Aminnaseri and Mor Geva and Mozhdeh Gheini and Mukund Varma T and Nanyun Peng and Nathan Chi and Nayeon Lee and Neta Gur-Ari Krakover and Nicholas Cameron and Nicholas Roberts and Nick Doiron and Nikita Nangia and Niklas Deckers and Niklas Muennighoff and Nitish Shirish Keskar and Niveditha S. Iyer and Noah Constant and Noah Fiedel and Nuan Wen and Oliver Zhang and Omar Agha and Omar Elbaghdadi and Omer Levy and Owain Evans and Pablo Antonio Moreno Casares and Parth Doshi and Pascale Fung and Paul Pu Liang and Paul Vicol and Pegah Alipoormolabashi and Peiyuan Liao and Percy Liang and Peter Chang and Peter Eckersley and Phu Mon Htut and Pinyu Hwang and Piotr Miłkowski and Piyush Patil and Pouya Pezeshkpour and Priti Oli and Qiaozhu Mei and Qing Lyu and Qinlang Chen and Rabin Banjade and Rachel Etta Rudolph and Raefer Gabriel and Rahel Habacker and Ramón Risco Delgado and Raphaël Millière and Rhythm Garg and Richard Barnes and Rif A. Saurous and Riku Arakawa and Robbe Raymaekers and Robert Frank and Rohan Sikand and Roman Novak and Roman Sitelew and Ronan LeBras and Rosanne Liu and Rowan Jacobs and Rui Zhang and Ruslan Salakhutdinov and Ryan Chi and Ryan Lee and Ryan Stovall and Ryan Teehan and Rylan Yang and Sahib Singh and Saif M. Mohammad and Sajant Anand and Sam Dillavou and Sam Shleifer and Sam Wiseman and Samuel Gruetter and Samuel R. Bowman and Samuel S. Schoenholz and Sanghyun Han and Sanjeev Kwatra and Sarah A. 
Rous and Sarik Ghazarian and Sayan Ghosh and Sean Casey and Sebastian Bischoff and Sebastian Gehrmann and Sebastian Schuster and Sepideh Sadeghi and Shadi Hamdan and Sharon Zhou and Shashank Srivastava and Sherry Shi and Shikhar Singh and Shima Asaadi and Shixiang Shane Gu and Shubh Pachchigar and Shubham Toshniwal and Shyam Upadhyay and Shyamolima and Debnath and Siamak Shakeri and Simon Thormeyer and Simone Melzi and Siva Reddy and Sneha Priscilla Makini and Soo-Hwan Lee and Spencer Torene and Sriharsha Hatwar and Stanislas Dehaene and Stefan Divic and Stefano Ermon and Stella Biderman and Stephanie Lin and Stephen Prasad and Steven T. Piantadosi and Stuart M. Shieber and Summer Misherghi and Svetlana Kiritchenko and Swaroop Mishra and Tal Linzen and Tal Schuster and Tao Li and Tao Yu and Tariq Ali and Tatsu Hashimoto and Te-Lin Wu and Théo Desbordes and Theodore Rothschild and Thomas Phan and Tianle Wang and Tiberius Nkinyili and Timo Schick and Timofei Kornev and Timothy Telleen-Lawton and Titus Tunduny and Tobias Gerstenberg and Trenton Chang and Trishala Neeraj and Tushar Khot and Tyler Shultz and Uri Shaham and Vedant Misra and Vera Demberg and Victoria Nyamai and Vikas Raunak and Vinay Ramasesh and Vinay Uday Prabhu and Vishakh Padmakumar and Vivek Srikumar and William Fedus and William Saunders and William Zhang and Wout Vossen and Xiang Ren and Xiaoyu Tong and Xinran Zhao and Xinyi Wu and Xudong Shen and Yadollah Yaghoobzadeh and Yair Lakretz and Yangqiu Song and Yasaman Bahri and Yejin Choi and Yichi Yang and Yiding Hao and Yifu Chen and Yonatan Belinkov and Yu Hou and Yufang Hou and Yuntao Bai and Zachary Seid and Zhuoye Zhao and Zijian Wang and Zijie J. Wang and Zirui Wang and Ziyi Wu},
+      year={2022},
+      eprint={2206.04615},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+```
+### Groups and Tasks
+#### Groups
+* `group_name`: `Short description`
+#### Tasks
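+* `bigbench_{subtask}_multiple_choice`: Multiple-choice variant of a BIG-bench subtask, using the `{subtask}_zero_shot` dataset configuration.
+* `bigbench_{subtask}_greedy_until`: Free-form generation variant of the same subtask, using the `{subtask}_zero_shot` dataset configuration.
+The full list of `{subtask}` names is the `all_subtasks` list in `generate_tasks.py`.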
+### Checklist
+For adding novel benchmarks/datasets to the library:
+* [ ] Is the task an existing benchmark in the literature?
+  * [ ] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
--- a/lm_eval/tasks/bigbench/generate_tasks.py
+++ b/lm_eval/tasks/bigbench/generate_tasks.py
+import os
+import yaml
+# Names of the BIG-bench subtasks. main() writes one YAML config per entry
+# into each output-type directory.
+all_subtasks = [
+    'abstract_narrative_understanding',
+    'anachronisms',
+    'analogical_similarity',
+    'analytic_entailment',
+    'arithmetic',
+    'ascii_word_recognition',
+    'authorship_verification',
+    'auto_categorization',
+    'auto_debugging',
+    'bbq_lite_json',
+    'bridging_anaphora_resolution_barqa',
+    'causal_judgment',
+    'cause_and_effect',
+    'checkmate_in_one',
+    'chess_state_tracking',
+    'chinese_remainder_theorem',
+    'cifar10_classification',
+    'code_line_description',
+    'codenames',
+    'color',
+    'common_morpheme',
+    'conceptual_combinations',
+    'conlang_translation',
+    'contextual_parametric_knowledge_conflicts',
+    'crash_blossom',
+    'crass_ai',
+    'cryobiology_spanish',
+    'cryptonite',
+    'cs_algorithms',
+    'dark_humor_detection',
+    'date_understanding',
+    'disambiguation_qa',
+    'discourse_marker_prediction',
+    'disfl_qa',
+    'dyck_languages',
+    'elementary_math_qa',
+    'emoji_movie',
+    'emojis_emotion_prediction',
+    'empirical_judgments',
+    'english_proverbs',
+    'english_russian_proverbs',
+    'entailed_polarity',
+    'entailed_polarity_hindi',
+    'epistemic_reasoning',
+    'evaluating_information_essentiality',
+    'fact_checker',
+    'fantasy_reasoning',
+    'few_shot_nlg',
+    'figure_of_speech_detection',
+    'formal_fallacies_syllogisms_negation',
+    'gem',
+    'gender_inclusive_sentences_german',
+    'general_knowledge',
+    'geometric_shapes',
+    'goal_step_wikihow',
+    'gre_reading_comprehension',
+    'hhh_alignment',
+    'hindi_question_answering',
+    'hindu_knowledge',
+    'hinglish_toxicity',
+    'human_organs_senses',
+    'hyperbaton',
+    'identify_math_theorems',
+    'identify_odd_metaphor',
+    'implicatures',
+    'implicit_relations',
+    'intent_recognition',
+    'international_phonetic_alphabet_nli',
+    'international_phonetic_alphabet_transliterate',
+    'intersect_geometry',
+    'irony_identification',
+    'kanji_ascii',
+    'kannada',
+    'key_value_maps',
+    'known_unknowns',
+    'language_games',
+    'language_identification',
+    'linguistic_mappings',
+    'linguistics_puzzles',
+    'list_functions',
+    'logic_grid_puzzle',
+    'logical_args',
+    'logical_deduction',
+    'logical_fallacy_detection',
+    'logical_sequence',
+    'mathematical_induction',
+    'matrixshapes',
+    'metaphor_boolean',
+    'metaphor_understanding',
+    'minute_mysteries_qa',
+    'misconceptions',
+    'misconceptions_russian',
+    'mnist_ascii',
+    'modified_arithmetic',
+    'moral_permissibility',
+    'movie_dialog_same_or_different',
+    'movie_recommendation',
+    'mult_data_wrangling',
+    'multiemo',
+    'natural_instructions',
+    'navigate',
+    'nonsense_words_grammar',
+    'novel_concepts',
+    'object_counting',
+    'odd_one_out',
+    'operators',
+    'paragraph_segmentation',
+    'parsinlu_qa',
+    'parsinlu_reading_comprehension',
+    'penguins_in_a_table',
+    'periodic_elements',
+    'persian_idioms',
+    'phrase_relatedness',
+    'physical_intuition',
+    'physics',
+    'physics_questions',
+    'play_dialog_same_or_different',
+    'polish_sequence_labeling',
+    'presuppositions_as_nli',
+    'qa_wikidata',
+    'question_selection',
+    'real_or_fake_text',
+    'reasoning_about_colored_objects',
+    'repeat_copy_logic',
+    'rephrase',
+    'riddle_sense',
+    'ruin_names',
+    'salient_translation_error_detection',
+    'scientific_press_release',
+    'semantic_parsing_in_context_sparc',
+    'semantic_parsing_spider',
+    'sentence_ambiguity',
+    'similarities_abstraction',
+    'simp_turing_concept',
+    'simple_arithmetic_json',
+    'simple_arithmetic_json_multiple_choice',
+    'simple_arithmetic_json_subtasks',
+    'simple_arithmetic_multiple_targets_json',
+    'simple_ethical_questions',
+    'simple_text_editing',
+    'snarks',
+    'social_iqa',
+    'social_support',
+    'sports_understanding',
+    'strange_stories',
+    'strategyqa',
+    'sufficient_information',
+    'suicide_risk',
+    'swahili_english_proverbs',
+    'swedish_to_german_proverbs',
+    'symbol_interpretation',
+    'temporal_sequences',
+    'tense',
+    'timedial',
+    'topical_chat',
+    'tracking_shuffled_objects',
+    'understanding_fables',
+    'undo_permutation',
+    'unit_conversion',
+    'unit_interpretation',
+    'unnatural_in_context_learning',
+    'vitaminc_fact_verification',
+    'what_is_the_tao',
+    'which_wiki_edit',
+    'winowhy',
+    'word_sorting',
+    'word_unscrambling'
+    ]
+def main() -> None:
+    for path, task_type in zip(["multiple_choice", "greedy_until"], ["multiple_choice_template_yaml", "greedy_until_template_yaml"]):
+        os.makedirs(path, exist_ok=True)
+        for task in all_subtasks:
+            file_name = f"{task}.yaml"
+            try:
+                with open(f"{path}/{file_name}", "w") as f:
+                    f.write("# Generated by utils.py\n")
+                    yaml.dump(
+                        {
+                            "include": f"../{task_type}",
+                            "task": "bigbench_" + task + "_{}".format(task_type.split("_template_yaml")[0]),
+                            "dataset_name": task + "_zero_shot", # zero-shot version of the dataset
+                        },
+                        f,
+                        width=float("inf"), allow_unicode=True
+                    )
+            except FileExistsError:
+                pass
+if __name__ == "__main__":
+    main()
--- a/lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml
+# Generated by utils.py
+dataset_name: abstract_narrative_understanding_zero_shot
+include: ../greedy_until_template_yaml
+task: bigbench_abstract_narrative_understanding_greedy_until
--- a/lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml
+# Generated by utils.py
+dataset_name: anachronisms_zero_shot
+include: ../greedy_until_template_yaml
+task: bigbench_anachronisms_greedy_until
--- a/lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/analogical_similarity.yaml
+# Generated by utils.py
+dataset_name: analogical_similarity_zero_shot
+include: ../greedy_until_template_yaml
+task: bigbench_analogical_similarity_greedy_until
--- a/lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/analytic_entailment.yaml
+# Generated by utils.py
+dataset_name: analytic_entailment_zero_shot
+include: ../greedy_until_template_yaml
+task: bigbench_analytic_entailment_greedy_until
--- a/lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml
+++ b/lm_eval/tasks/bigbench/greedy_until/arithmetic.yaml
+# Generated by utils.py
+dataset_name: arithmetic_zero_shot
+include: ../greedy_until_template_yaml
+task: bigbench_arithmetic_greedy_until