Merge remote-tracking branch 'origin/big-refactor' into calibration

e1ae8a2f · Herbie Bradley · 50e99bd7 · 30936bc7 · e1ae8a2f · e1ae8a2f
Commit e1ae8a2f authored Nov 26, 2023 by Herbie Bradley
20 changed files
--- a/.github/workflows/new_tasks.yml
+++ b/.github/workflows/new_tasks.yml
-# name: Tasks Modified
+name: Tasks Modified

-# on:
-#   push:
-#     branches:
-#       - 'big-refactor*'
-#   pull_request:
-#     branches:
-#       - 'big-refactor*'
-#   workflow_dispatch:
-# # comment/edit out the above to stop/change the triggers
-# jobs:
-#   changed_files:
-#     runs-on: ubuntu-latest  # windows-latest || macos-latest
-#     timeout-minutes: 120
-#     name: Scan for changed tasks
-#     steps:
-#       - name: checkout
-#         uses: actions/checkout@v3
-#         with:
-#           fetch-depth: 2  # OR "2" -> To retrieve the preceding commit.
+on:
+  push:
+    branches:
+      - 'big-refactor*'
+  pull_request:
+    branches:
+      - 'big-refactor*'
+  workflow_dispatch:
+# comment/edit out the above to stop/change the triggers
+jobs:
+  changed_files:
+    runs-on: ubuntu-latest  # windows-latest || macos-latest
+    timeout-minutes: 120
+    name: Scan for changed tasks
+    steps:
+      - name: checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2  # OR "2" -> To retrieve the preceding commit.

-#       # Uses the tj-actions/changed-files@v37 action to check for changes.
-#       # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
-#       # The `files_yaml` input optionally takes a yaml string to specify filters,
-#       # and prepends the filter name to the standard output names.
-#       - name: Check task folders
-#         id: changed-tasks
-#         uses: tj-actions/changed-files@v37.1.2
-#         with:
-#           # tasks checks the tasks folder and api checks the api folder for changes
-#           files_yaml: |
-#             tasks:
-#               - lm_eval/tasks/**
-#             api:
-#               - lm_eval/api/**
-#           write_output_files: true
+      # Uses the tj-actions/changed-files@v37 action to check for changes.
+      # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
+      # The `files_yaml` input optionally takes a yaml string to specify filters,
+      # and prepends the filter name to the standard output names.
+      - name: Check task folders
+        id: changed-tasks
+        uses: tj-actions/changed-files@v37.1.2
+        with:
+          # tasks checks the tasks folder and api checks the api folder for changes
+          files_yaml: |
+            tasks:
+              - lm_eval/tasks/**
+            api:
+              - lm_eval/api/**
+          write_output_files: true

-#     # The next step is optional; the files are written to the workspace by default (above).
-#     # so it's just for debugging
-#       - name: Run Tests
-#         if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
-#         run: |
-#           echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV'
-#           echo "One or more test file(s) has changed."
-#           echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
+    # The next step is optional; the files are written to the workspace by default (above).
+    # so it's just for debugging
+      - name: Run Tests
+        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
+        run: |
+          echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV'
+          echo "One or more test file(s) has changed."
+          echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"

-#       - name: Set up Python 3.9
-#         if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
-#         uses: actions/setup-python@v4
-#         with:
-#           python-version: 3.9
-#           cache: 'pip'
-#           cache-dependency-path: setup.py
-#       - name: Install dependencies
-#         if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
-#         run: |
-#             python -m pip install --upgrade pip
-#             pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
-#     #   Install optional git dependencies
-#     #       pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
-#     #       if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-#       - name: Test with pytest
-#         # if new tasks are added, run tests on them
-#         if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
-#         run: python -m pytest tests/test_tasks.py -s -vv
-#         # if api is modified, run tests on it
-#       - name: Test more tasks with pytest
-#         env:
-#           API: true
-#         if: steps.changed-tasks.outputs.api_any_modified == 'true'
-#         run: python -m pytest tests/test_tasks.py -s -vv
+      - name: Set up Python 3.9
+        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
+          cache: 'pip'
+          cache-dependency-path: setup.py
+      - name: Install dependencies
+        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
+        run: |
+            python -m pip install --upgrade pip
+            pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
+    #   Install optional git dependencies
+    #       pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
+    #       if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+      - name: Test with pytest
+        # if new tasks are added, run tests on them
+        if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
+        run: python -m pytest tests/test_tasks.py -s -vv
+        # if api is modified, run tests on it
+      - name: Test more tasks with pytest
+        env:
+          API: true
+        if: steps.changed-tasks.outputs.api_any_modified == 'true'
+        run: python -m pytest tests/test_tasks.py -s -vv
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -22,10 +22,10 @@ jobs:
    steps:
    - name: Checkout Code
      uses: actions/checkout@v3
-    - name: Set up Python 3.9
+    - name: Set up Python 3.8
      uses: actions/setup-python@v4
      with:
-        python-version: 3.9
+        python-version: 3.8
        cache: pip
        cache-dependency-path: setup.py
    - name: Install dependencies
@@ -44,34 +44,34 @@ jobs:
 #    - name: Lint with mypy
 #      run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
 # Job 2
-#   testcpu:
-#     name: CPU Tests
-#     runs-on: ubuntu-latest
-#     strategy:
-#       matrix:
-#         python-version: [ "3.8", "3.9", "3.10", "3.11" ]
-#     timeout-minutes: 30
-#     steps:
-#     - name: Checkout Code
-#       uses: actions/checkout@v3
-#     - name: Set up Python ${{ matrix.python-version }}
-#       uses: actions/setup-python@v4
-#       with:
-#         python-version: ${{ matrix.python-version }}
-#         cache: pip
-#         cache-dependency-path: setup.py
-#     - name: Install dependencies
-#       run: |
-#         python -m pip install --upgrade pip
-#         pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
-# #         Install optional git dependencies
-# #                pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
-# #        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-#     - name: Test with pytest
-#       run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/tests_master --ignore=tests/extra
-#     - name: Archive artifacts
-#       uses: actions/upload-artifact@v3
-#       with:
-#         name: output_results
-#         path: |
-#           test_logs/*
+  testcpu:
+    name: CPU Tests
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
+    timeout-minutes: 30
+    steps:
+    - name: Checkout Code
+      uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+        cache: pip
+        cache-dependency-path: setup.py
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
+#         Install optional git dependencies
+#                pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
+#        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Test with pytest
+      run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/tests_master --ignore=tests/extra
+    - name: Archive artifacts
+      uses: actions/upload-artifact@v3
+      with:
+        name: output_results
+        path: |
+          test_logs/*
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -33,14 +33,13 @@ repos:
    rev: 22.3.0
    hooks:
      - id: black
-        language_version: python3.9
  - repo: https://github.com/codespell-project/codespell
    rev: v2.1.0
    hooks:
      - id: codespell
        exclude: >
          (?x)^(
-              .*\.json|ignore.txt
+              .*\.json|ignore.txt|lm_eval/tasks/.*|.*yaml
          )$
        args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
  - repo: https://github.com/pre-commit/mirrors-mypy

--- a/CODEOWNERS
+++ b/CODEOWNERS
-* @haileyschoelkopf @lintangsutawika
+* @haileyschoelkopf @lintangsutawika @StellaAthena
--- a/README.md
+++ b/README.md
 # Language Model Evaluation Harness

-## Notice to Users
-(as of 6/15/23)
-We have a revamp of the Evaluation Harness library internals staged on the [big-refactor](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor) branch! It is far along in progress, but before we start to move the `master` branch of the repository over to this new design with a new version release, we'd like to ensure that it's been tested by outside users and there are no glaring bugs.
-
-We’d like your help to test it out! you can help by:
-1. Trying out your current workloads on the big-refactor branch, and seeing if anything breaks or is counterintuitive,
-2. Porting tasks supported in the previous version of the harness to the new YAML configuration format. Please check out our [task implementation guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md) for more information.
-
-If you choose to port a task not yet completed according to [our checklist](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/README.md), then you can contribute it by opening a PR containing [Refactor] in the name with:
- A command of the form `python main.py --model hf --model_args ..... --tasks <task name> ...` which will run the task in the `master` branch, and what the score is
- A command of the form `python main.py --model hf --model_args ..... --tasks <task name> ...` to run the task in your PR branch to `big-refactor`, and what the resulting score is, to show that we achieve equality between the two implementations.
-
-Lastly, we'll no longer be accepting new feature requests beyond those that are already open to the master branch as we carry out this switch to the new version over the next week, though we will be accepting bugfixes to `master` branch and PRs to `big-refactor`. Feel free to reach out in the #lm-thunderdome channel of the EAI discord for more information.
-
 ## Overview

 This project provides a unified framework to test generative language models on a large number of different evaluation tasks.

 Features:

- Many tasks implemented, 200+ tasks [implemented in the old framework](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md) which require porting to the new setup as described in [the new task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md).
+- Over 60 standard academic benchmarks for LLMs, with hundreds of subtasks and variants implemented.
 - Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
 - Support for commercial APIs including [OpenAI](https://openai.com), [goose.ai](https://goose.ai), and [TextSynth](https://textsynth.com/).
- Support for evaluation on adapters (e.g. LoRa) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
- Evaluating with publicly available prompts ensures reproducibility and comparability between papers.
+- Support for evaluation on adapters (e.g. LoRA) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
+- Support for local models and benchmarks.
+- Evaluation with publicly available prompts ensures reproducibility and comparability between papers.
+
+The Language Model Evaluation Harness is the backend for 🤗 Hugging Face's popular [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and is used internally by dozens of companies including NVIDIA, Cohere, Booz Allen Hamilton, and Mosaic ML.
+

 ## Install

-To install the `lm-eval` refactor branch from the github repository, run:
+To install the `lm-eval` package from the github repository, run:

 ```bash
 git clone https://github.com/EleutherAI/lm-evaluation-harness
@@ -54,7 +44,6 @@ To install the package with all extras, run
 pip install -e ".[all]"
 ```

-
 ## Support

 The best way to get support is to open an issue on this repo or join the EleutherAI discord server](discord.gg/eleutherai). The `#lm-thunderdome` channel is dedicated to developing this project and the `#release-discussion` channel is for receiving support for our releases.
@@ -67,7 +56,7 @@ To evaluate a model hosted on the [HuggingFace Hub](https://huggingface.co/model


 ```bash
-python main.py \
+python -m lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/gpt-j-6B \
    --tasks hellaswag \
@@ -78,7 +67,7 @@ python main.py \
 Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partially trained checkpoints, or to specify the datatype for running a model:

 ```bash
-python main.py \
+python -m lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \
    --tasks lambada_openai,hellaswag \
@@ -86,12 +75,12 @@ python main.py \
    --batch_size 8
 ```

-Models that are loaded via either `transformers.AutoModelForCausalLM` (autoregressive, decoder-only GPT style models) or `transformers.AutoModelForSeq2SeqLM` (such as encoder-decoder models like T5) in Huggingface are supported via  Support for this model type is currently pending.
+Models that are loaded via both `transformers.AutoModelForCausalLM` (autoregressive, decoder-only GPT style models) and `transformers.AutoModelForSeq2SeqLM` (such as encoder-decoder models like T5) in Huggingface are supporteded.

 Batch size selection can be automated by setting the  ```--batch_size``` flag to ```auto```. This will perform automatic detection of the largest batch size that will fit on your device. On tasks where there is a large difference between the longest and shortest example, it can be helpful to periodically recompute the largest batch size, to gain a further speedup. To do this, append ```:N``` to above flag to automatically recompute the largest batch size ```N``` times. For example, to recompute the batch size 4 times, the command would be:

 ```bash
-python main.py \
+python -m lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \
    --tasks lambada_openai,hellaswag \
@@ -99,7 +88,7 @@ python main.py \
    --batch_size auto:4
 ```

-Alternatively, you can use `lm-eval` instead of `python main.py` to call lm eval from anywhere.
+Alternatively, you can use `lm-eval` or `lm_eval` instead of `python -m lm_eval` to call lm eval from anywhere.

 ### Multi-GPU Evaluation with Hugging Face `accelerate`

@@ -108,7 +97,7 @@ To parallelize evaluation of HuggingFace models across multiple GPUs, we allow f
 The first is performed by launching evaluation via the `accelerate` library as follows:

 ```
-accelerate launch main.py \
+accelerate launch -m lm_eval \
    --model hf \
    --tasks lambada_openai,arc_easy \
    --batch_size 16 \
@@ -121,7 +110,7 @@ If your model is *is too large to be run on a single one of your GPUs* then you

 We also provide an second method to run these large models: use of the `parallelize` argument.
 ```
-python main.py \
+python -m lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-12b,parallelize=True
    --tasks lambada_openai,arc_easy \
@@ -136,7 +125,7 @@ To pass even more advanced keyword arguments to `accelerate`, we allow for the f

 Note that this method naively splits models across GPUs, resulting in only a single GPU performing work at any point in time, and so is much slower than launching with `accelerate launch`, possibly by a factor of the total # of GPUs.

-**Note that this option requires launching evaluation via `python main.py` rather than `accelerate launch main.py`.**
+**Note that this option requires launching evaluation via `python -m lm_eval` rather than `accelerate launch -m lm_eval`.**

 To use `accelerate` with the `lm-eval` command, use
 ```
@@ -151,14 +140,14 @@ A full accounting of the supported and planned libraries + APIs can be seen belo

 | API or Inference Server     | Implemented?                    | `--model <xxx>` name                                                             | Models supported:                    | Request Types:                                           |
 |-----------------------------|---------------------------------|----------------------------------------------------------------------------------|--------------------------------------|----------------------------------------------------------|
-| OpenAI Completions          | :heavy_check_mark:              | `openai`, `openai-completions`, `gooseai`                                        | up to `code-davinci-002`             | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` |
-| OpenAI ChatCompletions      | :x: Not yet - needs help!       | N/A                                                                              | (link here?)                         | `greedy_until` (no logprobs)                             |
-| Anthropic                   | :heavy_check_mark:              | `anthropic`                                                                      | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model)         | `greedy_until` (no logprobs)                             |
-| GooseAI                     | :heavy_check_mark: (not separately maintained)  | `openai`, `openai-completions`, `gooseai` (same interface as OpenAI Completions) |                                      | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` |
-| Textsynth                   | Needs testing                   | `textsynth`                                                                      | ???                                  | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` |
-| Cohere                      | :hourglass: - blocked on Cohere API bug | N/A                                                                              | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` |
-| GGML                        | :hourglass: [PR](https://github.com/EleutherAI/lm-evaluation-harness/pull/617)              | N/A                                                                              | ???                                  | `greedy_until`, `loglikelihood`, `loglikelihood_rolling` |
-| vLLM                        | :x: Not yet - needs help!       | N/A                                                                              | All HF models                        | `greedy_until` (no logprobs)                             |
+| OpenAI Completions          | :heavy_check_mark:              | `openai`, `openai-completions`, `gooseai`                                        | up to `code-davinci-002`             | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
+| OpenAI ChatCompletions      | :x: Not yet - needs testing!       | N/A                                                                              | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt)                         | `generate_until` (no logprobs)                             |
+| Anthropic                   | :heavy_check_mark:              | `anthropic`                                                                      | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model)         | `generate_until` (no logprobs)                             |
+| GooseAI                     | :heavy_check_mark: (not separately maintained)  | `openai`, `openai-completions`, `gooseai` (same interface as OpenAI Completions) |                                      | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
+| Textsynth                   | Needs testing                   | `textsynth`                                                                      | ???                                  | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
+| Cohere                      | :hourglass: - blocked on Cohere API bug | N/A                                                                              | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
+| GGML/[Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python))                        | :heavy_check_mark:              | `gguf`, `ggml`                                                | Llama-architecture models (Llama, Llama 2, Llemma, Mistral(?), Llama finetunes)                               | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
+| vLLM                        | :x: Not yet - needs help!       | N/A                                                                              | All HF models                        | `generate_until` (no logprobs)                             |
 | Your inference server here! | ...                             | ...                                                                              | ...                                  | ...                                                      |                                | ...                                                      |

 It is on our roadmap to create task variants designed to enable models which do not serve logprobs/loglikelihoods to be compared with generation performance of open-source models.
@@ -167,7 +156,7 @@ Our library supports language models served via the OpenAI Completions API as fo

 ```bash
 export OPENAI_API_SECRET_KEY=YOUR_KEY_HERE
-python main.py \
+python -m lm_eval \
    --model openai-completions \
    --model_args engine=davinci \
    --tasks lambada_openai,hellaswag
@@ -198,18 +187,18 @@ This will write out one text file for each task.
 To verify the data integrity of the tasks you're performing in addition to running the tasks themselves, you can use the `--check_integrity` flag:

 ```bash
-python main.py \
+python -m lm_eval \
    --model openai \
    --model_args engine=davinci \
    --tasks lambada_openai,hellaswag \
    --check_integrity
 ```

-## Advanced Usage
+## Advanced Usage Tips

 For models loaded with the HuggingFace  `transformers` library, any arguments provided via `--model_args` get passed to the relevant constructor directly. This means that anything you can do with `AutoModel` can be done with our library. For example, you can pass a local path via `pretrained=` or use models finetuned with [PEFT](https://github.com/huggingface/peft) by taking the call you would run to evaluate the base model and add `,peft=PATH` to the `model_args` argument:
 ```bash
-python main.py \
+python -m lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/gpt-j-6b,parallelize=True,load_in_4bit=True,peft=nomic-ai/gpt4all-j-lora \
    --tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \
@@ -219,7 +208,7 @@ python main.py \
 [GPTQ](https://github.com/PanQiWei/AutoGPTQ) quantized models can be loaded by specifying their file names in `,gptq=NAME` (or `,gptq=True` for default names) in the `model_args` argument:

 ```bash
-python main.py \
+python -m lm_eval \
    --model hf \
    --model_args pretrained=model-name-or-path,gptq=model.safetensors,gptq_use_triton=True \
    --tasks hellaswag
@@ -227,42 +216,37 @@ python main.py \

 We support wildcards in task names, for example you can run all of the machine-translated lambada tasks via `--task lambada_openai_mt_*`.

-## Implementing new tasks
+To save evaluation results provide an `--output_path`. We also support logging model responses with the `--log_samples` flag for post-hoc analysis.

-To implement a new task in the eval harness, see [this guide](./docs/new_task_guide.md).
+Additionally, one can provide a directory with `--use_cache` to cache the results of prior runs. This allows you to avoid repeated execution of the same (model, task) pairs for re-scoring.

-
-As a start, we currently only support one prompt per task, which we strive to make the "standard" as defined by the benchmark's authors. If you would like to study how varying prompts causes changes in the evaluation score, we support prompts authored in the [Promptsource Library](https://github.com/bigscience-workshop/promptsource/tree/main) as described further in https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/docs/new_task_guide.md and https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/docs/advanced_task_guide.md and welcome contributions of novel task templates and task variants.
+For a full list of supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/interface.md) guide in our documentation!

 ## How to Contribute or Learn More?

 For more information on the library and how everything fits together, check out all of our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor/docs)! We plan to post a larger roadmap of desired + planned library improvements soon, with more information on how contributors can help.

-
 You can also ask for help, or discuss new features with the maintainers in the #lm-thunderdome channel of the EleutherAI discord! If you've used the library and have had a positive (or negative) experience, we'd love to hear from you!

+### Implementing new tasks
+
+To implement a new task in the eval harness, see [this guide](./docs/new_task_guide.md).
+
+In general, we following the following priority list for addressing concerns about prompting and other eval details:
+1. If there is widespread agreement among people who train LLMs, use the agreed upon procedure.
+2. If there is a clear and unambiguous official implementation, use that procedure.
+3. If there is widespread agreement among people who evaluate LLMs, use the agreed upon procedure.
+4. If there are multiple common implementations but not universal or widespread agreement, use our preferred option among the common implementations. As before, prioritize choosing from among the implementations found in LLM training papers.
+
+These are guidelines and not rules, and can be overruled in special circumstances.
+
+We try to prioritize agreement with the procedures used by other groups to decrease the harm when people inevitably compare runs across different papers despite our discouragement of the practice. Historically, we also prioritized the implementation from "Language Models are Few Shot Learners" as our original goal was specifically to compare results with that paper.

 ## Cite as

 ```
-@software{eval-harness,
-  author       = {Gao, Leo and
-                  Tow, Jonathan and
-                  Biderman, Stella and
-                  Black, Sid and
-                  DiPofi, Anthony and
-                  Foster, Charles and
-                  Golding, Laurence and
-                  Hsu, Jeffrey and
-                  McDonell, Kyle and
-                  Muennighoff, Niklas and
-                  Phang, Jason and
-                  Reynolds, Laria and
-                  Tang, Eric and
-                  Thite, Anish and
-                  Wang, Ben and
-                  Wang, Kevin and
-                  Zou, Andy},
+@misc{eval-harness,
+  author       = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
  title        = {A framework for few-shot language model evaluation},
  month        = sep,
  year         = 2021,

--- a/docs/decontamination.md
+++ b/docs/decontamination.md
@@ -2,11 +2,11 @@

 ## Usage

-Simply add a "--decontamination_ngrams_path" when running main.py. The provided directory should contain
+Simply add a "--decontamination_ngrams_path" when running \__main\__.py. The provided directory should contain
 the ngram files and info.json produced in "Pile Ngram Generation" further down.

 ```bash
-python main.py \
+python -m lm_eval \
    --model gpt2 \
    --device 0 \
    --tasks sciq \

--- a/docs/interface.md
+++ b/docs/interface.md
@@ -4,7 +4,7 @@ This document details the interface exposed by `lm-eval` and provides details on

 ## Command-line Interface

-A majority of users run the library by cloning it from Github and running the `main.py` script.
+A majority of users run the library by cloning it from Github, installing the package as editable, and running the `python -m lm_eval` script.

 Equivalently, running the library can be done via the `lm-eval` entrypoint at the command line.

@@ -57,7 +57,9 @@ import lm_eval

 my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code)
 ...
-lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.greedy_until()`
+lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()`
+
+lm_eval.tasks.initialize_tasks() # register all tasks from the `lm_eval/tasks` subdirectory. Alternatively, can call `lm_eval.tasks.include_path("path/to/my/custom/task/configs")` to only register a set of tasks in a separate directory.

 results = lm_eval.simple_evaluate( # call simple_evaluate
    model=lm_obj,
@@ -83,9 +85,9 @@ from my_tasks import MyTask1 # suppose you've defined a custom lm_eval.api.Task

 my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code)
 ...
-lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.greedy_until()`
-
+lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()`

+lm_eval.tasks.initialize_tasks() # register all tasks from the `lm_eval/tasks` subdirectory. Alternatively, can call `lm_eval.tasks.include_path("path/to/my/custom/task/configs")` to only register a set of tasks in a separate directory.

 def evaluate(
    lm=lm_obj,

--- a/docs/model_guide.md
+++ b/docs/model_guide.md
 # New Model Guide

-The `lm-evaluation-harness` is intended to be a model-agnostic framework for evaluating . We provide first-class support for HuggingFace `AutoModelForCausalLM` and `AutoModelForSeq2SeqLM` type models, but
-
 This guide may be of special interest to users who are using the library outside of the repository, via installing the library via pypi and calling `lm_eval.evaluator.evaluate()` to evaluate an existing model.

 In order to properly evaluate a given LM, we require implementation of a wrapper class subclassing the `lm_eval.api.model.LM` class, that defines how the Evaluation Harness should interface with your model. This guide walks through how to write this `LM` subclass via adding it to the library!
@@ -44,35 +42,56 @@ class MyCustomLM(LM):
        #...


-    def greedy_until(self, requests: list[Instance]) -> list[str]:
+    def generate_until(self, requests: list[Instance]) -> list[str]:
        #...
    #...
 ```
-Where `Instance` is a dataclass defined in [`lm_eval.api.instance`](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/api/instance.py) with property `args` which returns a tuple of (context, continuation).
+Where `Instance` is a dataclass defined in [`lm_eval.api.instance`](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/api/instance.py) with property `args` of request-dependent type signature described below.
+
+We support three types of requests, consisting of different interactions / measurements with an autoregressive LM.
+
+All three request types take as input `requests` of type `list[Instance]` that have a matching `Instance.request_type` to the method name.

-We support
+- `generate_until`
+  - Each request contains `Instance.args : Tuple[str, dict]` containing 1. an input string to the LM and 2. a dictionary of keyword arguments used to control generation parameters.
+  - Using this input and these generation parameters, text will be sampled from the language model (typically until a maximum output length or specific stopping string sequences--for example, `{"until": ["\n\n", "."], "max_gen_toks": 128}`).
+  - The generated input+output text from the model will then be returned.

-The three types of
+- `loglikelihood`
+  - Each request contains `Instance.args : Tuple[str, str]` containing 1. an input string to the LM and 2. a target string on which the loglikelihood of the LM producing this target, conditioned on the input, will be returned.
+  - Each request will have, as result, `(ll, is_greedy): Tuple[float, int]` returned, where `ll` is a floating point number representing the log probability of generating the target string conditioned on the input, and `is_greedy` being either the value `0` or `1`, with it being `1` if and only if the target string *would be generated by greedy sampling from the LM* (that is, if the  target string is the *most likely* N-token string to be output by the LM given the input. )

+- `loglikelihood_rolling`
+  - Each request contains `Instance.args : Tuple[str]`, which is an input string to the model whose *entire* loglikelihood, conditioned on purely the EOT token, will be calculated.
+  - This is used to evaluate *perplexity* on a data distribution.
+  - It should return `(ll,) : Tuple[float]` , a.k.a. solely the *loglikelihood* of producing each piece of text given no starting input.


-smth smth tokenizer-agnostic
+To allow a model to be evaluated on all types of tasks, you will need to implement these three types of measurements (note that `loglikelihood_rolling` is a special case of `loglikelihood`). For a reference implementation, check out `lm_eval/models/huggingface.py` !

-3 reqtypes
- greedy_until, and the arguments passed to it
+**Tip: be careful of indexing in loglikelihood!**

- loglikelihood, and args passed to it

- loglikelihood_rolling, and args passed to it
+LMs take in tokens in position `[0 1 2 ... N]` and output a probability distribution for token position `N+1`. We provide a simplified graphic here, excerpted from `huggingface.py`:
+
+```
+# how this all works (illustrated on a causal decoder-only setup):
+#          CTX      CONT
+# inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by inp[:, :-1]
+# model  \               \
+# logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
+# cont_toks      4 5 6 7 8 9      [:, -len(continuation_enc):, :self.vocab_size] slice
+```

+The final token of the target is not passed into the LM, because we want the LM's predictions *up to but not past* that final target token. For more information, check out https://github.com/EleutherAI/lm-evaluation-harness/issues/942 .

 ## Registration

 Congrats on implementing your model! Now it's time to test it out.

-To make your model usable via the command line interface to `lm-eval` using `main.py`, you'll need to tell `lm-eval` what your model's name is.
+To make your model usable via the command line interface to `lm-eval` using `python -m lm_eval`, you'll need to tell `lm-eval` what your model's name is.

-This is done via a *decorator*, `lm_eval.api.registry.register_model`. Using `register_model()`, one can both tell the package what the model's name(s) to be used are when invoking it with `python main.py --model <name>` and alert `lm-eval` to the model's existence.
+This is done via a *decorator*, `lm_eval.api.registry.register_model`. Using `register_model()`, one can both tell the package what the model's name(s) to be used are when invoking it with `python -m lm_eval --model <name>` and alert `lm-eval` to the model's existence.

 ```python
 from lm_eval.api.registry import register_model
@@ -83,7 +102,9 @@ class MyCustomLM(LM):

 Using this decorator results in the class being added to an accounting of the usable LM types maintained internally to the library at `lm_eval.api.registry.MODEL_REGISTRY`. See `lm_eval.api.registry` for more detail on what sorts of registries and decorators exist in the library!

+## Testing

+We also recommend that new model contributions be accompanied by short tests of their 3 core functionalities, at minimum. To see an example of such tests, look at https://github.com/EleutherAI/lm-evaluation-harness/blob/35bdecd379c0cefad6897e67db892f4a6026a128/tests/test_ggml.py .

 ## Other


--- a/docs/new_task_guide.md
+++ b/docs/new_task_guide.md
@@ -17,7 +17,7 @@ git checkout -b <task-name>
 pip install -e ".[dev]"
 ```

-As a concrete example, we'll walk through reimplementing the `gsm8k` benchmark (a *generative* task which requires sampling text from a model) and the `sciq` benchmark. (a *discriminative*, or *multiple choice*, task where the model picks the most likely of several fixed answer choices).
+In this document, we'll walk through the basics of implementing a static benchmark evaluation in two formats: a *generative* task which requires sampling text from a model, such as [`gsm8k`](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/gsm8k/gsm8k.yaml), and a *discriminative*, or *multiple choice*, task where the model picks the most likely of several fixed answer choices, such as [`sciq`](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/sciq/sciq.yaml).

 ## Creating a YAML file

@@ -45,6 +45,16 @@ dataset_name: ... # the dataset configuration to use. Leave `null` if your datas
 dataset_kwargs: null # any extra keyword arguments that should be passed to the dataset constructor, e.g. `data_dir`.
 ```

+------------------------------
+**Tip:** To load a local dataset for evaluation, you can specify data files in the `dataset_kwargs` field, such as the following for JSON files:
+```
+dataset_path: json
+dataset_name: null
+dataset_kwargs:
+  data_files: /path/to/my/json
+```
+-------------------------------
+
 Next, we'd like to tell our task what the dataset's train, validation, and test splits are named, if they exist:

 ```yaml
@@ -116,7 +126,7 @@ doc_to_choice: ['No', 'Yes']

 We support the [Jinja 2](https://jinja.palletsprojects.com/en/3.1.x/) templating language for writing prompts. In practice, this means you can take your dataset's columns and do many basic string manipulations to place each document into prompted format.

-Take for example `super_glue/boolq`, as input, we'd like to use the features `passage` and `question` and string them together so that for a a sample line `doc`, the model sees something the format of:
+Take for example the dataset `super_glue/boolq`. As input, we'd like to use the features `passage` and `question` and string them together so that for a a sample line `doc`, the model sees something the format of:
 ```
 doc["passage"]
 Question: doc["question"]?
@@ -214,7 +224,7 @@ metric_list:
 ```
 `aggregation` and `higher_is_better` can optionally be left out to default to the manually-set defaults if using a natively supported metric, otherwise it must be defined explicitly (for example, when using a custom metric implemented as a function).

-For a full list of natively supported metrics and aggregation functions see `docs/advanced_task_guide.md`. All metrics supported in [HuggingFace Evaluate](https://github.com/huggingface/evaluate/tree/main/metrics) can also be used, and will be loaded if a given metric name is not one natively supported in `lm-eval`.
+For a full list of natively supported metrics and aggregation functions see `docs/advanced_task_guide.md`. All metrics supported in [HuggingFace Evaluate](https://github.com/huggingface/evaluate/tree/main/metrics) can also be used, and will be loaded if a given metric name is not one natively supported in `lm-eval` or `hf_evaluate` is set to `true`.

 ### Optional, More Advanced Setup

@@ -258,11 +268,29 @@ You can do this via adding the Python snippet
 from lm_eval.tasks import include_task_folder
 include_task_folder("/path/to/yaml/parent/folder")
 ```
-to the top of any Python file that is run or imported when performing evaluation, such as `main.py`.
+to the top of any Python file that is run or imported when performing evaluation, such as `\_\_main\_\_.py`.

 Passing `--tasks /path/to/yaml/file` is also accepted.


+## Beautifying Table Display
+
+To avoid conflict, each task needs to be registered with a unique name. Because of this, slight variations of task are still counted as unique tasks and need to be named uniquely. This could be done by appending an additional naming that may refer to the variation such as in MMLU where the template used to evaluated for flan are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation especially when you have a long list of tasks or are using a benchmark that comprises of many tasks. To make it more legible, you can use `task_alias` and `group_alias` to provide an alternative task name and group name that will be printed.
+``
+for example in `mmlu_abstract_algebra.yaml` we set `group_alias` to `stem` and `task_alias` to `abstract_algebra`.
+
+```
+"dataset_name": "abstract_algebra"
+"description": "The following are multiple choice questions (with answers) about abstract\
+  \ algebra.\n\n"
+"group": "mmlu_stem"
+"group_alias": "stem"
+"include": "_default_template_yaml"
+"task": "mmlu_abstract_algebra"
+"task_alias": "abstract_algebra"
+```
+Note: Even though `group` can be a list, for now, `group_alias` can only be a single string.
+
 ## Checking validity

 After registering your task, you can now check on your data downloading and verify that the few-shot samples look as intended. Run the following command with your desired args:
@@ -285,7 +313,7 @@ It's now time to check models' performance on your task! In the evaluation harne

 To enable this, we provide a checklist that should be completed when contributing a new task, to enable accurate book-keeping and to ensure that tasks added to the library are well-tested and, where applicable, precedented.

-### Task impl. checklist
+### Task Validity Checklist

 The checklist is the following:


--- a/docs/task_guide.md
+++ b/docs/task_guide.md
@@ -20,19 +20,19 @@ Task naming + registration:

 Dataset configuration options:
 - **dataset_path** (`str`) — The name of the dataset as listed by HF in the datasets Hub.
- **dataset_name**  (`str`, *optional*, defaults to None) — The name of, what HF calls, a “data instance” or sub-task of the benchmark. If your task does not contain any data instances, just leave this to default to None. (If you're familiar with the HF `datasets.load_dataset` function, these are just the first 2 arguments to it.)
+- **dataset_name**  (`str`, *optional*, defaults to None) — The name of what HF calls a “data instance” or sub-task of the benchmark. If your task does not contain any data instances, just leave this to default to None. (If you're familiar with the HF `datasets.load_dataset` function, these are just the first 2 arguments to it.)
 - **dataset_kwargs** (`dict`, *optional*) — Auxiliary arguments that `datasets.load_dataset` accepts. This can be used to specify arguments such as `data_files` or `data_dir` if you want to use local datafiles such as json or csv.
 - **training_split** (`str`, *optional*) — Split in the dataset to use as the training split.
 - **validation_split** (`str`, *optional*) — Split in the dataset to use as the validation split.
 - **test_split** (`str`, *optional*) — Split in the dataset to use as the test split.
- **fewshot_split** (`str`, *optional*) — Split in the dataset to draw few-shot exemplars from. assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
+- **fewshot_split** (`str`, *optional*) — Split in the dataset to draw few-shot exemplars from. assert that this not None if num_fewshot > 0.
 - **process_docs** (`Callable`, *optional*) — Optionally define a function to apply to each HF dataset split, to preprocess all documents before being fed into prompt template rendering or other evaluation steps. Can be used to rename dataset columns, or to process documents into a format closer to the expected format expected by a prompt template.

 Prompting / in-context formatting options:
 - **use_prompt** (`str`, *optional*) — Name of prompt in promptsource to use. if defined, will overwrite doc_to_text, doc_to_target, and doc_to_choice.
 - **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate input for the model
 - **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into
- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `greedy_until` tasks.
+- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
 - **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples.
 - **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.

@@ -42,7 +42,7 @@ Runtime configuration options:

 Scoring details:
 - **metric_list** (`str`, *optional*, defaults to None) — A list of metrics to use for evaluation. See docs for expected format.
- **output_type** (`str`, *optional*, defaults to "greedy_until") — Selects the type of model output for the given task. Options are `greedy_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
+- **output_type** (`str`, *optional*, defaults to "generate_until") — Selects the type of model output for the given task. Options are `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
 - **generation_kwargs** (`dict`, *optional*) — Auxiliary arguments for the `generate` function from HF transformers library. Advanced keyword arguments may not be supported for non-HF LM classes.
 - **repeats** (`int`, *optional*, defaults to 1) — Number of repeated runs through model for each sample. can be used for cases such as self-consistency.
 - **filter_list** (`Union[str, list]`, *optional*) — List of filters to postprocess model outputs. See below for further detail on the filter API.
@@ -142,7 +142,7 @@ Our final filter pipeline, "maj@8", does majority voting across the first 8 of t
 - performing the same sequence of filters on these new sets of 8 responses, for each document.
 ```yaml
 - name: "maj@8"
-    filter:
+  filter:
    - function: "take_first_k"
      k: 8
    - function: "regex"

--- a/examples/README.md
+++ b/examples/README.md
-This folder is meant to contain instructions and task setups required to evaluate certain papers which may perform non-standard evaluation setups.
-
-Tasks can be supported already in the library under `lm_eval/tasks`, or if highly paper-specific, may remain as YAMLs in the respective `examples/paper-title` folder.
-
-## Verified Papers:
-
-* [WIP] [Chain-of-Thought Prompting Elicits Reasoning in Large Language Models](https://arxiv.org/abs/2201.11903)
-  * Further details can be found in the `chain_of_thought` subfolder.
-
-## Candidates to Support:
-
-* Least-to-Most Prompting
-* Algorithmic Prompting
-* Other in-scope prompting techniques
-  * Multi-turn prompting strategies are likely out of scope for the repository.
-* Pythia Suite: Term Frequencies over training
-* All setups from GPT-3 Paper
-* Varying few-shot orderings + selection ; Varying the label choices for multiple-choice tasks
-
-* Your Paper Here!
--- a/examples/chain_of_thought/README.md
+++ b/examples/chain_of_thought/README.md
-# Chain-of-Thought Prompting Elicits Reasoning in Large Language Models
-https://arxiv.org/abs/2201.11903
-
-## All Tasks in Paper
-
-* ...
-* ...
-* ...
-
-## Reproduction Scripts
-
-* ...
--- a/ignore.txt
+++ b/ignore.txt
@@ -5,3 +5,4 @@ maka
 mor
 te
 ond
+extraversion
--- a/main.py
+++ b/main.py
 import os
 import re
+import sys
 import json
-import fnmatch
-import jsonlines
-import argparse
 import logging
+import argparse
+import numpy as np
+
 from pathlib import Path
+from typing import Union

 from lm_eval import evaluator, utils
+from lm_eval.tasks import initialize_tasks, include_path
 from lm_eval.api.registry import ALL_TASKS
-from lm_eval.logger import eval_logger, SPACING
-from lm_eval.tasks import include_task_folder

-os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+def _handle_non_serializable(o):
+    if isinstance(o, np.int64) or isinstance(o, np.int32):
+        return int(o)
+    elif isinstance(o, set):
+        return list(o)
+    else:
+        return str(o)


-def parse_args() -> argparse.Namespace:
+def parse_eval_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
-    parser.add_argument("--model", required=True, help="Name of model e.g. `hf`")
+    parser.add_argument("--model", default="hf", help="Name of model e.g. `hf`")
    parser.add_argument(
        "--tasks",
        default=None,
-        help="Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))),
+        help="To get full list of tasks, use the command lm-eval --tasks list",
    )
    parser.add_argument(
        "--model_args",
@@ -97,24 +105,43 @@ def parse_args() -> argparse.Namespace:
        default=None,
        help="Additional path to include if there are external tasks to include.",
    )
+    parser.add_argument(
+        "--verbosity",
+        type=str,
+        default="INFO",
+        help="Log error when tasks are not registered.",
+    )
    return parser.parse_args()


-def main() -> None:
-    args = parse_args()
+def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
+    if not args:
+        # we allow for args to be passed externally, else we parse them ourselves
+        args = parse_eval_args()
+
+    eval_logger = utils.eval_logger
+    eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
+    eval_logger.info(f"Verbosity set to {args.verbosity}")
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+    initialize_tasks(args.verbosity)

    if args.limit:
        eval_logger.warning(
            " --limit SHOULD ONLY BE USED FOR TESTING."
            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )
-
    if args.include_path is not None:
        eval_logger.info(f"Including path: {args.include_path}")
-        include_task_folder(args.include_path)
+        include_path(args.include_path)

    if args.tasks is None:
        task_names = ALL_TASKS
+    elif args.tasks == "list":
+        eval_logger.info(
+            "Available Tasks:\n - {}".format(f"\n - ".join(sorted(ALL_TASKS)))
+        )
+        sys.exit()
    else:
        if os.path.isdir(args.tasks):
            import glob
@@ -127,21 +154,25 @@ def main() -> None:
        else:
            tasks_list = args.tasks.split(",")
            task_names = utils.pattern_match(tasks_list, ALL_TASKS)
-            task_missing = []
            for task in [task for task in tasks_list if task not in task_names]:
                if os.path.isfile(task):
                    config = utils.load_yaml_config(task)
                    task_names.append(config)
-                else:
-                    task_missing.append(task)
-
-        if task_missing != []:
-            missing = ", ".join(task_missing)
-            eval_logger.error(
-                f"Tasks were not found: {missing}\n"
-                f"{SPACING}Try `lm-eval -h` for list of available tasks",
-            )
-            raise ValueError(f"Tasks {missing} were not found.")
+            task_missing = [
+                task
+                for task in tasks_list
+                if task not in task_names and "*" not in task
+            ]  # we don't want errors if a wildcard ("*") task name was used
+
+            if task_missing:
+                missing = ", ".join(task_missing)
+                eval_logger.error(
+                    f"Tasks were not found: {missing}\n"
+                    f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
+                )
+                raise ValueError(
+                    f"Tasks {missing} were not found. Try `lm-eval --tasks list` for list of available tasks."
+                )

    if args.output_path:
        path = Path(args.output_path)
@@ -184,7 +215,7 @@ def main() -> None:
    if results is not None:
        if args.log_samples:
            samples = results.pop("samples")
-        dumped = json.dumps(results, indent=2, default=lambda o: str(o))
+        dumped = json.dumps(results, indent=2, default=_handle_non_serializable)
        if args.show_config:
            print(dumped)

@@ -199,9 +230,10 @@ def main() -> None:
                        re.sub("/|=", "__", args.model_args), task_name
                    )
                    filename = path.joinpath(f"{output_name}.jsonl")
-
-                    with jsonlines.open(filename, "w") as f:
-                        f.write_all(samples[task_name])
+                    samples_dumped = json.dumps(
+                        samples[task_name], indent=2, default=_handle_non_serializable
+                    )
+                    filename.open("w").write(samples_dumped)

        print(
            f"{args.model} ({args.model_args}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
@@ -213,4 +245,4 @@ def main() -> None:


 if __name__ == "__main__":
-    main()
+    cli_evaluate()
--- a/lm_eval/api/instance.py
+++ b/lm_eval/api/instance.py
@@ -4,7 +4,7 @@ from typing import Literal, Tuple

 @dataclass
 class Instance:
-    request_type: Literal["loglikelihood", "loglikelihood_rolling", "greedy_until"]
+    request_type: Literal["loglikelihood", "loglikelihood_rolling", "generate_until"]
    doc: dict
    arguments: tuple
    idx: int

--- a/lm_eval/api/metrics.py
+++ b/lm_eval/api/metrics.py
+import logging
 import math
 import random
 from collections.abc import Iterable

+import evaluate
 import numpy as np
 import sacrebleu
 import sklearn.metrics

 from lm_eval.api.registry import register_aggregation, register_metric

+eval_logger = logging.getLogger("lm-eval")
+

 # Register Aggregations First
 @register_aggregation("mean")
@@ -166,6 +170,19 @@ def acc_mutual_info_fn(items):  # This is a passthrough function
    return items


+exact_match = evaluate.load("exact_match")
+
+
+@register_metric(
+    metric="exact_match",
+    higher_is_better=True,
+    output_type="generate_until",
+    aggregation="mean",
+)
+def exact_match_fn(**kwargs):
+    return exact_match.compute(**kwargs)
+
+
 @register_metric(
    metric="ece",
    higher_is_better=False,
@@ -262,7 +279,7 @@ def f1_fn(items):  # This is a passthrough function
 @register_metric(
    metric="bleu",
    higher_is_better=True,
-    output_type="greedy_until",
+    output_type="generate_until",
    aggregation="bleu",
 )
 def bleu_fn(items):  # This is a passthrough function
@@ -272,7 +289,7 @@ def bleu_fn(items):  # This is a passthrough function
 @register_metric(
    metric="chrf",
    higher_is_better=True,
-    output_type="greedy_until",
+    output_type="generate_until",
    aggregation="chrf",
 )
 def chrf_fn(items):  # This is a passthrough function
@@ -282,7 +299,7 @@ def chrf_fn(items):  # This is a passthrough function
 @register_metric(
    metric="ter",
    higher_is_better=True,
-    output_type="greedy_until",
+    output_type="generate_until",
    aggregation="ter",
 )
 def ter_fn(items):  # This is a passthrough function

--- a/lm_eval/api/model.py
+++ b/lm_eval/api/model.py
@@ -10,7 +10,10 @@ import hashlib
 from tqdm import tqdm

 from lm_eval import utils
-from lm_eval.logger import eval_logger
+
+import logging
+
+eval_logger = logging.getLogger("lm-eval")

 T = TypeVar("T", bound="LM")

@@ -96,7 +99,7 @@ class LM(abc.ABC):

    # TODO: Add an optional max length
    @abc.abstractmethod
-    def greedy_until(self, requests) -> List[str]:
+    def generate_until(self, requests) -> List[str]:
        """Generate greedily until a stopping sequence

        :param requests: list[Instance]
@@ -211,12 +214,12 @@ class CachingLM:
            )
            for req in tqdm(requests):
                hsh = hash_args(attr, req.args)
-                if attr == "greedy_until" and req.args[1].get("do_sample", False):
+                if attr == "generate_until" and req.args[1].get("do_sample", False):
                    # when we are doing non-greedy generation, don't use the cache
                    # (else every "randomly sampled" generation would be identical for repeats > 1).
                    if not warned:
                        eval_logger.warning(
-                            f"Arguments to lm.greedy_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests."
+                            f"Arguments to lm.generate_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests."
                        )
                        warned = True
                    res.append(None)

--- a/lm_eval/api/registry.py
+++ b/lm_eval/api/registry.py
 import os
 import evaluate
 from lm_eval.api.model import LM
-from lm_eval.logger import eval_logger
+
+import logging
+
+eval_logger = logging.getLogger("lm-eval")

 MODEL_REGISTRY = {}

@@ -68,10 +71,10 @@ def register_group(name):
    return decorate


-AGGREGATION_REGISTRY = {}
-DEFAULT_AGGREGATION_REGISTRY = {}
-METRIC_REGISTRY = {}
 OUTPUT_TYPE_REGISTRY = {}
+METRIC_REGISTRY = {}
+METRIC_AGGREGATION_REGISTRY = {}
+AGGREGATION_REGISTRY = {}
 HIGHER_IS_BETTER_REGISTRY = {}

 DEFAULT_METRIC_REGISTRY = {
@@ -81,7 +84,7 @@ DEFAULT_METRIC_REGISTRY = {
    ],
    "loglikelihood_rolling": ["word_perplexity", "byte_perplexity", "bits_per_byte"],
    "multiple_choice": ["acc", "acc_norm"],
-    "greedy_until": ["exact_match"],
+    "generate_until": ["exact_match"],
 }


@@ -95,8 +98,7 @@ def register_metric(**args):
        for key, registry in [
            ("metric", METRIC_REGISTRY),
            ("higher_is_better", HIGHER_IS_BETTER_REGISTRY),
-            # ("output_type", OUTPUT_TYPE_REGISTRY),
-            ("aggregation", DEFAULT_AGGREGATION_REGISTRY),
+            ("aggregation", METRIC_AGGREGATION_REGISTRY),
        ]:

            if key in args:
@@ -117,24 +119,23 @@ def register_metric(**args):
    return decorate


-def get_metric(name):
+def get_metric(name, hf_evaluate_metric=False):
+
+    if not hf_evaluate_metric:
+        if name in METRIC_REGISTRY:
+            return METRIC_REGISTRY[name]
+        else:
+            eval_logger.warning(
+                f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library..."
+            )

    try:
-        return METRIC_REGISTRY[name]
-    except KeyError:
-        # TODO: change this print to logging?
-        print(
-            f"Could not find registered metric '{name}' in lm-eval, \
-searching in HF Evaluate library..."
+        metric_object = evaluate.load(name)
+        return metric_object.compute
+    except Exception:
+        eval_logger.error(
+            f"{name} not found in the evaluate library! Please check https://huggingface.co/evaluate-metric",
        )
-        try:
-            metric_object = evaluate.load(name)
-            return metric_object.compute
-        except Exception:
-            eval_logger.error(
-                "{} not found in the evaluate library!".format(name),
-                "Please check https://huggingface.co/evaluate-metric",
-            )


 def register_aggregation(name):
@@ -159,12 +160,13 @@ def get_aggregation(name):
        )


-def get_default_aggregation(metric_name):
+def get_metric_aggregation(name):
+
    try:
-        return DEFAULT_AGGREGATION_REGISTRY[metric_name]
+        return METRIC_AGGREGATION_REGISTRY[name]
    except KeyError:
        eval_logger.warning(
-            f"No default aggregation metric for metric '{metric_name}'!"
+            "{} metric is not assigned a default aggregation!".format(name),
        )


@@ -172,7 +174,6 @@ def is_higher_better(metric_name):
    try:
        return HIGHER_IS_BETTER_REGISTRY[metric_name]
    except KeyError:
-        raise Warning(f"higher_is_better not specified for metric '{metric_name}'!")
        eval_logger.warning(
            f"higher_is_better not specified for metric '{metric_name}'!"
        )
--- a/lm_eval/api/samplers.py
+++ b/lm_eval/api/samplers.py
-class Sampler:
+class ContextSampler:
    def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
        self.rnd = rnd
        assert self.rnd, "must pass rnd to FewShotSampler!"
@@ -46,14 +46,14 @@ class Sampler:
                    )
                    + self.target_delimiter
                    + (
-                        self.doc_to_target(doc)[0]
+                        str(self.doc_to_target(doc)[0])
                        if type(self.doc_to_target(doc)) is list
                        else self.doc_to_target(doc)
                        if (
                            self.config.doc_to_choice is None
                            or type(self.doc_to_target(doc)) is str
                        )
-                        else self.doc_to_choice(doc)[self.doc_to_target(doc)]
+                        else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])
                    )
                    for doc in selected_docs
                ]
@@ -71,7 +71,19 @@ class Sampler:
        return self.rnd.sample(self.docs, n)


-class BalancedSampler(Sampler):
+class FirstNSampler(ContextSampler):
+    def sample(self, n) -> None:
+        """
+        Draw the first `n` samples in order from the specified split.
+        Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU.
+        """
+        assert n <= len(
+            self.docs
+        ), f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available."
+        return self.docs[:n]
+
+
+class BalancedSampler(ContextSampler):
    def sample(self, n) -> None:
        """
        TODO: this should return approximately class-balanced samples from our fewshot examples.
@@ -81,12 +93,27 @@ class BalancedSampler(Sampler):
        pass


-class ManualSampler(Sampler):
+class ManualSampler(ContextSampler):
    def sample(self, n) -> None:
        """ """
        pass


+SAMPLER_REGISTRY = {
+    "default": ContextSampler,
+    "first_n": FirstNSampler,
+}
+
+
+def get_sampler(name):
+    try:
+        return SAMPLER_REGISTRY[name]
+    except KeyError:
+        raise ValueError(
+            f"Attempted to use contextsampler '{name}', but no sampling strategy for this name found! Supported model names: {', '.join(SAMPLER_REGISTRY.keys())}"
+        )
+
+
 # TODO: how should we do design here? might be better to have a single sampler and pass more kwargs at init.
 # Depends what's easier for new user to add own functionality on top of


--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
 import abc
 import ast
-import functools
 import itertools
+import logging
 import random
 import re
 from collections.abc import Callable
@@ -18,7 +18,6 @@ from tqdm import tqdm
 from lm_eval import utils
 from lm_eval.api import samplers
 from lm_eval.api.filter import FilterEnsemble
-from lm_eval.api.instance import Instance
 from lm_eval.api.metrics import (
    bits_per_byte,
    mean,
@@ -26,31 +25,34 @@ from lm_eval.api.metrics import (
    weighted_perplexity,
 )
 from lm_eval.api.registry import (
-    AGGREGATION_REGISTRY,
    DEFAULT_METRIC_REGISTRY,
    OUTPUT_TYPE_REGISTRY,
    get_aggregation,
-    get_default_aggregation,
    get_metric,
+    get_metric_aggregation,
    is_higher_better,
 )
 from lm_eval.filters import build_filter_ensemble
-from lm_eval.logger import eval_logger
 from lm_eval.prompts import get_prompt

 ALL_OUTPUT_TYPES = [
    "loglikelihood",
    "multiple_choice",
    "loglikelihood_rolling",
-    "greedy_until",
+    "generate_until",
 ]


+eval_logger = logging.getLogger("lm-eval")
+
+
 @dataclass
 class TaskConfig(dict):
    # task naming/registry
    task: str = None
+    task_alias: str = None
    group: Union[str, list] = None
+    group_alias: Union[str, list] = None
    # HF dataset options.
    # which dataset to use,
    # and what splits for what purpose
@@ -67,17 +69,17 @@ class TaskConfig(dict):
    doc_to_text: Union[Callable, str] = None
    doc_to_target: Union[Callable, str] = None
    doc_to_choice: Union[Callable, str, dict, list] = None
-    gold_alias: Union[Callable, str] = None
    process_results: Union[Callable, str] = None
    use_prompt: str = None
    description: str = ""
    target_delimiter: str = " "
    fewshot_delimiter: str = "\n\n"
+    fewshot_config: dict = None
    # runtime configuration options
    num_fewshot: int = 0
    # scoring options
    metric_list: list = None
-    output_type: str = "greedy_until"
+    output_type: str = "generate_until"
    generation_kwargs: dict = None
    repeats: int = 1
    filter_list: Union[str, list] = None
@@ -87,18 +89,18 @@ class TaskConfig(dict):
    metadata: str = None  # by default, not used in the code. allows for users to pass arbitrary info to tasks

    def __post_init__(self) -> None:
-        if "." in self.dataset_path:
+        if self.dataset_path and ("." in self.dataset_path):
            import inspect
            from importlib import import_module

            self.dataset_path = inspect.getfile(import_module(self.dataset_path))

        if self.generation_kwargs is not None:
-            if self.output_type != "greedy_until":
+            if self.output_type != "generate_until":
                eval_logger.warning(
-                    "passed `generation_kwargs`, but not using `output_type: greedy_until`!"
+                    f"[{self.task}] passed `generation_kwargs`, but not using `output_type: generate_until`!"
                )
-                assert self.output_type != "greedy_until"
+                assert self.output_type != "generate_until"

            if "temperature" in self.generation_kwargs:
                self.generation_kwargs["temperature"] = float(
@@ -108,14 +110,13 @@ class TaskConfig(dict):
            if "until" not in self.generation_kwargs:
                self.generation_kwargs["until"] = [self.fewshot_delimiter]
        else:
-            if self.output_type == "greedy_until":
+            if self.output_type == "generate_until":
                # ensure that we greedily generate in absence of explicit arguments otherwise
                self.generation_kwargs = {
                    "until": None
                    if self.fewshot_delimiter is None
                    else [self.fewshot_delimiter],
                    "do_sample": False,
-                    "temperature": 0.0,
                }

        # TODO: how to make TaskConfigs be de- and re-serializable, even when using the !function constructor?
@@ -201,19 +202,9 @@ class Task(abc.ABC):
        self._fewshot_docs = None
        self._instances = None

-        self._config = TaskConfig(**config) if config else TaskConfig()
-
-        if not hasattr(self, "_filters"):
-            self._filters = []
-            for name, components in self._config.get(
-                "filters", [["none", [["take_first", None]]]]
-            ):
-                filter_pipeline = build_filter_ensemble(name, components)
-                self._filters.append(filter_pipeline)
+        self._config = TaskConfig({**config}) if config else TaskConfig()

-        self.sampler = samplers.Sampler(
-            list(self.fewshot_docs()), self, rnd=random.Random(1234)
-        )
+        self._filters = [build_filter_ensemble("none", [["take_first", None]])]

    def download(self, data_dir=None, cache_dir=None, download_mode=None) -> None:
        """Downloads and returns the task dataset.
@@ -354,9 +345,7 @@ class Task(abc.ABC):
                False
            ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"

-        eval_logger.info(
-            f"Building contexts for task '{self.config.task}' on rank {rank}..."
-        )
+        eval_logger.info(f"Building contexts for task on rank {rank}...")

        instances = []
        for doc_id, doc in utils.create_iterator(
@@ -446,7 +435,13 @@ class Task(abc.ABC):
        return len(re.split(r"\s+", doc))

    @utils.positional_deprecated
-    def fewshot_context(self, doc, num_fewshot):
+    def fewshot_context(
+        self,
+        doc,
+        num_fewshot,
+        rnd=random.Random(1234),
+        description=None,
+    ):
        """Returns a fewshot context string that is made up of a prepended description
        (if provided), the `num_fewshot` number of examples, and an appended prompt example.

@@ -454,34 +449,56 @@ class Task(abc.ABC):
            The document as returned from training_docs, validation_docs, or test_docs.
        :param num_fewshot: int
            The number of fewshot examples to provide in the returned context string.
+        :param rnd: random.Random
+            The pseudo-random number generator used to randomly sample examples.
+            WARNING: This is currently a required arg although it's optionalized with a default `None`.
+        :param description: str
+            The task's description that will be prepended to the fewshot examples.
        :returns: str
            The fewshot context.
        """
+        assert (
+            rnd is not None
+        ), "A `random.Random` generator argument must be provided to `rnd`"
+
+        description = description if description else ""

        if num_fewshot == 0:
-            # always prepend the (possibly empty) task description
-            labeled_examples = self.config.description
+            labeled_examples = ""
        else:
-            labeled_examples = self.config.description + self.sampler.get_context(
-                doc, num_fewshot
+            # for sets with no training docs, draw from other set *but ensure no overlap with current doc*
+            if self.has_training_docs():
+                fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
+            else:
+                if self._fewshot_docs is None:
+                    self._fewshot_docs = list(
+                        self.validation_docs()
+                        if self.has_validation_docs()
+                        else self.test_docs()
+                    )
+
+                fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
+
+                # get rid of the doc that's the one we're evaluating, if it's in the fewshot
+                fewshotex = [x for x in fewshotex if x != doc][:num_fewshot]
+
+            labeled_examples = (
+                "\n\n".join(
+                    [
+                        self.doc_to_text(doc) + self.doc_to_target(doc)
+                        for doc in fewshotex
+                    ]
+                )
+                + "\n\n"
            )

        example = self.doc_to_text(doc)
-        if type(example) == str:
-            return labeled_examples + example
-        elif type(example) == list:
-            return [labeled_examples + ex for ex in example]
-        elif type(example) == int:
-            if self.config.doc_to_choice is not None:
-                choices = self.doc_to_choice(doc)
-                return labeled_examples + choices[example]
-            else:
-                return labeled_examples + str(example)
+        return description + labeled_examples + example

    def apply_filters(self):
        if hasattr(self, "_filters"):
            for f in self._filters:
-                f.apply(self._instances)
+                f.apply(self._instances, None)
        else:
            eval_logger.warning("No filter defined, passing through instances")
            return self._instances
@@ -536,12 +553,14 @@ class ConfigurableTask(Task):
        self._aggregation_list = {}
        self._higher_is_better = {}

-        _metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type]
        if self.config.metric_list is None:
            # TODO: handle this in TaskConfig.__post_init__ ?
+            _metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type]
+
            for metric_name in _metric_list:
                self._metric_fn_list[metric_name] = get_metric(metric_name)
-                self._aggregation_list[metric_name] = get_default_aggregation(
+                self._metric_fn_kwargs[metric_name] = {}
+                self._aggregation_list[metric_name] = get_metric_aggregation(
                    metric_name
                )
                self._higher_is_better[metric_name] = is_higher_better(metric_name)
@@ -552,8 +571,13 @@ class ConfigurableTask(Task):
                kwargs = {
                    key: metric_config[key]
                    for key in metric_config
-                    if key not in ["metric", "aggregation", "higher_is_better"]
+                    if key
+                    not in ["metric", "aggregation", "higher_is_better", "hf_evaluate"]
                }
+                hf_evaluate_metric = (
+                    "hf_evaluate" in metric_config
+                    and metric_config["hf_evaluate"] is True
+                )

                if self.config.process_results is not None:
                    self._metric_fn_list[metric_name] = None
@@ -564,7 +588,9 @@ class ConfigurableTask(Task):
                    self._metric_fn_list[metric_name] = metric_fn
                    self._metric_fn_kwargs[metric_name] = kwargs
                else:
-                    self._metric_fn_list[metric_name] = get_metric(metric_name)
+                    self._metric_fn_list[metric_name] = get_metric(
+                        metric_name, hf_evaluate_metric
+                    )
                    self._metric_fn_kwargs[metric_name] = kwargs

                if "aggregation" in metric_config:
@@ -577,9 +603,9 @@ class ConfigurableTask(Task):
                        ]
                else:
                    INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()}
-                    metric_agg = get_default_aggregation(metric_name)
+                    metric_agg = get_metric_aggregation(metric_name)
                    eval_logger.warning(
-                        f"metric {metric_name} is defined, but aggregation is not. "
+                        f"[Task: {self._config.task}] metric {metric_name} is defined, but aggregation is not. "
                        f"using default "
                        f"aggregation={INV_AGG_REGISTRY[metric_agg]}"
                    )
@@ -591,7 +617,7 @@ class ConfigurableTask(Task):
                    ]
                else:
                    eval_logger.warning(
-                        f"metric {metric_name} is defined, but higher_is_better is not. "
+                        f"[Task: {self._config.task}] metric {metric_name} is defined, but higher_is_better is not. "
                        f"using default "
                        f"higher_is_better={is_higher_better(metric_name)}"
                    )
@@ -627,9 +653,11 @@ class ConfigurableTask(Task):
            self.prompt = None

        if self.fewshot_docs() is not None:
-            self.sampler = samplers.Sampler(
-                list(self.fewshot_docs()), self, rnd=random.Random(1234)
-            )
+            self.sampler = samplers.get_sampler(
+                self.config.fewshot_config.get("sampler", "default")
+                if self.config.fewshot_config
+                else "default"
+            )(list(self.fewshot_docs()), self, rnd=random.Random(1234))

        if self.has_test_docs():
            self.task_docs = self.test_docs()
@@ -678,7 +706,10 @@ class ConfigurableTask(Task):
            for choice in check_choices:
                choice_has_whitespace = True if choice[0].isspace() else False
                delimiter_has_whitespace = (
-                    True if self.config.target_delimiter[-1].isspace() else False
+                    True
+                    if self.config.target_delimiter.rstrip()
+                    != self.config.target_delimiter
+                    else False
                )

                if delimiter_has_whitespace and choice_has_whitespace:
@@ -687,7 +718,7 @@ class ConfigurableTask(Task):
                    )
                elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
                    eval_logger.warning(
-                        f'Both target_delimiter and target choice: "{choice}" does not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
+                        f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" do not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
                    )

    def download(self, dataset_kwargs=None) -> None:
@@ -751,6 +782,40 @@ class ConfigurableTask(Task):
            else:
                return None

+    def apply_filters(self):
+    @utils.positional_deprecated
+    def fewshot_context(self, doc, num_fewshot):
+        """Returns a fewshot context string that is made up of a prepended description
+        (if provided), the `num_fewshot` number of examples, and an appended prompt example.
+
+        :param doc: str
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param num_fewshot: int
+            The number of fewshot examples to provide in the returned context string.
+        :returns: str
+            The fewshot context.
+        """
+
+        if num_fewshot == 0:
+            # always prepend the (possibly empty) task description
+            labeled_examples = self.config.description
+        else:
+            labeled_examples = self.config.description + self.sampler.get_context(
+                doc, num_fewshot
+            )
+
+        example = self.doc_to_text(doc)
+        if type(example) == str:
+            return labeled_examples + example
+        elif type(example) == list:
+            return [labeled_examples + ex for ex in example]
+        elif type(example) == int:
+            if self.config.doc_to_choice is not None:
+                choices = self.doc_to_choice(doc)
+                return labeled_examples + choices[example]
+            else:
+                return labeled_examples + str(example)
+
    def apply_filters(self):
        if hasattr(self, "_filters"):
            for f in self._filters:
@@ -840,7 +905,10 @@ class ConfigurableTask(Task):
                    and (target_string[0] == "[")
                    and (target_string[-1] == "]")
                ):
-                    return ast.literal_eval(target_string)
+                    try:
+                        return ast.literal_eval(target_string)
+                    except (SyntaxError, ValueError):
+                        return target_string
                else:
                    return target_string
        elif type(doc_to_target) == list:
@@ -879,26 +947,6 @@ class ConfigurableTask(Task):
        else:
            raise TypeError

-    def gold_alias(self, doc):
-        # returns a version of the gold target answer to a document,
-        # which should be passed into metric for scoring as the ground truth.
-
-        # in multiple_choice tasks, this should be castable to an int corresponding to the index
-        # within the answer choices, while doc_to_target is the string version of {{answer_choices[gold]}}.
-        if self.config.gold_alias is not None:
-            doc_to_target = self.config.gold_alias
-        else:
-            return self.doc_to_target(doc)
-
-        if type(doc_to_target) == str:
-            return utils.apply_template(doc_to_target, doc)
-        elif callable(doc_to_target):
-            return doc_to_target(doc)
-        elif hasattr(doc_to_target, "apply"):
-            return doc_to_target.apply(doc)[1]
-        else:
-            raise TypeError
-
    def construct_requests(
        self, doc: dict, ctx: str, **kwargs
    ) -> Union[List[Instance], Instance]:
@@ -949,7 +997,7 @@ class ConfigurableTask(Task):
                )
            return request_list

-        elif self.OUTPUT_TYPE == "greedy_until":
+        elif self.OUTPUT_TYPE == "generate_until":
            arguments = (ctx, self.config.generation_kwargs)

        return Instance(
@@ -1073,8 +1121,9 @@ class ConfigurableTask(Task):
                acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0
                result_dict["acc_mutual_info"] = acc_mutual_info

-        elif self.OUTPUT_TYPE == "greedy_until":
+        elif self.OUTPUT_TYPE == "generate_until":
            gold = self.doc_to_target(doc)
+            result = results[0]
            if self.config.doc_to_choice is not None:
                # If you set doc_to_choice,
                # it assumes that doc_to_target returns a number.
@@ -1083,10 +1132,10 @@ class ConfigurableTask(Task):
            # we expect multiple_targets to be a list.
            elif self.multiple_target:
                gold = list(gold)
-            else:
-                gold = str(gold)
+            elif type(gold) != type(result):
+                # cast gold to the same type as result
+                gold = type(result)(gold)

-            result = results[0]
            for metric in self._metric_fn_list.keys():
                if self.multiple_target:
                    # in the case where we have multiple targets,
@@ -1136,7 +1185,7 @@ class ConfigurableTask(Task):
        else:
            raise ValueError(
                f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
-                "'loglikelihood', 'loglikelihood_rolling', 'greedy_until' or 'multiple_choice'",
+                "'loglikelihood', 'loglikelihood_rolling', 'generate_until' or 'multiple_choice'",
            )

        return result_dict