solved merge conflict

0d1ef037 · lintangsutawika · aa44be3f · ada4a31d · 0d1ef037 · 0d1ef037
Commit 0d1ef037 authored Jan 17, 2024 by lintangsutawika
20 changed files
--- a/.github/workflows/new_tasks.yml
+++ b/.github/workflows/new_tasks.yml
@@ -56,7 +56,7 @@ jobs:
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
            python -m pip install --upgrade pip
-            pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
+            pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu
    #   Install optional git dependencies
    #       pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
    #       if [ -f requirements.txt ]; then pip install -r requirements.txt; fi

--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -17,29 +17,22 @@ jobs:
  linter:
    name: Linters
    runs-on: ubuntu-latest
-    timeout-minutes: 20
+    timeout-minutes: 5

    steps:
    - name: Checkout Code
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
    - name: Set up Python 3.8
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
      with:
        python-version: 3.8
        cache: pip
-        cache-dependency-path: setup.py
-    - name: Install dependencies
-      run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu ; export SKIP=no-commit-to-branch # env var deactivates --no-commit-to-branch
+        cache-dependency-path: pyproject.toml
    - name: Pre-Commit
+      env:
+        SKIP: "no-commit-to-branch,mypy"
+
      uses: pre-commit/action@v3.0.0
-    - name: Lint with pylint
-      run: python -m pylint --disable=all -e W0311 --jobs=0 --indent-string='    ' **/*.py
-    - name: Lint with flake8
-      run: |
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
 #       # mypy turned off for now
 #    - name: Lint with mypy
 #      run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
@@ -53,22 +46,22 @@ jobs:
    timeout-minutes: 30
    steps:
    - name: Checkout Code
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
        cache: pip
-        cache-dependency-path: setup.py
+        cache-dependency-path: pyproject.toml
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
-        pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install -e '.[dev,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
 #         Install optional git dependencies
 #                pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
 #        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Test with pytest
-      run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/tests_master --ignore=tests/extra
+      run: python -m pytest --showlocals -s -vv -n=auto
    - name: Archive artifacts
      uses: actions/upload-artifact@v3
      with:

--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,6 +17,7 @@ repos:
      - id: detect-private-key
      - id: end-of-file-fixer
      - id: no-commit-to-branch
+        always_run: false
      - id: requirements-txt-fixer
      - id: trailing-whitespace
        args: [--markdown-linebreak-ext=md]
@@ -26,14 +27,16 @@ repos:
        args: [--remove]
      - id: mixed-line-ending
        args: [--fix=lf]
-  - repo: https://github.com/pycqa/flake8
-    rev: 3.7.9
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.1.8
    hooks:
-      - id: flake8
-  - repo: https://github.com/psf/black
-    rev: 22.3.0
-    hooks:
-      - id: black
+      # Run the linter.
+      - id: ruff
+        args:
+          - --fix
+        # Run the formatter.
+      - id: ruff-format
  - repo: https://github.com/codespell-project/codespell
    rev: v2.1.0
    hooks:

--- a/CITATION.bib
+++ b/CITATION.bib
-@software{eval-harness,
-  author       = {Gao, Leo and
-                  Tow, Jonathan and
-                  Biderman, Stella and
-                  Black, Sid and
-                  DiPofi, Anthony and
-                  Foster, Charles and
-                  Golding, Laurence and
-                  Hsu, Jeffrey and
-                  McDonell, Kyle and
-                  Muennighoff, Niklas and
-                  Phang, Jason and
-                  Reynolds, Laria and
-                  Tang, Eric and
-                  Thite, Anish and
-                  Wang, Ben and
-                  Wang, Kevin and
-                  Zou, Andy},
+@misc{eval-harness,
+  author       = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
  title        = {A framework for few-shot language model evaluation},
-  month        = sep,
-  year         = 2021,
+  month        = 12,
+  year         = 2023,
  publisher    = {Zenodo},
-  version      = {v0.0.1},
-  doi          = {10.5281/zenodo.5371628},
-  url          = {https://doi.org/10.5281/zenodo.5371628}
+  version      = {v0.4.0},
+  doi          = {10.5281/zenodo.10256836},
+  url          = {https://zenodo.org/records/10256836}
 }
--- a/CODEOWNERS
+++ b/CODEOWNERS
-* @haileyschoelkopf @lintangsutawika @StellaAthena
+* @haileyschoelkopf @lintangsutawika
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ New updates and features include:

 Please see our updated documentation pages in `docs/` for more details.

-Development will be continuing on the `main` branch, and we encourage you to give us feedback on what features are desired and how to improve the library further, or ask questions, either in issues or PRs on GitHub, or in the [EleutherAI discord](discord.gg/eleutherai)!
+Development will be continuing on the `main` branch, and we encourage you to give us feedback on what features are desired and how to improve the library further, or ask questions, either in issues or PRs on GitHub, or in the [EleutherAI discord](https://discord.gg/eleutherai)!

 ## Overview

@@ -28,13 +28,13 @@ This project provides a unified framework to test generative language models on
 - Over 60 standard academic benchmarks for LLMs, with hundreds of subtasks and variants implemented.
 - Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
 - Support for fast and memory-efficient inference with [vLLM](https://github.com/vllm-project/vllm).
- Support for commercial APIs including [OpenAI](https://openai.com), [goose.ai](https://goose.ai), and [TextSynth](https://textsynth.com/).
+- Support for commercial APIs including [OpenAI](https://openai.com), and [TextSynth](https://textsynth.com/).
 - Support for evaluation on adapters (e.g. LoRA) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
 - Support for local models and benchmarks.
 - Evaluation with publicly available prompts ensures reproducibility and comparability between papers.
 - Easy support for custom prompts and evaluation metrics.

-The Language Model Evaluation Harness is the backend for 🤗 Hugging Face's popular [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), has been used in [hundreds of papers](https://scholar.google.com/scholar?oi=bibs&hl=en&authuser=2&cites=15052937328817631261,4097184744846514103,17476825572045927382,18443729326628441434,12854182577605049984) is used internally by dozens of companies including NVIDIA, Cohere, Nous Research, Booz Allen Hamilton, and Mosaic ML.
+The Language Model Evaluation Harness is the backend for 🤗 Hugging Face's popular [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), has been used in [hundreds of papers](https://scholar.google.com/scholar?oi=bibs&hl=en&authuser=2&cites=15052937328817631261,4097184744846514103,1520777361382155671,17476825572045927382,18443729326628441434,14801318227356878622,7890865700763267262,12854182577605049984,15641002901115500560,5104500764547628290), and is used internally by dozens of organizations including NVIDIA, Cohere, BigScience, BigCode, Nous Research, and Mosaic ML.

 ## Install

@@ -49,23 +49,28 @@ pip install -e .
 We also provide a number of optional dependencies for extended functionality. Extras can be installed via `pip install -e ".[NAME]"`

 | Name          | Use                                   |
-| ------------- | ------------------------------------- |
+|---------------|---------------------------------------|
 | anthropic     | For using Anthropic's models          |
-| dev           | You probably don't want to use this   |
+| dev           | For linting PRs and contributions     |
 | gptq          | For loading models with GPTQ          |
-| testing       | You probably don't want to use this   |
+| ifeval        | For running the IFEval task           |
+| mamba         | For loading Mamba SSM models          |
+| math          | For running math task answer checking |
 | multilingual  | For multilingual tokenizers           |
 | openai        | For using OpenAI's models             |
-| promptsource  | For using PromtSource prompts         |
+| promptsource  | For using PromptSource prompts        |
 | sentencepiece | For using the sentencepiece tokenizer |
+| testing       | For running library test suite        |
 | vllm          | For loading models with vLLM          |
-| all           | Loads all extras                      |
+| zeno          | For visualizing results with Zeno     |
+|---------------|---------------------------------------|
+| all           | Loads all extras (not recommended)    |

 ## Basic Usage

 ### Hugging Face `transformers`

-To evaluate a model hosted on the [HuggingFace Hub](https://huggingface.co/models) (e.g. GPT-J-6B) on `hellaswag` you can use the following command:
+To evaluate a model hosted on the [HuggingFace Hub](https://huggingface.co/models) (e.g. GPT-J-6B) on `hellaswag` you can use the following command (this assumes you are using a CUDA-compatible GPU):

 ```bash
 lm_eval --model hf \
@@ -97,40 +102,52 @@ lm_eval --model hf \
    --batch_size auto:4
 ```

-Alternatively, you can use `lm-eval` instead of `lm_eval`.
+The full list of supported arguments are provided [here](./docs/interface.md), and on the terminal by calling `lm_eval -h`. Alternatively, you can use `lm-eval` instead of `lm_eval`.

 > [!Note]
 > Just like you can provide a local path to `transformers.AutoModel`, you can also provide a local path to `lm_eval` via `--model_args pretrained=/path/to/model`

 #### Multi-GPU Evaluation with Hugging Face `accelerate`

-To parallelize evaluation of HuggingFace models across multiple GPUs, we leverage the [accelerate 🚀](https://github.com/huggingface/accelerate) library as follows:
+We support two main ways of using Hugging Face's [accelerate 🚀](https://github.com/huggingface/accelerate) library for multi-GPU evaluation.
+
+To perform *data-parallel evaluation* (where each GPU loads a **separate full copy** of the model), we leverage the `accelerate` launcher as follows:

 ```
 accelerate launch -m lm_eval --model hf \
    --tasks lambada_openai,arc_easy \
    --batch_size 16
 ```
+(or via `accelerate launch --no-python lm_eval`).
+
+For cases where your model can fit on a single GPU, this allows you to evaluate on K GPUs K times faster than on one.
+
+**WARNING**: This setup does not work with FSDP model sharding, so in `accelerate config` FSDP must be disabled, or the NO_SHARD FSDP option must be used.
+
+The second way of using `accelerate` for multi-GPU evaluation is when your model is *too large to fit on a single GPU.*
+
+In this setting, run the library *outside of the `accelerate` launcher*, but passing `parallelize=True` to `--model_args` as follows:

-This will perform *data-parallel evaluation*: that is, placing a **single full copy** of your model onto each available GPU and *splitting batches across GPUs* to evaluate on K GPUs K times faster than on one.
+```
+lm_eval --model hf \
+    --tasks lambada_openai,arc_easy \
+    --model_args parallelize=True \
+    --batch_size 16
+```

-If your model is *is too large to be run on a single one of your GPUs* then you can use `accelerate` with Fully Sharded Data Parallel (FSDP) that splits the weights of the model across your data parallel ranks. To enable this, ensure you select `YES` when asked ```Do you want to use FullyShardedDataParallel?``` when running `accelerate config`. To enable memory-efficient loading, select `YES` when asked `Do you want each individually wrapped FSDP unit to broadcast module parameters from rank 0 at the start?`. This will ensure only the rank 0 process loads the model and then broadcasts the parameters to the other ranks instead of having each rank load all parameters which can lead to large RAM usage spikes around the start of the script that may cause errors.
+This means that your model's weights will be split across all available GPUs.

-To pass even more advanced keyword arguments to `accelerate`, we allow for the following arguments as well:
+For more advanced users or even larger models, we allow for the following arguments when `parallelize=True` as well:
 - `device_map_option`: How to split model weights across available GPUs. defaults to "auto".
 - `max_memory_per_gpu`: the max GPU memory to use per GPU in loading the model.
 - `max_cpu_memory`: the max amount of CPU memory to use when offloading the model weights to RAM.
 - `offload_folder`: a folder where model weights will be offloaded to disk if needed.

-To use `accelerate` with the `lm-eval` command, use
-```
-accelerate launch --no_python lm-eval --model ...
-```
-
+These two options (`accelerate launch` and `parallelize=True`) are mutually exclusive.

 ### Tensor + Data Parallel and Optimized Inference with `vLLM`

-We also support vLLM for faster inference on [supported model types](https://docs.vllm.ai/en/latest/models/supported_models.html). For single-GPU or multi-GPU — tensor parallel, data parallel, or a combination of both — inference, for example:
+We also support vLLM for faster inference on [supported model types](https://docs.vllm.ai/en/latest/models/supported_models.html), especially faster when splitting a model across multiple GPUs. For single-GPU or multi-GPU — tensor parallel, data parallel, or a combination of both — inference, for example:

 ```bash
 lm_eval --model vllm \
@@ -140,7 +157,7 @@ lm_eval --model vllm \
 ```
 For a full list of supported vLLM configurations, please reference our vLLM integration and the vLLM documentation.

-vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation, and provide a script at [./scripts/model_comparator.py](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/scripts/model_comparator.py) for checking validity of vllm results against HF.
+vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation, and provide a [script](./scripts/model_comparator.py) for checking the validity of vllm results against HF.

 ### Model APIs and Inference Servers

@@ -151,24 +168,28 @@ To call a hosted model, use:
 ```bash
 export OPENAI_API_KEY=YOUR_KEY_HERE
 lm_eval --model openai-completions \
-    --model_args engine=davinci \
+    --model_args model=davinci \
    --tasks lambada_openai,hellaswag
 ```

-Note that for externally hosted models, configs such as `--device` and `--batch_size` should not be used and do not function. Just like you can use `--model_args` to pass arbitrary arguments to the model constructor for local models, you can use it to pass arbitrary arguments to the model API for hosted models. See the documentation of the hosting service for information on what arguments they support.
+We also support using your own local inference server with an implemented version of the OpenAI ChatCompletions endpoint and passing trained HuggingFace artifacts and tokenizers.

+```bash
+lm_eval --model local-chat-completions --tasks gsm8k --model_args model=facebook/opt-125m,base_url=http://{yourip}:8000/v1
+```
+Note that for externally hosted models, configs such as `--device` and `--batch_size` should not be used and do not function. Just like you can use `--model_args` to pass arbitrary arguments to the model constructor for local models, you can use it to pass arbitrary arguments to the model API for hosted models. See the documentation of the hosting service for information on what arguments they support.

-| API or Inference Server     | Implemented?                    | `--model <xxx>` name                                                             | Models supported:                                                                             | Request Types:                                           |
-|-----------------------------|---------------------------------|----------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------|----------------------------------------------------------|
-| OpenAI Completions          | :heavy_check_mark:              | `openai`, `openai-completions`, `gooseai`                                        | up to `code-davinci-002`                                                                      | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
-| OpenAI ChatCompletions      | :x: Not yet - needs testing!       | N/A                                                                              | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt)                 | `generate_until` (no logprobs)                             |
-| Anthropic                   | :heavy_check_mark:              | `anthropic`                                                                      | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model)  | `generate_until` (no logprobs)                             |
-| GooseAI                     | :heavy_check_mark: (not separately maintained)  | `openai`, `openai-completions`, `gooseai` (same interface as OpenAI Completions) |                                                                                               | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
-| Textsynth                   | :heavy_check_mark:                   | `textsynth`                                                                      | [All supported engines](https://textsynth.com/documentation.html#engines)                                                                                           | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
-| Cohere                      | [:hourglass: - blocked on Cohere API bug](https://github.com/EleutherAI/lm-evaluation-harness/pull/395) | N/A                                                                              | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models)                        | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
-| [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python))                        | :heavy_check_mark:              | `gguf`, `ggml`                                                                   | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp)               | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
-| vLLM                        | :heavy_check_mark:       | `vllm`                                                                           | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                             |
-| Your inference server here! | ...                             | ...                                                                              | ...                                                                                           | ...                                                      |                                | ...                                                      |
+| API or Inference Server                                                                                                   | Implemented?                    | `--model <xxx>` name                                                | Models supported:                                                                             | Request Types:                                             |
+|---------------------------------------------------------------------------------------------------------------------------|---------------------------------|---------------------------------------------------------------------|-----------------------------------------------------------------------------------------------|------------------------------------------------------------|
+| OpenAI Completions                                                                                                        | :heavy_check_mark:              | `openai-completions` | up to `code-davinci-002`                                                                      | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
+| OpenAI ChatCompletions                                                                                                    | :heavy_check_mark:        | `openai-chat-completions`, `local-chat-completions`                                                               | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt)                 | `generate_until` (no logprobs)                             |
+| Anthropic                                                                                                                 | :heavy_check_mark:              | `anthropic`                                                         | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model)  | `generate_until` (no logprobs)                             |
+| Textsynth                                                                                                                 | :heavy_check_mark:                   | `textsynth`                                                         | [All supported engines](https://textsynth.com/documentation.html#engines)                     | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
+| Cohere                                                                                                                    | [:hourglass: - blocked on Cohere API bug](https://github.com/EleutherAI/lm-evaluation-harness/pull/395) | N/A                                                                 | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models)                        | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
+| [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark:              | `gguf`, `ggml`                                                      | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp)                   | `generate_until`, `loglikelihood`, (perplexity evaluation not yet implemented) |
+| vLLM                                                                                                                      | :heavy_check_mark:       | `vllm`                                                              | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
+| Mamba                       | :heavy_check_mark:       | `mamba_ssm`                                                                      | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                             |
+| Your local inference server!                                                                                              | :heavy_check_mark:                             | `local-chat-completions` (using `openai-chat-completions` model type)    | Any server address that accepts GET requests using HF models and mirror's OpenAI's ChatCompletions interface                                  | `generate_until`                                           |                                | ...                                                      |

 It is on our roadmap to create task variants designed to enable models which do not serve logprobs/loglikelihoods to be compared with generation performance of open-source models.

@@ -176,6 +197,8 @@ It is on our roadmap to create task variants designed to enable models which do

 A number of other libraries contain scripts for calling the eval harness through their library. These include [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py), [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md), and [mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/blob/master/eval_harness.py).

+To create your own custom integration you can follow instructions from [this tutorial](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage).
+
 ### Additional Features

 If you have a Metal compatible Mac, you can run the eval harness using the MPS back-end by replacing `--device cuda:0` with `--device mps` (requires PyTorch version 2.1 or higher).
@@ -210,11 +233,11 @@ lm_eval --model hf \
    --device cuda:0
 ```

-[GPTQ](https://github.com/PanQiWei/AutoGPTQ) quantized models can be loaded by specifying their file names in `,gptq=NAME` (or `,gptq=True` for default names) in the `model_args` argument:
+[GPTQ](https://github.com/PanQiWei/AutoGPTQ) quantized models can be loaded by specifying their file names in `,autogptq=NAME` (or `,autogptq=True` for default names) in the `model_args` argument:

 ```bash
 lm_eval --model hf \
-    --model_args pretrained=model-name-or-path,gptq=model.safetensors,gptq_use_triton=True \
+    --model_args pretrained=model-name-or-path,autogptq=model.safetensors,gptq_use_triton=True \
    --tasks hellaswag
 ```

@@ -226,9 +249,50 @@ Additionally, one can provide a directory with `--use_cache` to cache the result

 For a full list of supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md) guide in our documentation!

+## Visualizing Results
+
+You can use [Zeno](https://zenoml.com) to visualize the results of your eval harness runs.
+
+First, head to [hub.zenoml.com](https://hub.zenoml.com) to create an account and get an API key [on your account page](https://hub.zenoml.com/account).
+Add this key as an environment variable:
+
+```bash
+export ZENO_API_KEY=[your api key]
+```
+
+You'll also need to install the `lm_eval[zeno]` package extra.
+
+To visualize the results, run the eval harness with the `log_samples` and `output_path` flags.
+We expect `output_path` to contain multiple folders that represent individual model names.
+You can thus run your evaluation on any number of tasks and models and upload all of the results as projects on Zeno.
+
+```bash
+lm_eval \
+    --model hf \
+    --model_args pretrained=EleutherAI/gpt-j-6B \
+    --tasks hellaswag \
+    --device cuda:0 \
+    --batch_size 8 \
+    --log_samples \
+    --output_path output/gpt-j-6B
+```
+
+Then, you can upload the resulting data using the `zeno_visualize` script:
+
+```bash
+python scripts/zeno_visualize.py \
+    --data_path output \
+    --project_name "Eleuther Project"
+```
+
+This will use all subfolders in `data_path` as different models and upload all tasks within these model folders to Zeno.
+If you run the eval harness on multiple tasks, the `project_name` will be used as a prefix and one project will be created per task.
+
+You can find an example of this workflow in [examples/visualize-zeno.ipynb](examples/visualize-zeno.ipynb).
+
 ## How to Contribute or Learn More?

-For more information on the library and how everything fits together, check out all of our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor/docs)! We plan to post a larger roadmap of desired + planned library improvements soon, with more information on how contributors can help.
+For more information on the library and how everything fits together, check out all of our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs)! We plan to post a larger roadmap of desired + planned library improvements soon, with more information on how contributors can help.

 ### Implementing new tasks


--- a/docs/README.md
+++ b/docs/README.md
@@ -4,7 +4,7 @@ Welcome to the docs for the LM Evaluation Harness!

 ## Table of Contents

-* To learn about the public interface of the library, as well as how to evaluate via the commandline or as integrated into an external library, see the [Interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/user_guide.md)
+* To learn about the public interface of the library, as well as how to evaluate via the commandline or as integrated into an external library, see the [Interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/interface.md)
 * To learn how to add a new library, API, or model type to the library, as well as a quick explainer on the types of ways to evaluate an LM, see the [Model Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/model_guide.md).
 * For a crash course on adding new tasks to the library, see our [New Task Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md).
 * To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Task Configuration Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/task_guide.md).
--- a/docs/new_task_guide.md
+++ b/docs/new_task_guide.md
@@ -46,16 +46,6 @@ dataset_name: ... # the dataset configuration to use. Leave `null` if your datas
 dataset_kwargs: null # any extra keyword arguments that should be passed to the dataset constructor, e.g. `data_dir`.
 ```

------------------------------
-**Tip:** To load a local dataset for evaluation, you can specify data files in the `dataset_kwargs` field, such as the following for JSON files:
-```
-dataset_path: json
-dataset_name: null
-dataset_kwargs:
-  data_files: /path/to/my/json
-```
-------------------------------
-
 Next, we'd like to tell our task what the dataset's train, validation, and test splits are named, if they exist:

 ```yaml
@@ -99,6 +89,36 @@ Now, in our YAML config file we'll use the `!function` constructor, and tell the
 process_docs: !function utils.process_docs
 ```

+### Using Local Datasets
+
+To load a local dataset for evaluation, you can specify data files in the `dataset_kwargs` field, such as the following for JSON files:
+
+```
+dataset_path: json
+dataset_name: null
+dataset_kwargs:
+  data_files: /path/to/my/json
+```
+Or with files already split into separate directories:
+
+```
+dataset_path: arrow
+dataset_kwargs:
+  data_files:
+    train: /path/to/arrow/train/data-00000-of-00001.arrow
+    validation: /path/to/arrow/validation/data-00000-of-00001.arrow
+```
+
+Alternatively, if you have previously downloaded a dataset from huggingface hub (using `save_to_disk()`) and wish to use the local files, you will need to use `data_dir` under `dataset_kwargs` to point to where the directory is.
+
+```
+dataset_path: hellaswag
+dataset_kwargs:
+  data_dir: hellaswag_local/
+```
+
+You can also set `dataset_path` as a directory path in your local system. This will assume that there is a loading script with the same name as the directory. [See datasets docs](https://huggingface.co/docs/datasets/loading#local-loading-script).
+
 ## Writing a Prompt Template

 The next thing we need to do is decide what format to use when presenting the data to the LM. This is our **prompt**, where we'll define both an input and output format.
@@ -315,6 +335,25 @@ python -m scripts.write_out \
 Open the file specified at the `--output_base_path <path>` and ensure it passes
 a simple eye test.

+## Versioning
+
+One key feature in LM Evaluation Harness is the ability to version tasks--that is, mark them with a specific version number that can be bumped whenever a breaking change is made.
+
+This version info can be provided by adding the following to your new task config file:
+
+```
+metadata:
+  version: 0
+```
+
+Now, whenever a change needs to be made to your task in the future, please increase the version number by 1 so that users can differentiate the different task iterations and versions.
+
+If you are incrementing a task's version, please also consider adding a changelog to the task's README.md noting the date, PR number, what version you have updated to, and a one-liner describing the change.
+
+for example,
+
+* \[Dec 25, 2023\] (PR #999) Version 0.0 -> 1.0: Fixed a bug with answer extraction that led to underestimated performance.
+
 ## Checking performance + equivalence

 It's now time to check models' performance on your task! In the evaluation harness, we intend to support a wide range of evaluation tasks and setups, but prioritize the inclusion of already-proven benchmarks following the precise evaluation setups in the literature where possible.
@@ -340,4 +379,4 @@ It is recommended to include a filled-out copy of this checklist in the README.m

 ## Submitting your task

-You're all set! Now push your work and make a pull request to the `big-refactor` branch! Thanks for the contribution :). If there are any questions, please leave a message in the `#lm-thunderdome` channel on the EAI discord!
+You're all set! Now push your work and make a pull request to the `main` branch! Thanks for the contribution :). If there are any questions, please leave a message in the `#lm-thunderdome` channel on the EAI discord!
--- a/docs/task_guide.md
+++ b/docs/task_guide.md
@@ -219,6 +219,49 @@ Aggregation functions:
 * `weighted_perplexity`
 * `bits_per_byte`

+### Adding a Multiple Choice Metric
+
+Adding a multiple choice metric has a few steps. To get it working you need to:
+
+1. register a metric function
+2. register an aggregation function
+3. update the `Task` definition to make sure the correct arguments are passed
+
+The default metric and aggregation functions are in `lm_eval/api/metrics.py`, and you can add a function there if it's for general use. The metrics are towards the bottom of the file and look like this:
+
+
+    @register_metric(
+        metric="mcc",
+        higher_is_better=True,
+        output_type="multiple_choice",
+        aggregation="matthews_corrcoef",
+    )
+    def mcc_fn(items):  # This is a passthrough function
+        return items
+
+Note that many of these are passthrough functions, and for multiple choice (at least) this function is never actually called.
+
+Aggregation functions are defined towards the top of the file, here's an example:
+
+    @register_aggregation("matthews_corrcoef")
+    def matthews_corrcoef(items):
+        unzipped_list = list(zip(*items))
+        golds = unzipped_list[0]
+        preds = unzipped_list[1]
+        return sklearn.metrics.matthews_corrcoef(golds, preds)
+
+This function returns a single numeric value. The input is defined in `Task.process_results` in `lm_eval/api/task.py`. There's a section that looks like this:
+
+
+    result_dict = {
+        **({"acc": acc} if "acc" in use_metric else {}),
+        **({"f1": (gold, pred)} if "f1" in use_metric else {}),
+        **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
+        **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
+        **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
+    }
+
+The value here determines the input to the aggregation function, though the name used matches the metric function. These metrics all have simple needs and just need the accuracy or gold and predicted values, but immediately below this there are examples of metrics with more complicated needs you can use as reference.

 ## Good Reference Tasks

@@ -258,6 +301,23 @@ task:
  - hendrycksTest*
 ```

+It is also possible to list an existing task in your benchmark configuration with some adjustments. For example, a few tasks from mmlu is included `multimedqa`. There, the `task_alias` and `group_alias` (See [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#beautifying-table-display) for more details) are modified to suit the benchmark.
+
+```yaml
+group: multimedqa
+task:
+  - pubmedqa
+  - medmcqa
+  - medqa_4options
+  - task: mmlu_anatomy
+    task_alias: "anatomy (mmlu)"
+    group_alias: null
+  - task: mmlu_clinical_knowledge
+    task_alias: "clinical_knowledge (mmlu)"
+    group_alias: null
+  ...
+```
+
 Alternatively, benchmarks can have tasks that are customizable for each task. They can be defined like how a yaml task is usually set.

 ```yaml

--- a/examples/visualize-zeno.ipynb
+++ b/examples/visualize-zeno.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Visualizing Results in Zeno\n",
+    "\n",
+    "Benchmarking your models is the first step towards making sure your model performs well.\n",
+    "However, looking at the data behind the benchmark, slicing the data into subsets, and comparing models on individual instances can help you even more in evaluating and quantifying the behavior of your AI system.\n",
+    "\n",
+    "All of this can be done in [Zeno](https://zenoml.com)!\n",
+    "Zeno is super easy to use with the eval harness, let's explore how you can easily upload and visualize your eval results.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install this project if you did not already do that. This is all that needs to be installed for you to be able to visualize your data in Zeno!\n",
+    "!pip install -e ..\n",
+    "!pip install -e ..[zeno]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Run the Eval Harness\n",
+    "\n",
+    "To visualize the results, run the eval harness with the `log_samples` and `output_path` flags. We expect `output_path` to contain multiple folders that represent individual model names. You can thus run your evaluation on any number of tasks and models and upload all of the results as projects on Zeno.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!lm_eval \\\n",
+    "    --model hf \\\n",
+    "    --model_args pretrained=EleutherAI/gpt-neo-2.7B \\\n",
+    "    --tasks hellaswag,wikitext \\\n",
+    "    --batch_size 8 \\\n",
+    "    --device mps \\\n",
+    "    --log_samples \\\n",
+    "    --output_path output/gpt-neo-2.7B \\\n",
+    "    --limit 10"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Set your API Key\n",
+    "\n",
+    "This is so you can be authenticated with Zeno.\n",
+    "If you don't already have a Zeno account, first create an account on [Zeno Hub](https://hub.zenoml.com).\n",
+    "After logging in to Zeno Hub, generate your API key by clicking on your profile at the bottom left to navigate to your account page.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%env ZENO_API_KEY=YOUR_API_KEY"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Visualize Eval Results\n",
+    "\n",
+    "You can now use the `zeno_visualize` script to upload the results to Zeno.\n",
+    "\n",
+    "This will use all subfolders in `data_path` as different models and upload all tasks within these model folders to Zeno. If you run the eval harness on multiple tasks, the `project_name` will be used as a prefix and one project will be created per task.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!python ../scripts/zeno_visualize.py --data_path output --project_name \"Zeno Upload Test\""
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "zeno_projects",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/lm_eval/__main__.py
+++ b/lm_eval/__main__.py
+import argparse
+import json
+import logging
 import os
 import re
 import sys
-import json
-import logging
-import argparse
-import numpy as np
-
 from pathlib import Path
 from typing import Union

+import numpy as np
+
 from lm_eval import evaluator, utils
-from lm_eval.tasks import initialize_tasks, include_path
 from lm_eval.api.registry import ALL_TASKS
+from lm_eval.tasks import include_path, initialize_tasks
+from lm_eval.utils import make_table


 def _handle_non_serializable(o):
@@ -25,73 +26,93 @@ def _handle_non_serializable(o):

 def parse_eval_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
-    parser.add_argument("--model", default="hf", help="Name of model e.g. `hf`")
+    parser.add_argument("--model", "-m", default="hf", help="Name of model e.g. `hf`")
    parser.add_argument(
        "--tasks",
+        "-t",
        default=None,
+        metavar="task1,task2",
        help="To get full list of tasks, use the command lm-eval --tasks list",
    )
    parser.add_argument(
        "--model_args",
+        "-a",
        default="",
-        help="String arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
+        help="Comma separated string arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
    )
    parser.add_argument(
        "--num_fewshot",
+        "-f",
        type=int,
        default=None,
+        metavar="N",
        help="Number of examples in few-shot context",
    )
-    parser.add_argument("--batch_size", type=str, default=1)
+    parser.add_argument(
+        "--batch_size",
+        "-b",
+        type=str,
+        default=1,
+        metavar="auto|auto:N|N",
+        help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
+    )
    parser.add_argument(
        "--max_batch_size",
        type=int,
        default=None,
-        help="Maximal batch size to try with --batch_size auto",
+        metavar="N",
+        help="Maximal batch size to try with --batch_size auto.",
    )
    parser.add_argument(
        "--device",
        type=str,
        default=None,
-        help="Device to use (e.g. cuda, cuda:0, cpu)",
+        help="Device to use (e.g. cuda, cuda:0, cpu).",
    )
    parser.add_argument(
        "--output_path",
+        "-o",
        default=None,
        type=str,
-        metavar="= [dir/file.jsonl] [DIR]",
+        metavar="DIR|DIR/file.json",
        help="The path to the output file where the result metrics will be saved. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
    )
    parser.add_argument(
        "--limit",
+        "-L",
        type=float,
        default=None,
+        metavar="N|0<N<1",
        help="Limit the number of examples per task. "
        "If <1, limit is a percentage of the total number of examples.",
    )
    parser.add_argument(
        "--use_cache",
+        "-c",
        type=str,
        default=None,
+        metavar="DIR",
        help="A path to a sqlite db file for caching model responses. `None` if not caching.",
    )
    parser.add_argument("--decontamination_ngrams_path", default=None)  # TODO: not used
    parser.add_argument(
        "--check_integrity",
        action="store_true",
-        help="Whether to run the relevant part of the test suite for the tasks",
+        help="Whether to run the relevant part of the test suite for the tasks.",
    )
    parser.add_argument(
        "--write_out",
+        "-w",
        action="store_true",
        default=False,
-        help="Prints the prompt for the first few documents",
+        help="Prints the prompt for the first few documents.",
    )
    parser.add_argument(
        "--log_samples",
+        "-s",
        action="store_true",
        default=False,
-        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis",
+        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
    )
    parser.add_argument(
        "--show_config",
@@ -103,21 +124,24 @@ def parse_eval_args() -> argparse.Namespace:
        "--include_path",
        type=str,
        default=None,
+        metavar="DIR",
        help="Additional path to include if there are external tasks to include.",
    )
    parser.add_argument(
        "--gen_kwargs",
-        default="",
+        default=None,
        help=(
            "String arguments for model generation on greedy_until tasks,"
-            " e.g. `temperature=0,top_k=0,top_p=0`"
+            " e.g. `temperature=0,top_k=0,top_p=0`."
        ),
    )
    parser.add_argument(
        "--verbosity",
-        type=str,
+        "-v",
+        type=str.upper,
        default="INFO",
-        help="Log error when tasks are not registered.",
+        metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
+        help="Controls the reported logging error level. Set to DEBUG when testing + adding new task configurations for comprehensive log output.",
    )
    return parser.parse_args()

@@ -147,7 +171,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
        task_names = ALL_TASKS
    elif args.tasks == "list":
        eval_logger.info(
-            "Available Tasks:\n - {}".format(f"\n - ".join(sorted(ALL_TASKS)))
+            "Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS)))
        )
        sys.exit()
    else:
@@ -179,7 +203,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
                    f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
                )
                raise ValueError(
-                    f"Tasks {missing} were not found. Try `lm-eval --tasks list` for list of available tasks."
+                    f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues."
                )

    if args.output_path:
@@ -224,7 +248,9 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
    if results is not None:
        if args.log_samples:
            samples = results.pop("samples")
-        dumped = json.dumps(results, indent=2, default=_handle_non_serializable)
+        dumped = json.dumps(
+            results, indent=2, default=_handle_non_serializable, ensure_ascii=False
+        )
        if args.show_config:
            print(dumped)

@@ -240,17 +266,20 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
                    )
                    filename = path.joinpath(f"{output_name}.jsonl")
                    samples_dumped = json.dumps(
-                        samples[task_name], indent=2, default=_handle_non_serializable
+                        samples[task_name],
+                        indent=2,
+                        default=_handle_non_serializable,
+                        ensure_ascii=False,
                    )
-                    filename.open("w").write(samples_dumped)
+                    filename.write_text(samples_dumped, encoding="utf-8")

        print(
            f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
            f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
        )
-        print(evaluator.make_table(results))
+        print(make_table(results))
        if "groups" in results:
-            print(evaluator.make_table(results, "groups"))
+            print(make_table(results, "groups"))


 if __name__ == "__main__":

--- a/lm_eval/api/filter.py
+++ b/lm_eval/api/filter.py
 from dataclasses import dataclass
 from typing import List

-from lm_eval.api.instance import Instance
 from datasets import Dataset

+from lm_eval.api.instance import Instance
+

 class Filter:
    """
@@ -42,7 +43,6 @@ class FilterEnsemble:
    filters: List[Filter]

    def apply(self, instances: List[Instance], docs: List[Dataset]) -> None:
-
        resps = [
            inst.resps for inst in instances
        ]  # operate just on the model responses

--- a/lm_eval/api/metrics.py
+++ b/lm_eval/api/metrics.py
+import logging
 import math
+import random
 from collections.abc import Iterable
 from collections import defaultdict
+
+import evaluate
 import numpy as np
 import sacrebleu
 import sklearn.metrics
-import random
-import evaluate

-from lm_eval.api.registry import register_metric, register_aggregation
+from lm_eval.api.registry import register_aggregation, register_metric

-import logging

 eval_logger = logging.getLogger("lm-eval")

+
 # Register Aggregations First
 @register_aggregation("mean")
 def mean(arr):

--- a/lm_eval/api/model.py
+++ b/lm_eval/api/model.py
 import abc
+import hashlib
+import json
+import logging
 import os
+from typing import List, Optional, Tuple, Type, TypeVar

-import torch
-from typing import Union, List, Tuple, Optional, Type, TypeVar
 from sqlitedict import SqliteDict
-import json
-import hashlib
-
 from tqdm import tqdm

 from lm_eval import utils

-import logging

 eval_logger = logging.getLogger("lm-eval")


--- a/lm_eval/api/registry.py
+++ b/lm_eval/api/registry.py
-import os
+import logging
+
 import evaluate
+
 from lm_eval.api.model import LM

-import logging

 eval_logger = logging.getLogger("lm-eval")

@@ -91,7 +92,6 @@ DEFAULT_METRIC_REGISTRY = {
 def register_metric(**args):
    # TODO: do we want to enforce a certain interface to registered metrics?
    def decorate(fn):
-
        assert "metric" in args
        name = args["metric"]

@@ -100,7 +100,6 @@ def register_metric(**args):
            ("higher_is_better", HIGHER_IS_BETTER_REGISTRY),
            ("aggregation", METRIC_AGGREGATION_REGISTRY),
        ]:
-
            if key in args:
                value = args[key]
                assert (
@@ -120,7 +119,6 @@ def register_metric(**args):


 def get_metric(name, hf_evaluate_metric=False):
-
    if not hf_evaluate_metric:
        if name in METRIC_REGISTRY:
            return METRIC_REGISTRY[name]
@@ -151,7 +149,6 @@ def register_aggregation(name):


 def get_aggregation(name):
-
    try:
        return AGGREGATION_REGISTRY[name]
    except KeyError:
@@ -161,7 +158,6 @@ def get_aggregation(name):


 def get_metric_aggregation(name):
-
    try:
        return METRIC_AGGREGATION_REGISTRY[name]
    except KeyError:

--- a/lm_eval/api/samplers.py
+++ b/lm_eval/api/samplers.py
@@ -40,18 +40,18 @@ class ContextSampler:
                        self.doc_to_text(doc)
                        if (
                            self.config.doc_to_choice is None
-                            or type(self.doc_to_text(doc)) is str
+                            or isinstance(self.doc_to_text(doc), str)
                        )
                        else self.doc_to_choice(doc)[self.doc_to_text(doc)]
                    )
                    + self.target_delimiter
                    + (
                        str(self.doc_to_target(doc)[0])
-                        if type(self.doc_to_target(doc)) is list
+                        if isinstance(self.doc_to_target(doc), list)
                        else self.doc_to_target(doc)
                        if (
                            self.config.doc_to_choice is None
-                            or type(self.doc_to_target(doc)) is str
+                            or isinstance(self.doc_to_target(doc), str)
                        )
                        else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])
                    )
@@ -77,8 +77,8 @@ class FirstNSampler(ContextSampler):
        Draw the first `n` samples in order from the specified split.
        Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU.
        """
-        assert n <= len(
-            self.docs
+        assert (
+            n <= len(self.docs)
        ), f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available."
        return self.docs[:n]


--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
 import abc
-from dataclasses import dataclass, field, asdict
-
-import os
-import re
 import ast
-import yaml
 import logging
-import evaluate
 import random
-import itertools
-import functools
-from tqdm import tqdm
+import re
+from collections.abc import Callable
+from dataclasses import asdict, dataclass
+from typing import Any, List, Literal, Tuple, Union

 import datasets
 import numpy as np

-from typing import Union, List, Any, Tuple, Literal
-from collections.abc import Callable
-
 from lm_eval import utils
 from lm_eval.api import samplers
 from lm_eval.api.instance import Instance
-from lm_eval.api.filter import FilterEnsemble
-
-from lm_eval.prompts import get_prompt
-from lm_eval.filters import build_filter_ensemble
 from lm_eval.api.metrics import (
+    bits_per_byte,
    mean,
    weighted_perplexity,
-    bits_per_byte,
-    metric_max_over_ground_truths,
 )
 from lm_eval.api.registry import (
-    get_metric,
+    AGGREGATION_REGISTRY,
+    DEFAULT_METRIC_REGISTRY,
    get_aggregation,
+    get_metric,
    get_metric_aggregation,
    is_higher_better,
-    DEFAULT_METRIC_REGISTRY,
-    OUTPUT_TYPE_REGISTRY,
-    AGGREGATION_REGISTRY,
 )
+from lm_eval.filters import build_filter_ensemble
+from lm_eval.prompts import get_prompt
+

 ALL_OUTPUT_TYPES = [
    "loglikelihood",
@@ -97,12 +86,6 @@ class TaskConfig(dict):
    ] = None  # by default, not used in the code. allows for users to pass arbitrary info to tasks

    def __post_init__(self) -> None:
-        if self.dataset_path and os.path.exists(os.path.dirname(self.dataset_path)):
-            import inspect
-            from importlib import import_module
-
-            self.dataset_path = inspect.getfile(import_module(self.dataset_path))
-
        if self.generation_kwargs is not None:
            if self.output_type != "generate_until":
                eval_logger.warning(
@@ -349,9 +332,7 @@ class Task(abc.ABC):
        elif self.has_validation_docs():
            docs = self.validation_docs()
        else:
-            assert (
-                False
-            ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
+            assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"

        eval_logger.info(f"Building contexts for task on rank {rank}...")

@@ -546,6 +527,10 @@ class ConfigurableTask(Task):
                "Must pass a config to ConfigurableTask, either in cls.CONFIG or `config` kwarg"
            )

+        if isinstance(self.config.metadata, dict):
+            if "version" in self.config.metadata:
+                self.VERSION = self.config.metadata["version"]
+
        if self.config.output_type is not None:
            assert self.config.output_type in ALL_OUTPUT_TYPES
            self.OUTPUT_TYPE = self.config.output_type
@@ -603,9 +588,9 @@ class ConfigurableTask(Task):

                if "aggregation" in metric_config:
                    agg_name = metric_config["aggregation"]
-                    if type(agg_name) == str:
+                    if isinstance(agg_name, str):
                        self._aggregation_list[metric_name] = get_aggregation(agg_name)
-                    elif callable(agg_name):
+                    elif callable(agg_name):  # noqa: E721
                        self._aggregation_list[metric_name] = metric_config[
                            "aggregation"
                        ]
@@ -672,9 +657,7 @@ class ConfigurableTask(Task):
        elif self.has_validation_docs():
            self.task_docs = self.validation_docs()
        else:
-            assert (
-                False
-            ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
+            assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"

        # Test One Doc
        self.features = list(self.task_docs.features.keys())
@@ -686,20 +669,20 @@ class ConfigurableTask(Task):

        if self.config.doc_to_choice is not None:
            test_choice = self.doc_to_choice(test_doc)
-            if type(test_choice) is not list:
+            if not isinstance(test_choice, list):
                eval_logger.error("doc_to_choice must return list")
            else:
                num_choice = len(test_choice)

-            if type(test_text) is int:
+            if isinstance(test_text, int):
                self.multiple_input = num_choice
        else:
            test_choice = None

-        if type(test_target) is list:
+        if isinstance(test_target, list):
            self.multiple_target = len(test_target)
        else:
-            if (type(test_target) is int) and (test_choice is not None):
+            if (isinstance(test_target, int)) and (test_choice is not None):
                test_target = test_choice[test_target]
            else:
                test_target = str(test_target)
@@ -719,11 +702,11 @@ class ConfigurableTask(Task):
                )

                if delimiter_has_whitespace and choice_has_whitespace:
-                    eval_logger.warning(
-                        f'Both target_delimiter and target choice: "{choice}" have whitespace'
+                    eval_logger.debug(
+                        f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" have whitespace'
                    )
                elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
-                    eval_logger.warning(
+                    eval_logger.debug(
                        f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" do not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
                    )

@@ -776,6 +759,8 @@ class ConfigurableTask(Task):

    def fewshot_docs(self):
        if self.config.fewshot_split is not None:
+            if self.config.process_docs is not None:
+                return self.config.process_docs(self.dataset[self.config.fewshot_split])
            return self.dataset[self.config.fewshot_split]
        else:
            if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0):
@@ -808,16 +793,19 @@ class ConfigurableTask(Task):
            )

        example = self.doc_to_text(doc)
-        if type(example) == str:
-            return labeled_examples + example
-        elif type(example) == list:
-            return [labeled_examples + ex for ex in example]
-        elif type(example) == int:
-            if self.config.doc_to_choice is not None:
-                choices = self.doc_to_choice(doc)
-                return labeled_examples + choices[example]
-            else:
-                return labeled_examples + str(example)
+        if self.multiple_input:
+            return labeled_examples
+        else:
+            if isinstance(example, str):
+                return labeled_examples + example
+            elif isinstance(example, list):
+                return [labeled_examples + ex for ex in example]
+            elif isinstance(example, int):
+                if self.config.doc_to_choice is not None:
+                    choices = self.doc_to_choice(doc)
+                    return labeled_examples + choices[example]
+                else:
+                    return labeled_examples + str(example)

    def apply_filters(self):
        if hasattr(self, "_filters"):
@@ -864,9 +852,9 @@ class ConfigurableTask(Task):
        else:
            doc_to_text = self.config.doc_to_text

-        if type(doc_to_text) == int:
+        if isinstance(doc_to_text, int):
            return doc_to_text
-        elif type(doc_to_text) == str:
+        elif isinstance(doc_to_text, str):
            if doc_to_text in self.features:
                # if self.config.doc_to_choice is not None:
                #     return self.doc_to_choice(doc)[doc[doc_to_text]]
@@ -898,9 +886,9 @@ class ConfigurableTask(Task):
        else:
            doc_to_target = self.config.doc_to_target

-        if type(doc_to_target) == int:
+        if isinstance(doc_to_target, int):
            return doc_to_target
-        elif type(doc_to_target) == str:
+        elif isinstance(doc_to_target, str):
            if doc_to_target in self.features:
                # if self.config.doc_to_choice is not None:
                #     return self.doc_to_choice(doc)[doc[doc_to_target]]
@@ -921,7 +909,7 @@ class ConfigurableTask(Task):
                        return target_string
                else:
                    return target_string
-        elif type(doc_to_target) == list:
+        elif isinstance(doc_to_target, list):
            return doc_to_target
        elif callable(doc_to_target):
            return doc_to_target(doc)
@@ -944,14 +932,14 @@ class ConfigurableTask(Task):
        else:
            doc_to_choice = self.config.doc_to_choice

-        if type(doc_to_choice) == str:
+        if isinstance(doc_to_choice, str):
            if doc_to_choice in self.features:
                return doc[doc_to_choice]
            else:
                return ast.literal_eval(utils.apply_template(doc_to_choice, doc))
-        elif type(doc_to_choice) == list:
+        elif isinstance(doc_to_choice, list):
            return doc_to_choice
-        elif type(doc_to_choice) == dict:
+        elif isinstance(doc_to_choice, dict):
            return list(doc_to_choice.values())
        elif callable(doc_to_choice):
            return doc_to_choice(doc)
@@ -973,7 +961,9 @@ class ConfigurableTask(Task):
            if self.multiple_input:
                # If there are multiple inputs, choices are placed in the ctx
                cont = self.doc_to_target(doc)
-                arguments = [(ctx, f"{target_delimiter}{cont}") for ctx in choices]
+                arguments = [
+                    (ctx + choice, f"{target_delimiter}{cont}") for choice in choices
+                ]
            else:
                # Otherwise they are placed in the continuation
                arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]
@@ -1085,14 +1075,14 @@ class ConfigurableTask(Task):
                gold = self.doc_to_target(doc)

            gold_index_error = False
-            if type(gold) is list:
+            if isinstance(gold, list):
                gold = [i if i < len(choices) else -100 for i in gold]
                if -100 in gold:
                    gold_index_error = True
            else:
-                if type(gold) is int:
+                if isinstance(gold, int):
                    gold = gold if gold < len(choices) else -100
-                elif type(gold) is str:
+                elif isinstance(gold, str):
                    gold = choices.index(gold) if gold in choices else -100

                if gold == -100:
@@ -1164,27 +1154,36 @@ class ConfigurableTask(Task):
                        # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
                        # print(gold)
                        gold = [gold]
-                    for gold_option in gold:
-                        try:
-                            result_score = self._metric_fn_list[metric](
-                                references=[gold_option],
-                                predictions=[result],
-                                **self._metric_fn_kwargs[metric],
-                            )
-                        except (
-                            TypeError
-                        ):  # TODO: this is hacky and I don't want to do it
-                            result_score = self._metric_fn_list[metric](
-                                [gold_option, result]
-                            )
-                        if isinstance(result_score, dict):
-                            # TODO: this handles the case where HF evaluate returns a dict.
-                            result_score = result_score[metric]
-                        scores.append(result_score)
-                    if any(scores):
-                        result_score = 1.0
+                    if metric == "exact_match":
+                        result = [result for _ in range(len(gold))]
+                        scores = self._metric_fn_list[metric](
+                            references=gold,
+                            predictions=result,
+                            **self._metric_fn_kwargs[metric],
+                        )[metric]
+                        result_score = 1.0 if scores > 0.0 else 0.0
                    else:
-                        result_score = 0.0
+                        for gold_option in gold:
+                            try:
+                                result_score = self._metric_fn_list[metric](
+                                    references=[gold_option],
+                                    predictions=[result],
+                                    **self._metric_fn_kwargs[metric],
+                                )
+                            except (
+                                TypeError
+                            ):  # TODO: this is hacky and I don't want to do it
+                                result_score = self._metric_fn_list[metric](
+                                    [gold_option, result]
+                                )
+                            if isinstance(result_score, dict):
+                                # TODO: this handles the case where HF evaluate returns a dict.
+                                result_score = result_score[metric]
+                            scores.append(result_score)
+                        if any(scores):
+                            result_score = 1.0
+                        else:
+                            result_score = 0.0
                else:
                    try:
                        result_score = self._metric_fn_list[metric](
@@ -1192,9 +1191,7 @@ class ConfigurableTask(Task):
                            predictions=[result],
                            **self._metric_fn_kwargs[metric],
                        )
-                    except (
-                        TypeError
-                    ):  # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
+                    except TypeError:  # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
                        result_score = self._metric_fn_list[metric]([gold, result])
                    if isinstance(result_score, dict):
                        # TODO: this handles the case where HF evaluate returns a dict.

--- a/lm_eval/decontamination/archiver.py
+++ b/lm_eval/decontamination/archiver.py
+import datetime
+import io
+import json
+import mmap
 import os
+from pathlib import Path
 from typing import Any
-import zstandard
-import json
+
 import jsonlines
-import io
-import datetime
-import mmap
 import tqdm
-from pathlib import Path
+import zstandard


 def json_serial(obj: Any) -> str:

--- a/lm_eval/decontamination/decontaminate.py
+++ b/lm_eval/decontamination/decontaminate.py
-import time
-import random
-import pickle
-import json
+import collections
 import glob
+import json
 import os
-import collections
+import pickle
+import random
+import time

-from .janitor import Janitor, word_ngrams
 from .archiver import ZStdTextReader
+from .janitor import Janitor, word_ngrams


 # Was used for testing the evaluator decoupled from the full logic below
@@ -109,7 +109,7 @@ def get_train_overlap(docs_by_task_set: dict, ngrams_path: str, limit: int) -> d
        print(f"Merging lookups took {elapsed:0.5f} seconds.")

        print(f"{ngrams_n_size} grams files found in {ngrams_path}:")
-        files = glob.glob(os.path.join(ngrams_path, f"*.sorted.zst"))
+        files = glob.glob(os.path.join(ngrams_path, "*.sorted.zst"))
        print(files)

        for file in files:
@@ -135,11 +135,7 @@ def get_train_overlap(docs_by_task_set: dict, ngrams_path: str, limit: int) -> d
                        matching_unique += 1
                        for task_name, task_set, doc_ids in merged_lookup[ngram]:
                            task_doc_set = duplicates[(task_name, task_set)]
-                            for (
-                                doc_id
-                            ) in (
-                                doc_ids
-                            ):  # Record contamination across all relevant task/set combos
+                            for doc_id in doc_ids:  # Record contamination across all relevant task/set combos
                                task_doc_set.add(doc_id)
                        del merged_lookup[ngram]  # No point matching again
                    else:

--- a/lm_eval/decontamination/janitor.py
+++ b/lm_eval/decontamination/janitor.py
+import pickle
 import re
 import string
-import pickle
 import traceback
-from pprint import pprint
-from typing import Iterator, Sequence, TypeVar, List, Tuple
+from typing import Iterator, List, Sequence, Tuple, TypeVar
+

 # This is a cpp module. Compile janitor_util.cpp with:
 # c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup