Commit 173b2bc3 authored by Baber

Merge branch 'main' into humaneval

# Conflicts:
#	lm_eval/api/task.py
parents 74344829 bb098f13
......@@ -16,7 +16,7 @@ jobs:
name: Scan for changed tasks
steps:
- name: checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
fetch-depth: 2 # OR "2" -> To retrieve the preceding commit.
......@@ -47,7 +47,7 @@ jobs:
- name: Set up Python 3.9
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: 3.9
cache: 'pip'
......@@ -56,7 +56,7 @@ jobs:
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
run: |
python -m pip install --upgrade pip
pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu
pip install -e '.[dev,ifeval]' --extra-index-url https://download.pytorch.org/whl/cpu
# Install optional git dependencies
# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
......
......@@ -13,7 +13,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.x"
......@@ -26,7 +26,7 @@ jobs:
- name: Build a binary wheel and a source tarball
run: python3 -m build
- name: Store the distribution packages
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: python-package-distributions
path: dist/
......@@ -46,7 +46,7 @@ jobs:
steps:
- name: Download all the dists
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: python-package-distributions
path: dist/
......@@ -68,7 +68,7 @@ jobs:
steps:
- name: Download all the dists
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: python-package-distributions
path: dist/
......
......@@ -22,10 +22,10 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Set up Python 3.8
- name: Set up Python 3.9
uses: actions/setup-python@v5
with:
python-version: 3.8
python-version: 3.9
cache: pip
cache-dependency-path: pyproject.toml
- name: Pre-Commit
......@@ -42,7 +42,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
python-version: ["3.9", "3.10", "3.11", "3.12" ]
timeout-minutes: 30
steps:
- name: Checkout Code
......@@ -56,15 +56,35 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -e '.[dev,anthropic,sentencepiece,optimum,deepsparse,sparseml]' --extra-index-url https://download.pytorch.org/whl/cpu
pip install -e '.[dev,sentencepiece,api]' --extra-index-url https://download.pytorch.org/whl/cpu
# Install optional git dependencies
# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Test with pytest
run: python -m pytest --showlocals -s -vv -n=auto
run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py
- name: Archive artifacts
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: output_results
name: output_testcpu${{ matrix.python-version }}
path: |
test_logs/*
testmodels:
name: External LM Tests
runs-on: ubuntu-latest
timeout-minutes: 30
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Set up Python 3.9
uses: actions/setup-python@v5
with:
python-version: 3.9
cache: pip
cache-dependency-path: pyproject.toml
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu
pip install -U transformers peft
- name: Test with pytest
run: python -m pytest tests/models --showlocals -s -vv
......@@ -8,11 +8,13 @@ build
dist
*.egg-info
venv
.venv/
.vscode/
temp
__pycache__
.ipynb_checkpoints
temp
test_logs/
# IPython
profile_default/
ipython_config.py
......
......@@ -2,7 +2,7 @@
exclude: ^tests/testdata/
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
rev: v5.0.0
hooks:
- id: check-added-large-files
- id: check-ast
......@@ -29,7 +29,7 @@ repos:
- id: mixed-line-ending
args: [--fix=lf]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.4.8
rev: v0.7.4
hooks:
# Run the linter.
- id: ruff
......
* @haileyschoelkopf @lintangsutawika
* @baberabb @lintangsutawika
......@@ -2,11 +2,22 @@
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10256836.svg)](https://doi.org/10.5281/zenodo.10256836)
---
*Latest News 📣*
- [2024/09] We are prototyping support for creating and evaluating tasks with text+image multimodal inputs and text outputs in LM Evaluation Harness, and have just added the `hf-multimodal` and `vllm-vlm` model types and the `mmmu` task as a prototype feature. We welcome users to try out this in-progress feature and stress-test it for themselves, and suggest they check out [`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval), a wonderful project originally forked from lm-evaluation-harness, for a broader range of multimodal tasks, models, and features.
- [2024/07] [API model](docs/API_guide.md) support has been updated and refactored, introducing support for batched and async requests, and making it significantly easier to customize and use for your own purposes. **To run Llama 405B, we recommend using VLLM's OpenAI-compliant API to host the model, and use the `local-completions` model type to evaluate the model.**
- [2024/07] New Open LLM Leaderboard tasks have been added! You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group.
---
## Announcement
**A new v0.4.0 release of lm-evaluation-harness is available!**
New updates and features include:
- **New Open LLM Leaderboard tasks have been added! You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group.**
- Internal refactoring
- Config-based task creation and configuration
- Easier import and sharing of externally-defined task config YAMLs
......@@ -20,13 +31,15 @@ Please see our updated documentation pages in `docs/` for more details.
Development will be continuing on the `main` branch, and we encourage you to give us feedback on what features are desired and how to improve the library further, or ask questions, either in issues or PRs on GitHub, or in the [EleutherAI discord](https://discord.gg/eleutherai)!
---
## Overview
This project provides a unified framework to test generative language models on a large number of different evaluation tasks.
**Features:**
- Over 60 standard academic benchmarks for LLMs, with hundreds of subtasks and variants implemented.
- Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
- Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
- Support for fast and memory-efficient inference with [vLLM](https://github.com/vllm-project/vllm).
- Support for commercial APIs including [OpenAI](https://openai.com), and [TextSynth](https://textsynth.com/).
- Support for evaluation on adapters (e.g. LoRA) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
......@@ -41,7 +54,7 @@ The Language Model Evaluation Harness is the backend for 🤗 Hugging Face's pop
To install the `lm-eval` package from the github repository, run:
```bash
git clone https://github.com/EleutherAI/lm-evaluation-harness
git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness
cd lm-evaluation-harness
pip install -e .
```
......@@ -94,7 +107,7 @@ lm_eval --model hf \
#### Multi-GPU Evaluation with Hugging Face `accelerate`
We support two main ways of using Hugging Face's [accelerate 🚀](https://github.com/huggingface/accelerate) library for multi-GPU evaluation.
We support three main ways of using Hugging Face's [accelerate 🚀](https://github.com/huggingface/accelerate) library for multi-GPU evaluation.
To perform *data-parallel evaluation* (where each GPU loads a **separate full copy** of the model), we leverage the `accelerate` launcher as follows:
......@@ -111,7 +124,7 @@ For cases where your model can fit on a single GPU, this allows you to evaluate
The second way of using `accelerate` for multi-GPU evaluation is when your model is *too large to fit on a single GPU.*
In this setting, run the library *outside of the `accelerate` launcher*, but passing `parallelize=True` to `--model_args` as follows:
In this setting, run the library *outside the `accelerate` launcher*, but passing `parallelize=True` to `--model_args` as follows:
```
lm_eval --model hf \
......@@ -128,7 +141,19 @@ For more advanced users or even larger models, we allow for the following argume
- `max_cpu_memory`: the max amount of CPU memory to use when offloading the model weights to RAM.
- `offload_folder`: a folder where model weights will be offloaded to disk if needed.
These two options (`accelerate launch` and `parallelize=True`) are mutually exclusive.
The third option is to use both at the same time. This will allow you to take advantage of both data parallelism and model sharding, and is especially useful for models that are too large to fit on a single GPU.
```
accelerate launch --multi_gpu --num_processes {nb_of_copies_of_your_model} \
-m lm_eval --model hf \
--tasks lambada_openai,arc_easy \
--model_args parallelize=True \
--batch_size 16
```
To learn more about model parallelism and how to use it with the `accelerate` library, see the [accelerate documentation](https://huggingface.co/docs/transformers/v4.15.0/en/parallelism)
**Warning: We do not natively support multi-node evaluation using the `hf` model type! Please reference [our GPT-NeoX library integration](https://github.com/EleutherAI/gpt-neox/blob/main/eval.py) for an example of code in which a custom multi-machine evaluation script is written.**
**Note: we do not currently support multi-node evaluations natively, and advise using either an externally hosted server to run inference requests against, or creating a custom integration with your distributed framework [as is done for the GPT-NeoX library](https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py).**
......@@ -180,6 +205,19 @@ Note that it is recommended to substitute the `python` command by `torchrun --np
Not supported yet: multi-node evaluation and combinations of data replication with tensor or pipeline parallelism.
#### Multi-GPU evaluation with OpenVINO models
Pipeline parallelism during evaluation is supported for OpenVINO models.
To enable it, add `pipeline_parallel=True` to `--model_args` and set `--device` to `HETERO:<GPU index1>,<GPU index2>`, e.g. `HETERO:GPU.1,GPU.0`. For example, the command to use a pipeline parallelism of 2 is:
```
lm_eval --model openvino \
--tasks wikitext \
--model_args pretrained=<path_to_ov_model>,pipeline_parallel=True \
--device HETERO:GPU.1,GPU.0
```
### Tensor + Data Parallel and Optimized Inference with `vLLM`
We also support vLLM for faster inference on [supported model types](https://docs.vllm.ai/en/latest/models/supported_models.html), which is especially fast when splitting a model across multiple GPUs. For single-GPU or multi-GPU inference (tensor parallel, data parallel, or a combination of both), for example:
......@@ -216,26 +254,28 @@ lm_eval --model openai-completions \
We also support using your own local inference server with servers that mirror the OpenAI Completions and ChatCompletions APIs.
```bash
lm_eval --model local-chat-completions --tasks gsm8k --model_args model=facebook/opt-125m,base_url=http://{yourip}:8000/v1
```
Note that for externally hosted models, configs such as `--device` and `--batch_size` should not be used and do not function. Just like you can use `--model_args` to pass arbitrary arguments to the model constructor for local models, you can use it to pass arbitrary arguments to the model API for hosted models. See the documentation of the hosting service for information on what arguments they support.
| API or Inference Server | Implemented? | `--model <xxx>` name | Models supported: | Request Types: |
|---------------------------------------------------------------------------------------------------------------------------|---------------------------------|---------------------------------------------------------------------|-----------------------------------------------------------------------------------------------|------------------------------------------------------------|
| OpenAI Completions | :heavy_check_mark: | `openai-completions`, `local-completions` | All OpenAI Completions API models | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| OpenAI ChatCompletions | :heavy_check_mark: | `openai-chat-completions`, `local-chat-completions` | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt) | `generate_until` (no logprobs) |
| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) |
| Anthropic Chat | :heavy_check_mark: | `anthropic-chat`, `anthropic-chat-completions` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/docs/models-overview) | `generate_until` (no logprobs) |
| Textsynth | :heavy_check_mark: | `textsynth` | [All supported engines](https://textsynth.com/documentation.html#engines) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Cohere | [:hourglass: - blocked on Cohere API bug](https://github.com/EleutherAI/lm-evaluation-harness/pull/395) | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark: | `gguf`, `ggml` | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp) | `generate_until`, `loglikelihood`, (perplexity evaluation not yet implemented) |
| vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Huggingface Optimum (Causal LMs) | ✔️ | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
| Neuron via AWS Inf2 (Causal LMs) | ✔️ | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
| [Neural Magic DeepSparse](https://github.com/neuralmagic/deepsparse) | ✔️ | `deepsparse` | Any LM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub with the "deepsparse" tag](https://huggingface.co/models?other=deepsparse) | `generate_until`, `loglikelihood` | ... |
| [Neural Magic SparseML](https://github.com/neuralmagic/sparseml) | ✔️ | `sparseml` | Any decoder-only AutoModelForCausalLM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub](https://huggingface.co/neuralmagic). Especially useful for models with quantization like [`zoo:llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized`](https://sparsezoo.neuralmagic.com/models/llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
| Your local inference server! | :heavy_check_mark: | `local-completions` or `local-chat-completions` (using `openai-chat-completions` model type) | Any server address that accepts GET requests using HF models and mirror's OpenAI's Completions or ChatCompletions interface | `generate_until` | | ... |
lm_eval --model local-completions --tasks gsm8k --model_args model=facebook/opt-125m,base_url=http://{yourip}:8000/v1/completions,num_concurrent=1,max_retries=3,tokenized_requests=False,batch_size=16
```
Note that for externally hosted models, configs such as `--device`, which relate to where to place a local model, should not be used and do not function. Just like you can use `--model_args` to pass arbitrary arguments to the model constructor for local models, you can use it to pass arbitrary arguments to the model API for hosted models. See the documentation of the hosting service for information on what arguments they support.
| API or Inference Server | Implemented? | `--model <xxx>` name | Models supported: | Request Types: |
|---------------------------------------------------------------------------------------------------------------------------|---------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------|
| OpenAI Completions | :heavy_check_mark: | `openai-completions`, `local-completions` | All OpenAI Completions API models | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| OpenAI ChatCompletions | :heavy_check_mark: | `openai-chat-completions`, `local-chat-completions` | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt) | `generate_until` (no logprobs) |
| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) |
| Anthropic Chat | :heavy_check_mark: | `anthropic-chat`, `anthropic-chat-completions` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/docs/models-overview) | `generate_until` (no logprobs) |
| Textsynth | :heavy_check_mark: | `textsynth` | [All supported engines](https://textsynth.com/documentation.html#engines) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Cohere | [:hourglass: - blocked on Cohere API bug](https://github.com/EleutherAI/lm-evaluation-harness/pull/395) | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark: | `gguf`, `ggml` | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp) | `generate_until`, `loglikelihood`, (perplexity evaluation not yet implemented) |
| vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Huggingface Optimum (Causal LMs) | ✔️ | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
| Huggingface Optimum-intel IPEX (Causal LMs) | ✔️ | `ipex` | Any decoder-only AutoModelForCausalLM | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
| Neuron via AWS Inf2 (Causal LMs) | ✔️ | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
| [Neural Magic DeepSparse](https://github.com/neuralmagic/deepsparse) | ✔️ | `deepsparse` | Any LM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub with the "deepsparse" tag](https://huggingface.co/models?other=deepsparse) | `generate_until`, `loglikelihood` | ... |
| [Neural Magic SparseML](https://github.com/neuralmagic/sparseml) | ✔️ | `sparseml` | Any decoder-only AutoModelForCausalLM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub](https://huggingface.co/neuralmagic). Especially useful for models with quantization like [`zoo:llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized`](https://sparsezoo.neuralmagic.com/models/llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
| Watsonx.ai | :heavy_check_mark: | `watsonx_llm` | [Supported Watsonx.ai Engines](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx) | `generate_until` `loglikelihood` |
| [Your local inference server!](docs/API_guide.md) | :heavy_check_mark: | `local-completions` or `local-chat-completions` | Support for OpenAI API-compatible servers, with easy customization for other APIs. | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | ... |
Models which do not supply logits or logprobs can be used with tasks of type `generate_until` only, while local models, or APIs that supply logprobs/logits of their prompts, can be run on all task types: `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
......@@ -294,8 +334,16 @@ lm_eval --model hf \
--tasks hellaswag
```
[GPTQ](https://github.com/PanQiWei/AutoGPTQ) quantized models can be loaded by specifying their file names in `,autogptq=NAME` (or `,autogptq=True` for default names) in the `model_args` argument:
GPTQ quantized models can be loaded using either [GPTQModel](https://github.com/ModelCloud/GPTQModel) (faster) or [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ).

GPTQModel: add `,gptqmodel=True` to `model_args`:
```bash
lm_eval --model hf \
--model_args pretrained=model-name-or-path,gptqmodel=True \
--tasks hellaswag
```
AutoGPTQ: add `,autogptq=True` to `model_args`:
```bash
lm_eval --model hf \
--model_args pretrained=model-name-or-path,autogptq=model.safetensors,gptq_use_triton=True \
......@@ -304,11 +352,12 @@ lm_eval --model hf \
We support wildcards in task names, for example you can run all of the machine-translated lambada tasks via `--task lambada_openai_mt_*`.
## Saving Results
## Saving & Caching Results
To save evaluation results provide an `--output_path`. We also support logging model responses with the `--log_samples` flag for post-hoc analysis.
Additionally, one can provide a directory with `--use_cache` to cache the results of prior runs. This allows you to avoid repeated execution of the same (model, task) pairs for re-scoring.
> [!TIP]
> Use `--use_cache <DIR>` to cache evaluation results and skip previously evaluated samples when resuming runs of the same (model, task) pairs. Note that caching is rank-dependent, so restart with the same GPU count if interrupted. You can also use `--cache_requests` to save dataset preprocessing steps for faster evaluation resumption.
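The same caching is available when driving the harness from Python (see the interface docs). A minimal sketch, assuming the `use_cache` and `cache_requests` keyword arguments of `lm_eval.simple_evaluate` mirror the CLI flags above:

```python
import lm_eval

# Re-running this call with the same cache path skips already-evaluated requests.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",  # example model; swap in your own
    tasks=["hellaswag"],
    use_cache="lm_cache/pythia-160m",  # path prefix for the response cache
    cache_requests=True,               # also cache dataset preprocessing
)
```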
To push results and samples to the Hugging Face Hub, first ensure an access token with write access is set in the `HF_TOKEN` environment variable. Then, use the `--hf_hub_log_args` flag to specify the organization, repository name, repository visibility, and whether to push results and samples to the Hub - [example dataset on the HF Hub](https://huggingface.co/datasets/KonradSzafer/lm-eval-results-demo). For instance:
......@@ -435,29 +484,29 @@ The best way to get support is to open an issue on this repo or join the [Eleuth
## Optional Extras
Extras dependencies can be installed via `pip install -e ".[NAME]"`
| Name | Use |
|---------------|---------------------------------------|
| anthropic | For using Anthropic's models |
| deepsparse | For running NM's DeepSparse models |
| dev | For linting PRs and contributions |
| gptq | For loading models with GPTQ |
| hf_transfer | For speeding up HF Hub file downloads |
| ifeval | For running the IFEval task |
| neuronx | For running on AWS inf2 instances |
| mamba | For loading Mamba SSM models |
| math | For running math task answer checking |
| multilingual | For multilingual tokenizers |
| openai | For using OpenAI's models |
| optimum | For running Intel OpenVINO models |
| promptsource | For using PromptSource prompts |
| sentencepiece | For using the sentencepiece tokenizer |
| sparseml | For using NM's SparseML models |
| testing | For running library test suite |
| unitxt | For IBM's unitxt dataset tasks |
| vllm | For loading models with vLLM |
| zeno | For visualizing results with Zeno |
|---------------|---------------------------------------|
| all | Loads all extras (not recommended) |
| Name | Use |
|-----------------|----------------------------------------------|
| api | For using api models (Anthropic, OpenAI API) |
| deepsparse | For running NM's DeepSparse models |
| dev | For linting PRs and contributions |
| gptq | For loading models with GPTQ |
| hf_transfer | For speeding up HF Hub file downloads |
| ifeval | For running the IFEval task |
| ibm_watsonx_ai | For using IBM watsonx.ai model apis |
| ipex | For running on optimum-intel ipex backend |
| neuronx | For running on AWS inf2 instances |
| mamba | For loading Mamba SSM models |
| math | For running math task answer checking |
| multilingual | For multilingual tokenizers |
| optimum | For running Intel OpenVINO models |
| promptsource | For using PromptSource prompts |
| sentencepiece | For using the sentencepiece tokenizer |
| sparseml | For using NM's SparseML models |
| testing | For running library test suite |
| vllm | For loading models with vLLM |
| zeno | For visualizing results with Zeno |
| --------------- | --------------------------------------- |
| all | Loads all extras (not recommended) |
## Cite as
......@@ -465,11 +514,11 @@ Extras dependencies can be installed via `pip install -e ".[NAME]"`
@misc{eval-harness,
author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
title = {A framework for few-shot language model evaluation},
month = 12,
year = 2023,
month = 07,
year = 2024,
publisher = {Zenodo},
version = {v0.4.0},
doi = {10.5281/zenodo.10256836},
url = {https://zenodo.org/records/10256836}
version = {v0.4.3},
doi = {10.5281/zenodo.12608602},
url = {https://zenodo.org/records/12608602}
}
```
# TemplateAPI Usage Guide
The `TemplateAPI` class is a versatile superclass designed to facilitate the integration of various API-based language models into the lm-evaluation-harness framework. This guide will explain how to use and extend the `TemplateAPI` class to implement your own API models. If your API implements the OpenAI API you can use the `local-completions` or the `local-chat-completions` (defined [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py)) model types, which can also serve as examples of how to effectively subclass this template.
## Overview
The `TemplateAPI` class provides a template for creating API-based model implementations. It handles common functionalities such as:
- Tokenization (optional)
- Batch processing
- Caching
- Retrying failed requests
- Parsing API responses
To use this class, you typically need to subclass it and implement specific methods for your API.
## Key Methods to Implement
When subclassing `TemplateAPI`, you need to implement the following methods:
1. `_create_payload`: Creates the JSON payload for API requests.
2. `parse_logprobs`: Parses log probabilities from API responses.
3. `parse_generations`: Parses generated text from API responses.
4. `headers`: Returns the headers for the API request.
You may also need to override other methods or properties depending on your API's specific requirements.
> [!NOTE]
> Currently, loglikelihood and MCQ-based tasks (such as MMLU) are only supported for completion endpoints, not for chat-completion endpoints (those that expect a list of dicts). Completion APIs which support instruct-tuned models can be evaluated with the `--apply_chat_template` option in order to simultaneously evaluate models using a chat template format while still being able to access the model logits needed for loglikelihood-based tasks.
## TemplateAPI Arguments
When initializing a `TemplateAPI` instance or a subclass, you can provide several arguments to customize its behavior. Here's a detailed explanation of some important arguments:
- `model` or `pretrained` (str):
- The name or identifier of the model to use.
- `model` takes precedence over `pretrained` when both are provided.
- `base_url` (str):
- The base URL for the API endpoint.
- `tokenizer` (str, optional):
- The name or path of the tokenizer to use.
- If not provided, it defaults to using the same tokenizer name as the model.
- `num_concurrent` (int):
- Number of concurrent requests to make to the API.
- Useful for APIs that support parallel processing.
- Default is 1 (sequential processing).
- `timeout` (int, optional):
- Timeout for API requests in seconds.
- Default is 30.
- `tokenized_requests` (bool):
- Determines whether the input is pre-tokenized. Defaults to `True`.
- Requests can be sent in either tokenized form (`list[list[int]]`) or as text (`list[str]`, or `str` for batch_size=1).
- For loglikelihood-based tasks, prompts require tokenization to calculate the context length. If `False` prompts are decoded back to text before being sent to the API.
- Not as important for `generate_until` tasks.
- Ignored for chat-formatted inputs (`list[dict, ...]`) or if `tokenizer_backend` is None.
- `tokenizer_backend` (str, optional):
- Required for loglikelihood-based or MCQ tasks.
- Specifies the tokenizer library to use. Options are "tiktoken", "huggingface", or None.
- Default is "huggingface".
- `max_length` (int, optional):
- Maximum length of input + output.
- Default is 2048.
- `max_retries` (int, optional):
- Maximum number of retries for failed API requests.
- Default is 3.
- `max_gen_toks` (int, optional):
- Maximum number of tokens to generate in completion tasks.
- Default is 256, or the value set in the task YAML.
- `batch_size` (int or str, optional):
- Number of requests to batch together (if the API supports batching).
- Can be an integer or "auto" (which defaults to 1 for API models).
- Default is 1.
- `seed` (int, optional):
- Random seed for reproducibility.
- Default is 1234.
- `add_bos_token` (bool, optional):
- Whether to add the beginning-of-sequence token to inputs (when tokenizing).
- Default is False.
- `custom_prefix_token_id` (int, optional):
- Custom token ID to use as a prefix for inputs.
- If not provided, uses the model's default BOS or EOS token (if `add_bos_token` is True).
- `verify_certificate` (bool, optional):
- Whether to validate the certificate of the API endpoint (if HTTPS).
- Default is True.
Example usage:
```python
class MyAPIModel(TemplateAPI):
    def __init__(self, **kwargs):
        super().__init__(
            model="my-model",
            base_url="https://api.mymodel.com/v1/completions",
            tokenizer_backend="huggingface",
            num_concurrent=5,
            max_retries=5,
            batch_size=10,
            **kwargs
        )

    # Implement other required methods...
```
When subclassing `TemplateAPI`, you can override these arguments in your `__init__` method to set default values specific to your API. You can also add additional (potentially user-specified) arguments as needed for your specific implementation.
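As a small illustration of adding an extra, user-settable argument on top of these defaults (`api_version` here is a made-up name, not a real `TemplateAPI` argument):

```python
class MyVersionedAPI(TemplateAPI):
    def __init__(self, api_version: str = "2024-01-01", **kwargs):
        # `api_version` is a hypothetical extra argument; everything else is
        # passed through to TemplateAPI unchanged.
        super().__init__(**kwargs)
        self.api_version = api_version  # e.g. used later when building payloads
```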
## Example Implementation: OpenAI API
The `OpenAICompletionsAPI` and `OpenAIChatCompletion` classes (defined [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py)) demonstrate how to implement API models using the `TemplateAPI` class. Here's a breakdown of the key components:
### 1. Subclassing and Initialization
```python
@register_model("openai-completions")
class OpenAICompletionsAPI(LocalCompletionsAPI):
def __init__(
self,
base_url="https://api.openai.com/v1/completions",
tokenizer_backend="tiktoken",
**kwargs,
):
super().__init__(
base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
)
```
### 2. Implementing API Key Retrieval
```python
@cached_property
def api_key(self):
    key = os.environ.get("OPENAI_API_KEY", None)
    if key is None:
        raise ValueError(
            "API key not found. Please set the OPENAI_API_KEY environment variable."
        )
    return key
```
### 3. Creating the Payload
```python
def _create_payload(
    self,
    messages: Union[List[List[int]], List[dict], List[str], str],
    generate=False,
    gen_kwargs: Optional[dict] = None,
    **kwargs,
) -> dict:
    if generate:
        # ... (implementation for generation)
    else:
        # ... (implementation for log likelihood)
```
### 4. Parsing API Responses
```python
@staticmethod
def parse_logprobs(
    outputs: Union[Dict, List[Dict]],
    tokens: List[List[int]] = None,
    ctxlens: List[int] = None,
    **kwargs,
) -> List[Tuple[float, bool]]:
    # ... (implementation)

@staticmethod
def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
    # ... (implementation)
```
The requests are initiated in the `model_call` or the `amodel_call` methods.
## Implementing Your Own API Model
To implement your own API model:
1. Subclass `TemplateAPI` or one of its subclasses (e.g., `LocalCompletionsAPI`).
2. Override the `__init__` method if you need to set specific parameters.
3. Implement the `_create_payload` and `header` methods to create the appropriate payload for your API.
4. Implement the `parse_logprobs` and `parse_generations` methods to parse your API's responses.
5. Override the `api_key` property if your API requires authentication.
6. Override any other methods as necessary to match your API's behavior.
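Putting these steps together, a bare-bones skeleton might look like the following. This is only a sketch: the method signatures are copied from the OpenAI example above, and the import path assumes `TemplateAPI` lives in `lm_eval.models.api_models`.

```python
from typing import Dict, List, Optional, Tuple, Union

from lm_eval.api.registry import register_model
from lm_eval.models.api_models import TemplateAPI


@register_model("my-api")
class MyAPIModel(TemplateAPI):
    def _create_payload(
        self,
        messages: Union[List[List[int]], List[dict], List[str], str],
        generate: bool = False,
        gen_kwargs: Optional[dict] = None,
        **kwargs,
    ) -> dict:
        # Build the JSON body your endpoint expects (model name, prompt, sampling args, ...).
        return {"model": "my-model", "prompt": messages, **(gen_kwargs or {})}

    @staticmethod
    def parse_logprobs(
        outputs: Union[Dict, List[Dict]],
        tokens: List[List[int]] = None,
        ctxlens: List[int] = None,
        **kwargs,
    ) -> List[Tuple[float, bool]]:
        # Extract (continuation_logprob, is_greedy) pairs from each response.
        raise NotImplementedError("parse your API's logprob format here")

    @staticmethod
    def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
        # Extract the generated text from each response.
        raise NotImplementedError("parse your API's generation format here")
```

Authentication (the API key and request headers) can then be overridden as in the OpenAI example above.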
## Best Practices
1. Use the `@register_model` decorator to register your model with the framework (and import it in `lm_eval/models/__init__.py`!).
2. Use environment variables for sensitive information like API keys.
3. Properly handle batching and concurrent requests if supported by your API.
......@@ -2,8 +2,6 @@
Welcome and thank you for your interest in the LM Evaluation Harness! We welcome contributions and feedback and appreciate your time spent with our library, and hope you find it useful!
We intend LM Evaluation Harness to be a broadly useful and
## Important Resources
There are several places information about LM Evaluation Harness is located:
......@@ -11,7 +9,7 @@ There are several places information about LM Evaluation Harness is located:
- Our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs)
- We occasionally use [GitHub Milestones](https://github.com/EleutherAI/lm-evaluation-harness/milestones) to track progress toward specific near-term version releases.
- We maintain a [Project Board](https://github.com/orgs/EleutherAI/projects/25) for tracking current work items and PRs, and for future roadmap items or feature requests.
- Further discussion and support conversations are located in the #lm-thunderdome channel of the [EleutherAI discord](discord.gg/eleutherai).
- Further discussion and support conversations are located in the #lm-thunderdome channel of the [EleutherAI discord](https://discord.gg/eleutherai).
## Code Style
......@@ -32,7 +30,7 @@ in order to ensure linters and other checks will be run upon committing.
We use [pytest](https://docs.pytest.org/en/latest/) for running unit tests. All library unit tests can be run via:
```
python -m pytest --ignore=tests/tests_master --ignore=tests/extra
python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py
```
## Contributor License Agreement
......
......@@ -4,7 +4,8 @@ Welcome to the docs for the LM Evaluation Harness!
## Table of Contents
* To learn about the public interface of the library, as well as how to evaluate via the commandline or as integrated into an external library, see the [Interface](./interface.md)
* To learn about the public interface of the library, as well as how to evaluate via the command line or as integrated into an external library, see the [Interface](./interface.md).
* To learn how to add a new library, API, or model type to the library, as well as a quick explainer on the types of ways to evaluate an LM, see the [Model Guide](./model_guide.md).
* For an extended description of how to extend the library to new model classes served over an API, see the [API Guide](./API_guide.md).
* For a crash course on adding new tasks to the library, see our [New Task Guide](./new_task_guide.md).
* To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Task Configuration Guide](./task_guide.md).
# Chat Template Delimiter Handling Update
## Overview
This change modifies how delimiters are handled when applying chat templates in the request construction process for likelihood and multiple-choice based tasks. When `apply_chat_template` is set to `True`, the target delimiter is now set to an empty string instead of using the configured delimiter.
## Background
By default, the system uses a target delimiter (typically a whitespace " ") between the context and target text when constructing prompts. The full string is constructed as:
```
doc_to_text(doc) + target_delimiter + doc_to_target(doc)
```
While this worked well for base models where we wanted the model to predict a single whitespace followed by the answer, chat models have their own formatting conventions that handle spacing differently.
## The Change
- When `apply_chat_template=True`, the target delimiter is now empty ("") instead of the default whitespace
- This prevents interference between chat template formatting and the default delimiter system
- Particularly important for multiple choice tasks where the template itself handles spacing
## Example
```
# Before (with default delimiter " ")
<user>Question: What color is the sky?\nAnswer:<assistant> blue
# After
<user>Question: What color is the sky?\nAnswer:<assistant>blue
```
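In terms of the string construction described above, the change amounts to the following (a toy illustration with made-up strings; the real logic lives in the harness's request-building code):

```python
def fake_chat_template(context: str) -> str:
    # Stand-in for a real chat template (e.g. the tokenizer's), for illustration only.
    return "<user>" + context + "<assistant>"

doc_to_text_out = "Question: What color is the sky?\nAnswer:"
doc_to_target_out = "blue"

# Base models: the default target_delimiter " " sits between context and target.
base_request = doc_to_text_out + " " + doc_to_target_out

# With apply_chat_template=True: the context is rendered through the chat
# template and the target is appended with an empty delimiter.
chat_request = fake_chat_template(doc_to_text_out) + "" + doc_to_target_out

print(base_request)
print(chat_request)
```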
......@@ -46,7 +46,11 @@ This mode supports a number of command-line arguments, the details of which can
- `--system_instruction`: Specifies a system instruction string to prepend to the prompt.
- `--apply_chat_template` : If this flag is on, a chat template will be applied to the prompt. For Hugging Face models, the chat template is taken from the tokenizer, if the tokenizer does not have a chat template, a default one will be applied. For other models, chat templating is not currently implemented.
- `--apply_chat_template` : This flag specifies whether to apply a chat template to the prompt. It can be used in the following ways:
- `--apply_chat_template` : When used without an argument, applies the only available chat template to the prompt. For Hugging Face models, if no dedicated chat template exists, the default chat template will be applied.
- `--apply_chat_template template_name` : If the model has multiple chat templates, apply the specified template to the prompt.
For Hugging Face models, the default chat template can be found in the [`default_chat_template`](https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1912) property of the Transformers Tokenizer.
- `--fewshot_as_multiturn` : If this flag is on, the Fewshot examples are treated as a multi-turn conversation. Questions are provided as user content and answers are provided as assistant responses. Requires `--num_fewshot` to be set to be greater than 0, and `--apply_chat_template` to be on.
......@@ -54,16 +58,19 @@ This mode supports a number of command-line arguments, the details of which can
* `--seed`: Set seed for python's random, numpy and torch. Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, or a single integer to set the same seed for all three. The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234` (for backward compatibility). E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`. Here numpy's seed is not set since the second value is `None`. E.g, `--seed 42` sets all three seeds to 42.
* `--wandb_args`: Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., ```--wandb_args project=test-project,name=test-run```
* `--wandb_args`: Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., ```--wandb_args project=test-project,name=test-run```. Also allows for the passing of the step to log things at (passed to `wandb.run.log`), e.g., `--wandb_args step=123`.
* `--hf_hub_log_args` : Logs evaluation results to Hugging Face Hub. Accepts a string with the arguments separated by commas. Available arguments:
* `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`. If not provided, the results will be pushed to the owner of the Hugging Face token,
* `hub_repo_name` - repository name on Hugging Face Hub, e.g., `lm-eval-results`,
* `hub_repo_name` - repository name on Hugging Face Hub (deprecated, `details_repo_name` and `results_repo_name` should be used instead), e.g., `lm-eval-results`,
* `details_repo_name` - repository name on Hugging Face Hub to store details, e.g., `lm-eval-results`,
* `results_repo_name` - repository name on Hugging Face Hub to store results, e.g., `lm-eval-results`,
* `push_results_to_hub` - whether to push results to Hugging Face Hub, can be `True` or `False`,
* `push_samples_to_hub` - whether to push samples results to Hugging Face Hub, can be `True` or `False`. Requires `--log_samples` to be set,
* `public_repo` - whether the repository is public, can be `True` or `False`,
* `leaderboard_url` - URL to the leaderboard, e.g., `https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard`.
* `point_of_contact` - Point of contact for the results dataset, e.g., `yourname@example.com`.
* `gated` - whether to gate the details dataset, can be `True` or `False`.
## External Library Usage
......@@ -102,12 +109,10 @@ results = lm_eval.simple_evaluate( # call simple_evaluate
)
```
See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L35 for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously.
See the `simple_evaluate()` and `evaluate()` functions in [lm_eval/evaluator.py](../lm_eval/evaluator.py#:~:text=simple_evaluate) for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously.
Additionally, the `evaluate()` function offers the core evaluation functionality provided by the library, but without some of the special handling and simplification + abstraction provided by `simple_evaluate()`.
See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L173 for more details.
As a brief example usage of `evaluate()`:
```python
......
......@@ -118,17 +118,45 @@ class MyCustomLM(LM):
#...
@property
def tokenizer_name(self) -> str:
# should return a string denoting the name of the model's tokenizer and/or the accompanying chat template.
@property
def chat_template(self) -> str:
# should return a chat template formatting string that is used to build prompt from a user/assistant chat history.
# this will be saved in the evaluation results for reproducibility.
"""
Return the name of the model's tokenizer and/or the accompanying chat template.
The returned string is used to cache requests.
Returns:
str: The name of the model's tokenizer and/or chat template.
"""
def chat_template(self, chat_template: Union[bool, str] = False) -> str:
"""
Get the appropriate chat template for the model based on the `chat_template` argument.
This method returns the chat template string to build the prompt from a chat history.
The chat template is saved in the evaluation results for reproducibility.
Boolean arguments should be used with models that have only one chat template,
while string arguments are used with models that have multiple chat templates.
For the reference implementation, see HFLM class in `lm_eval.models.huggingface`.
Args:
chat_template (Union[bool, str]): Specifies whether to apply a chat template:
- If False: Do not apply any chat template.
- If True: Apply the default chat template.
- If str: Apply the specified chat template by name.
Returns:
str: The selected chat template in Jinja format.
"""
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
# responsible for taking as input a chat history that would be fed into the model, and
# rendering it as a string that can be then tokenized and input into the model.
#...
"""
Process a chat history to create a string that can be tokenized and input into the model.
Args:
chat_history (List[Dict[str, str]]): A list of dictionaries representing the chat history,
where each dictionary has "role" and "content" keys.
Returns:
str: A string representing the chat history that can be tokenized and fed into the model.
"""
```
- `apply_chat_template`
......
......@@ -86,20 +86,20 @@ Let's create a python file in the directory where we're writing our YAML file:
```bash
touch lm_eval/tasks/<dataset_name>/utils.py
```
Now, in `utils.py` we'll write a function to process each split of our dataset:
TODO: Change the example to one that's in the tasks/
Now, in `utils.py` we'll write a function to process each split of our dataset (the following example is drawn from [the `hellaswag` task](../lm_eval/tasks/hellaswag/utils.py)):
```python
def process_docs(dataset: datasets.Dataset):
    def _helper(doc):
        # modifies the contents of a single
        # document in our dataset.
        doc["choices"] = [doc["choice1"], doc["choice2"], doc["wrong_answer"]]
        doc["gold"] = doc["label"]
        return doc

    return dataset.map(_helper)  # returns back a datasets.Dataset object

def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _process_doc(doc):
        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
        out_doc = {
            "query": preprocess(doc["activity_label"] + ": " + ctx),
            "choices": [preprocess(ending) for ending in doc["endings"]],
            "gold": int(doc["label"]),
        }
        return out_doc

    return dataset.map(_process_doc)
```
Now, in our YAML config file we'll use the `!function` constructor, and tell the config where our imported Python function will come from. At runtime, before doing anything else we will preprocess our dataset according to this function!
......@@ -190,7 +190,8 @@ doc_to_target: "{{answer}}"
```
**Important**: we now add `target_delimiter` between input and target which defaults to " ", such that the full input-output string is `doc_to_target(doc) + target_delimiter + doc_to_text(doc)`. `doc_to_text` and `doc_to_target` should not contain trailing right or left whitespace, respectively.
> [!WARNING]
> We add `target_delimiter` between input and target which defaults to " ", such that the full input-output string is `doc_to_text(doc) + target_delimiter + doc_to_target(doc)`. `doc_to_text` and `doc_to_target` should not contain trailing right or left whitespace, respectively. For multiple choice the target will be each choice index concatenated with the delimiter.
#### Multiple choice format
......@@ -206,7 +207,7 @@ doc_to_choice: "{{[distractor1, distractor2, distractor3, correct_answer]}}"
```
Task implementers are thus able to decide what the answer choices should be for a document, and what prompt format to use.
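Concretely, here is a sketch (with a made-up document) of the request strings built for a three-choice item: each choice is appended to the context with the target delimiter, and the gold index from `doc_to_target` marks the correct one.

```python
context = "Question: What color is the sky?\nAnswer:"  # doc_to_text(doc)
choices = ["red", "green", "blue"]                      # doc_to_choice(doc)
target_delimiter = " "

# One scored string per choice; the loglikelihoods of these continuations are compared.
scored = [context + target_delimiter + choice for choice in choices]
```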
The label index can also be sourced from a feature directly. For example in `superglue/boolq`, the label index if defined in the feature `label`. We can set `doc_to_target` as simply `label`. The options or verbalizers can be written in a the form of a list `["no", "yes"]` that will correspond to the label index.
The label index can also be sourced from a feature directly. For example, in `superglue/boolq`, the label index is defined in the feature `label`. We can set `doc_to_target` as simply `label`. The options or verbalizers can be written in the form of a list `["no", "yes"]` that will correspond to the label index.
```yaml
doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
......@@ -285,7 +286,7 @@ As a heuristic check:
For more detail on the task system and advanced features, see [`docs/task_guide.md`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md) . If none of the above sound like they apply to your task, it's time to continue onto checking your task performance!
### Task name + groups (registering a task)
### Task name + tags (registering a task)
To test a task conveniently, it helps to *register* the task--that is, to give it a name and make the `lm-eval` library aware it exists!
......@@ -296,14 +297,14 @@ task: <name of the task>
```
Including a task name is mandatory.
It is often also convenient to label your task with several `groups`, or tags, though this field is optional:
It is often also convenient to label your task with several `tag` values, though this field is optional:
```yaml
group:
- group1
- group2
tag:
- tag1
- tag2
```
This will add your task to the `group1` and `group2` groups, enabling people to know how to categorize your task, and if desired run all tasks in one of these groups at once, your task along with them.
This will add your task to the `tag1` and `tag2` tags, letting people know how to categorize your task and, if desired, run all tasks with one of these tags at once, your task along with them.
If your task is not in the `lm_eval/tasks` folder, you'll need to tell the Eval Harness where to look for YAML files.
......@@ -319,7 +320,48 @@ Passing `--tasks /path/to/yaml/file` is also accepted.
### Advanced Group Configs
You can make more complete group config while also tailoring parameters for individual tasks.
While `tag` values are helpful when you want to quickly and conveniently run a set of related tasks via `--tasks my_tag_name`, we often wish to implement more complex logic. For example, the MMLU benchmark contains 57 *subtasks* that must all be *averaged* together in order to report a final 'MMLU score'.
Groupings of tasks might also use particular variants of a task--for example, we might want to default to evaluating a task as 5-shot when called as part of a given grouping, but not have a preference for number of shots when evaluating it as a standalone.
We implement this via **groups**, which are distinct from tags. Groups can be implemented via *group config* YAML files, which are laid out similarly but slightly differently to tasks' YAML configs.
The most basic form of group can be defined via a YAML config similar to the following:
```yaml
group: nli_tasks
task:
  - cb
  - anli_r1
  - rte
metadata:
  version: 1.0
```
This will behave almost identically to a `tag` that includes these 3 tasks, but with one key distinction: we'll print the `nli_tasks` group as a row (with no associated metrics) in our table of outputs, and visually show that these 3 tasks appear under its subheader.
Now, let's assume we actually want to report an aggregate score for `nli_tasks`. We would instead use a YAML config like the following:
```yaml
group: nli_tasks
task:
  - cb
  - anli_r1
  - rte
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: true # defaults to `true`. Set this to `false` to do a "macro" average (taking each subtask's average accuracy, and summing those accuracies and dividing by 3)--by default we do a "micro" average (retain all subtasks' per-document accuracies, and take the mean over all documents' accuracies to get our aggregate mean).
metadata:
  version: 1.0
```
Similar to our `metric_list` for listing out the metrics we want to calculate for a given task, we use an `aggregate_metric_list` field to specify which metric name to aggregate across subtasks, what aggregation function to use, and whether we should micro- or macro- average these metrics. See [./task_guide.md](./task_guide.md) for a full list of related sub-keys.
> [!TIP]
> Currently, we predominantly only support the aggregation of group metrics that use `mean` (either micro- or macro-averaged) over their subtasks. If you require even more complex aggregation rules, you may want to perform aggregation offline.
Group configs can be fairly complex! We can do various operations, such as defining new subtask(s) inline in our group YAML, overriding an existing task's specific config value, or nesting existing groups within our new group.
For example, let's build a config for evaluating MMLU and a few natural language inference tasks. For MMLU, we can simply list the benchmark's name as a subtask under `task`. You can configure parameters such as `num_fewshot`. If the task being configured is a group such as `mmlu` or `super_glue`, the parameters will be applied to all of its subtasks.
......@@ -331,33 +373,13 @@ task:
- cb
- anli_r1
- rte
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
- task: mmlu
  num_fewshot: 2
```
It's also important to note how you can basically insert a group config as a task. Here, to make a group of natural language inference tasks, you simply write like how you would normally write a group config but this time place that as part of a task list under the main group being built.
### Duplicate Tasks in Group Configs
There might be cases where you might want to evaluate prompts and how models perform over prompt variations. You can list an existing task (In the example below, `anli_r1`) which varying `doc_to_text` implementation. To differentiate from each variation, we can utilize `task_alias`. LM-Eval will recognize that there are multiple variations of the same tasks and differentiate them.
```yaml
group: flan_held_in
group_alias: Flan (Held-In)
task:
# ANLI R1
- group: anli_r1_flan
group_alias: ANLI R1
task:
- task: anli_r1
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nChoose your answer ..."
...
- task: anli_r1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nBased on ..."
...
```
### Configuring python classes
......@@ -382,21 +404,29 @@ task:
...
```
You can also pass a custom argument to your class by accepting `config` in the custom class constructor.
Here's how to do it:
```yaml
task: 20_newsgroups
class: !function task.Unitxt
recipe: card=cards.20_newsgroups,template=templates.classification.multi_class.title
```
In this example, `recipe` is the custom argument for the `Unitxt` class.
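A minimal sketch of such a class (not the actual `Unitxt` implementation; it assumes the harness passes the parsed YAML dict to the constructor as `config`, as described above):

```python
from lm_eval.api.task import ConfigurableTask


class MyConfigurableTask(ConfigurableTask):
    def __init__(self, config: dict = None) -> None:
        super().__init__(config=config)
        # `recipe` is the custom key set next to `class: !function ...` in the YAML.
        self.recipe = (config or {}).get("recipe")
```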
## Beautifying Table Display
To avoid conflicts, each task needs to be registered with a unique name. Because of this, slight variations of a task are still counted as unique tasks and need to be named uniquely. This can be done by appending a suffix that refers to the variation, as in MMLU, where the templates used for the Flan evaluation are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of an evaluation, especially when you have a long list of tasks or are using a benchmark that comprises many tasks. To make the table more legible, you can use `task_alias` and `group_alias` to provide alternative task and group names to be printed. For example, in `mmlu_abstract_algebra.yaml` we set `group_alias` to `stem` and `task_alias` to `abstract_algebra`.
To avoid conflicts, each task needs to be registered with a unique name. Because of this, slight variations of a task are still counted as unique tasks and need to be named uniquely. This can be done by appending a suffix that refers to the variation, as in MMLU, where the templates used for the Flan evaluation are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of an evaluation, especially when you have a long list of tasks or are using a benchmark that comprises many tasks. To make the table more legible, you can use `task_alias` and `group_alias` to provide alternative task and group names to be printed. For example, in `mmlu_abstract_algebra.yaml` we set `task_alias` to `abstract_algebra`. In group configs, a `group_alias` for a group can also be set.
```
"dataset_name": "abstract_algebra"
"description": "The following are multiple choice questions (with answers) about abstract\
\ algebra.\n\n"
"group": "mmlu_stem"
"group_alias": "stem"
"include": "_default_template_yaml"
"task": "mmlu_abstract_algebra"
"task_alias": "abstract_algebra"
```
Note: Even though `group` can be a list, for now, `group_alias` can only be a single string.
## Checking validity
......@@ -416,9 +446,9 @@ a simple eye test.
## Versioning
One key feature in LM Evaluation Harness is the ability to version tasks--that is, mark them with a specific version number that can be bumped whenever a breaking change is made.
One key feature in LM Evaluation Harness is the ability to version tasks and groups--that is, mark them with a specific version number that can be bumped whenever a breaking change is made.
This version info can be provided by adding the following to your new task config file:
This version info can be provided by adding the following to your new task or group config file:
```
metadata:
......
......@@ -16,7 +16,8 @@ Tasks are configured via the `TaskConfig` object. Below, we describe all fields
Task naming + registration:
- **task** (`str`, defaults to None) — name of the task.
- **group** (`str`, *optional*) — name of the task group(s) a task belongs to. Enables one to run all tasks with a specified tag or group name at once.
- **task_alias** (`str`, defaults to None) - Alias of the task name that will be printed in the final table results.
- **tag** (`str`, *optional*) — name of the task tags(s) a task belongs to. Enables one to run all tasks with a specified tag name at once.
Dataset configuration options:
- **dataset_path** (`str`) — The name of the dataset as listed by HF in the datasets Hub.
......@@ -55,8 +56,6 @@ Other:
## Filters
What are filters, and what is their place in the pipeline?
A key component of the `lm-evaluation-harness` library is the `Filter` object. In a typical evaluation run of the harness, we take the formatted inputs and run them through our LM, with the appropriate output type (greedy or free-form generation, or loglikelihood-based comparative scoring).
After getting scores or output text from our LM on each `Instance` or document in the dataset, we then need to feed these responses into a metric or scoring function to return scores to a user.
......@@ -295,105 +294,24 @@ Generative tasks:
Tasks using complex filtering:
- GSM8k with CoT (+ with Self-Consistency): (`lm_eval/tasks/gsm8k/gsm8k-cot.yaml` ; `lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml`)
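For reference, a filter pipeline in a task YAML has roughly the following shape (a sketch loosely based on the GSM8k CoT config; the regex shown is illustrative):
```yaml
filter_list:
  - name: "strict-match"
    filter:
      - function: "regex"
        regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)."
      - function: "take_first"
```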
## Benchmarks
# Group Configuration
When evaluating a language model, it is not unusual to test across a number of tasks that may not be related to one another, in order to assess a variety of capabilities. To this end, it can be cumbersome to list the whole set of tasks on the command line, or to add a new group name to the YAML of each individual task.
To solve this, we can create a benchmark YAML config. This is a config that contains the names of the tasks that should be included in a particular benchmark. The config consists of two main keys: `group`, which denotes the name of the benchmark, and `task`, which is where we can list the tasks. The tasks listed under `task` are task names that have already been registered. A good example is the list of tasks used to evaluate the Pythia suite.
```yaml
group: pythia
task:
- lambada_openai
- wikitext
- piqa
- sciq
- wsc
- winogrande
- arc
- logiqa
- blimp
- hendrycksTest*
```
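The group can then be invoked like any single task via `--tasks` (the model and its arguments below are illustrative):
```bash
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks pythia
```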
It is also possible to list an existing task in your benchmark configuration with some adjustments. For example, a few tasks from MMLU are included in `multimedqa`. There, the `task_alias` and `group_alias` (see [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#beautifying-table-display) for more details) are modified to suit the benchmark.
```yaml
group: multimedqa
task:
- pubmedqa
- medmcqa
- medqa_4options
- task: mmlu_anatomy
task_alias: "anatomy (mmlu)"
group_alias: null
- task: mmlu_clinical_knowledge
task_alias: "clinical_knowledge (mmlu)"
group_alias: null
...
```
To solve this, we can create a **group** yaml config. This is a config that contains the names of the tasks that should be included in a particular group. The config consists of two main keys: a `group` key which denotes the name of the group (as it would be called from the command line, e.g. `mmlu`) and a `task` key which is where we can list the tasks. The tasks listed in `task` are the task names that have been registered. A good example of a group yaml config can be found at [../lm_eval/tasks/mmlu/default/_mmlu.yaml]. See also the [New Task Guide](./new_task_guide.md) for a more in-depth and tutorial-esque explanation of how to write complex GroupConfigs.
Alternatively, benchmarks can contain tasks that are configured inline, with per-task customization. These are defined in the same way a task YAML is usually written.
## Configurations
```yaml
group: t0_eval
task:
# Coreference Resolution
- dataset_path: super_glue
dataset_name: wsc.fixed
use_prompt: promptsource:*
training_split: train
validation_split: validation
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
# Coreference Resolution
- dataset_path: winogrande
dataset_name: winogrande_xl
use_prompt: promptsource:*
training_split: train
validation_split: validation
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
...
```
Groups are configured via the `GroupConfig` object. Below, we describe all fields usable within the object, and their role in defining a group.
If the benchmark contains the same dataset but with different configurations, use `task` to differentiate between them. For example, T0-Eval evaluates on 3 versions of ANLI, but the Hugging Face dataset collects all of them in one dataset.
```YAML
group: t0_eval
task:
...
- task: anli_r1
dataset_path: anli
use_prompt: promptsource:*
training_split: train_r1
validation_split: dev_r1
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
- task: anli_r2
dataset_path: anli
use_prompt: promptsource:*
training_split: train_r2
validation_split: dev_r2
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
```
### Parameters
Calling the benchmark is done the same way we would call any task with `--tasks`. Benchmarks can be added in `lm_eval/tasks/benchmarks/`
- **group** (`str`, defaults to `None`) — name of the group. Used to invoke it from the command line.
- **group_alias** (`str`, defaults to `None`) - Alternative name for the group that will be printed in the table output.
- **task** (`Union[str, list]`, defaults to `None`) - List of tasks that constitute the group.
- **aggregate_metric_list** (`list`, defaults to `None`) - similar to `metric_list` in TaskConfigs, provide a list of configurations for metrics that should be aggregated across subtasks. Leaving empty will result in no aggregation being performed for this group. Keys for each list entry are:
- `metric: str` - the name of the metric to aggregate over (all subtasks must report a metric holding this name.)
- `aggregation: str` - what aggregation function to apply to aggregate these per-subtask metrics. **currently, only `mean` is supported.**
- `weight_by_size: bool = True` - whether to perform micro-averaging (`True`) or macro-averaging (`False`) of the subtasks' scores when reporting the group's metric. MMLU, for example, averages over per-document accuracies (the *micro average*), resulting in the same accuracy as if one simply concatenated all 57 subjects into a single dataset and evaluated accuracy on that dataset.
- `filter_list: Union[str, List[str]] = "none"` - what filter keys one should match on to aggregate results. For example, if trying to aggregate over the `exact_match` metric using `strict-match` filter for `bbh_cot_zeroshot`, then set this to be `filter_list: "strict-match"`.
- **metadata** (`dict`, *optional*) - As with TaskConfigs, a field where extra config metadata can be passed. Set the `num_fewshot` key within this to override the printed n-shot value in a results table for your group, for example.
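Putting these fields together, a group config using them might look like the following sketch (group, subtask, and alias names are illustrative):
```yaml
group: my_group
group_alias: My Group (pretty name)
task:
  - my_subtask_1
  - my_subtask_2
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: false
metadata:
  version: 1.0
```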
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "Qw83KAePAhaS"
},
"source": [
"# Releasing LM-Evaluation-Harness v0.4.0"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Z7k2vq1iAdqr"
},
"source": [
"With the vast amount of work done in the field today, it helps to have a tool that people can use easily to share their results and use to check others to ensure reported numbers are valid. The LM Evaluation Harness is one such tool the community has used extensively. We want to continue to support the community and with that in mind, we’re excited to announce a major update on the LM Evaluation Harness to further our goal for open and accessible AI research."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0gDoM0AJAvEc"
},
"source": [
"Our refactor stems from our desires to make the following believed best practices easier to carry out. \n",
"\n",
"1. Never copy results from other papers\n",
"2. Always share your exact prompts\n",
"3. Always provide model outputs\n",
"4. Qualitatively review a small batch of outputs before running evaluation jobs at scale\n",
"\n",
"We also wanted to make the library a better experience to use and to contribute or design evaluations within. New features in the new release that serve this purpose include:\n",
"\n",
"1. Faster Evaluation Runtimes (accelerated data-parallel inference with HF Transformers + Accelerate, and commonly used or faster inference libraries such as vLLM and Llama-CPP)\n",
"2. Easier addition and sharing of new tasks (YAML-based task config formats, allowing single-file sharing of custom tasks)\n",
"3. More configurability, for more advanced workflows and easier operation with modifying prompts\n",
"4. Better logging of data at runtime and post-hoc"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nnwsOpjda_YW"
},
"source": [
"In this notebook we will be going through a short tutorial on how things work."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "zAov81vTbL2K"
},
"source": [
"## Install LM-Eval"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8hiosGzq_qZg",
"outputId": "6ab73e5e-1f54-417e-a388-07e0d870b132"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting git+https://github.com/EleutherAI/lm-evaluation-harness.git@big-refactor\n",
" Cloning https://github.com/EleutherAI/lm-evaluation-harness.git (to revision big-refactor) to /tmp/pip-req-build-tnssql5s\n",
" Running command git clone --filter=blob:none --quiet https://github.com/EleutherAI/lm-evaluation-harness.git /tmp/pip-req-build-tnssql5s\n",
" Running command git checkout -b big-refactor --track origin/big-refactor\n",
" Switched to a new branch 'big-refactor'\n",
" Branch 'big-refactor' set up to track remote branch 'big-refactor' from 'origin'.\n",
" Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit 42f486ee49b65926a444cb0620870a39a5b4b0a8\n",
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
"Collecting accelerate>=0.21.0 (from lm-eval==1.0.0)\n",
" Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.4/261.4 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting evaluate (from lm-eval==1.0.0)\n",
" Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting datasets>=2.0.0 (from lm-eval==1.0.0)\n",
" Downloading datasets-2.15.0-py3-none-any.whl (521 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m521.2/521.2 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting jsonlines (from lm-eval==1.0.0)\n",
" Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n",
"Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.8.7)\n",
"Collecting peft>=0.2.0 (from lm-eval==1.0.0)\n",
" Downloading peft-0.6.2-py3-none-any.whl (174 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m174.7/174.7 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting pybind11>=2.6.2 (from lm-eval==1.0.0)\n",
" Downloading pybind11-2.11.1-py3-none-any.whl (227 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.7/227.7 kB\u001b[0m \u001b[31m12.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting pytablewriter (from lm-eval==1.0.0)\n",
" Downloading pytablewriter-1.2.0-py3-none-any.whl (111 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m111.1/111.1 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting rouge-score>=0.0.4 (from lm-eval==1.0.0)\n",
" Downloading rouge_score-0.1.2.tar.gz (17 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Collecting sacrebleu>=1.5.0 (from lm-eval==1.0.0)\n",
" Downloading sacrebleu-2.3.2-py3-none-any.whl (119 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m119.7/119.7 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (1.2.2)\n",
"Collecting sqlitedict (from lm-eval==1.0.0)\n",
" Downloading sqlitedict-2.1.0.tar.gz (21 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: torch>=1.8 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.1.0+cu118)\n",
"Collecting tqdm-multiprocess (from lm-eval==1.0.0)\n",
" Downloading tqdm_multiprocess-0.0.11-py3-none-any.whl (9.8 kB)\n",
"Requirement already satisfied: transformers>=4.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (4.35.2)\n",
"Collecting zstandard (from lm-eval==1.0.0)\n",
" Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m29.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (1.23.5)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (23.2)\n",
"Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (5.9.5)\n",
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (6.0.1)\n",
"Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (0.19.4)\n",
"Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (9.0.0)\n",
"Collecting pyarrow-hotfix (from datasets>=2.0.0->lm-eval==1.0.0)\n",
" Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n",
"Collecting dill<0.3.8,>=0.3.0 (from datasets>=2.0.0->lm-eval==1.0.0)\n",
" Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (1.5.3)\n",
"Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2.31.0)\n",
"Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (4.66.1)\n",
"Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.4.1)\n",
"Collecting multiprocess (from datasets>=2.0.0->lm-eval==1.0.0)\n",
" Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m19.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2023.6.0)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.8.6)\n",
"Collecting responses<0.19 (from evaluate->lm-eval==1.0.0)\n",
" Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n",
"Requirement already satisfied: safetensors in /usr/local/lib/python3.10/dist-packages (from peft>=0.2.0->lm-eval==1.0.0) (0.4.0)\n",
"Requirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==1.0.0) (1.4.0)\n",
"Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==1.0.0) (3.8.1)\n",
"Requirement already satisfied: six>=1.14.0 in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==1.0.0) (1.16.0)\n",
"Collecting portalocker (from sacrebleu>=1.5.0->lm-eval==1.0.0)\n",
" Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)\n",
"Requirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm-eval==1.0.0) (2023.6.3)\n",
"Requirement already satisfied: tabulate>=0.8.9 in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm-eval==1.0.0) (0.9.0)\n",
"Collecting colorama (from sacrebleu>=1.5.0->lm-eval==1.0.0)\n",
" Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n",
"Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm-eval==1.0.0) (4.9.3)\n",
"Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==1.0.0) (1.11.3)\n",
"Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==1.0.0) (1.3.2)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==1.0.0) (3.2.0)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (3.13.1)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (4.5.0)\n",
"Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (1.12)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (3.2.1)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (3.1.2)\n",
"Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (2.1.0)\n",
"Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.1->lm-eval==1.0.0) (0.15.0)\n",
"Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines->lm-eval==1.0.0) (23.1.0)\n",
"Requirement already satisfied: setuptools>=38.3.0 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm-eval==1.0.0) (67.7.2)\n",
"Collecting DataProperty<2,>=1.0.1 (from pytablewriter->lm-eval==1.0.0)\n",
" Downloading DataProperty-1.0.1-py3-none-any.whl (27 kB)\n",
"Collecting mbstrdecoder<2,>=1.0.0 (from pytablewriter->lm-eval==1.0.0)\n",
" Downloading mbstrdecoder-1.1.3-py3-none-any.whl (7.8 kB)\n",
"Collecting pathvalidate<4,>=2.3.0 (from pytablewriter->lm-eval==1.0.0)\n",
" Downloading pathvalidate-3.2.0-py3-none-any.whl (23 kB)\n",
"Collecting tabledata<2,>=1.3.1 (from pytablewriter->lm-eval==1.0.0)\n",
" Downloading tabledata-1.3.3-py3-none-any.whl (11 kB)\n",
"Collecting tcolorpy<1,>=0.0.5 (from pytablewriter->lm-eval==1.0.0)\n",
" Downloading tcolorpy-0.1.4-py3-none-any.whl (7.9 kB)\n",
"Collecting typepy[datetime]<2,>=1.3.2 (from pytablewriter->lm-eval==1.0.0)\n",
" Downloading typepy-1.3.2-py3-none-any.whl (31 kB)\n",
"Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (3.3.2)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (6.0.4)\n",
"Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (4.0.3)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (1.9.2)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (1.4.0)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (1.3.1)\n",
"Requirement already satisfied: chardet<6,>=3.0.4 in /usr/local/lib/python3.10/dist-packages (from mbstrdecoder<2,>=1.0.0->pytablewriter->lm-eval==1.0.0) (5.2.0)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==1.0.0) (3.4)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==1.0.0) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==1.0.0) (2023.7.22)\n",
"Requirement already satisfied: python-dateutil<3.0.0,>=2.8.0 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm-eval==1.0.0) (2.8.2)\n",
"Requirement already satisfied: pytz>=2018.9 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm-eval==1.0.0) (2023.3.post1)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.8->lm-eval==1.0.0) (2.1.3)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge-score>=0.0.4->lm-eval==1.0.0) (8.1.7)\n",
"Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.8->lm-eval==1.0.0) (1.3.0)\n",
"Building wheels for collected packages: lm-eval, rouge-score, sqlitedict\n",
" Building wheel for lm-eval (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for lm-eval: filename=lm_eval-1.0.0-py3-none-any.whl size=994254 sha256=88356155b19f2891981ecef948326ad6ce8ca40a6009378410ec20d0e225995a\n",
" Stored in directory: /tmp/pip-ephem-wheel-cache-9v6ye7h3/wheels/17/01/26/599c0779e9858a70a73fa8a306699b5b9a868f820c225457b0\n",
" Building wheel for rouge-score (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=6bb0d44e4881972c43ce194e7cb65233d309758cb15f0dec54590d3d2efcfc36\n",
" Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n",
" Building wheel for sqlitedict (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for sqlitedict: filename=sqlitedict-2.1.0-py3-none-any.whl size=16863 sha256=5747f7dd73ddf3d8fbcebf51b5e4f718fabe1e94bccdf16d2f22a2e65ee7fdf4\n",
" Stored in directory: /root/.cache/pip/wheels/79/d6/e7/304e0e6cb2221022c26d8161f7c23cd4f259a9e41e8bbcfabd\n",
"Successfully built lm-eval rouge-score sqlitedict\n",
"Installing collected packages: sqlitedict, zstandard, tcolorpy, pybind11, pyarrow-hotfix, portalocker, pathvalidate, mbstrdecoder, jsonlines, dill, colorama, typepy, tqdm-multiprocess, sacrebleu, rouge-score, responses, multiprocess, accelerate, datasets, DataProperty, tabledata, peft, evaluate, pytablewriter, lm-eval\n",
"Successfully installed DataProperty-1.0.1 accelerate-0.24.1 colorama-0.4.6 datasets-2.15.0 dill-0.3.7 evaluate-0.4.1 jsonlines-4.0.0 lm-eval-1.0.0 mbstrdecoder-1.1.3 multiprocess-0.70.15 pathvalidate-3.2.0 peft-0.6.2 portalocker-2.8.2 pyarrow-hotfix-0.6 pybind11-2.11.1 pytablewriter-1.2.0 responses-0.18.0 rouge-score-0.1.2 sacrebleu-2.3.2 sqlitedict-2.1.0 tabledata-1.3.3 tcolorpy-0.1.4 tqdm-multiprocess-0.0.11 typepy-1.3.2 zstandard-0.22.0\n"
]
}
],
"source": [
"# Install LM-Eval\n",
"!pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@big-refactor"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 0,
"referenced_widgets": [
"a1d3a8aa016544a78e8821c8f6199e06",
"f61ed33fad754146bdd2ac9db1ba1c48",
"bfa0af6aeff344c6845e1080a878e92e",
"fd1ad9e0367d4004aae853b91c3a7617",
"6b2d90209ec14230b3d58a74ac9b83bf",
"a73f357065d34d7baf0453ae4a8d75e2",
"46f521b73fd943c081c648fd873ebc0a",
"7c5689bc13684db8a22681f41863dddd",
"48763b6233374554ae76035c0483066f",
"4986a21eb560448fa79f4b25cde48951",
"aed3acd2f2d74003b44079c333a0698e"
]
},
"id": "uyO5MaKkZyah",
"outputId": "d46e8096-5086-4e49-967e-ea33d4a2a335"
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a1d3a8aa016544a78e8821c8f6199e06",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading builder script: 0%| | 0.00/5.67k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from lm_eval import api"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "8rfUeX6n_wkK"
},
"source": [
"## Create new evaluation tasks with config-based tasks\n",
"\n",
"Even within the same task, many works have reported numbers based on different choices of evaluation. Some report on the test sets, validation sets, or even subset of the training sets. Others have specialized prompts and verbalizers. We introduce YAMLs to allow users to easily make different variations. By leveraging the YAML configs to configure evaluations, the refactored LM-Eval takes the methods of the `Task` object and makes them configurable by setting the appropriate attributes in the config file. There, users can set the tasks they want by setting the name of the HF dataset (local tasks are also possible), the dataset splits used, and much more. Key configurations relating to prompting, such as `doc_to_text`, previously implemented as a method of the same name, are now configurable with jinja2 to allow high-level scripting to transform a HF dataset to text string as input to the model.\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HYFUhhfOSJKe"
},
"source": [
"A core-feature to LM-Eval is to configure tasks with YAML configs. With configs, you can fill preset fields to easily set up a task.\n",
"\n",
"Here, we write a demo YAML config for a multiple-choice evaluation of BoolQ:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "bg3dGROW-V39"
},
"outputs": [],
"source": [
"YAML_boolq_string = '''\n",
"task: demo_boolq\n",
"dataset_path: super_glue\n",
"dataset_name: boolq\n",
"output_type: multiple_choice\n",
"training_split: train\n",
"validation_split: validation\n",
"doc_to_text: \"{{passage}}\\nQuestion: {{question}}?\\nAnswer:\"\n",
"doc_to_target: label\n",
"doc_to_choice: [\"no\", \"yes\"]\n",
"should_decontaminate: true\n",
"doc_to_decontamination_query: passage\n",
"metric_list:\n",
" - metric: acc\n",
"'''\n",
"with open('boolq.yaml', 'w') as f:\n",
" f.write(YAML_boolq_string)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And we can now run evaluation on this task, by pointing to the config file we've just created:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"id": "LOUHK7PtQfq4"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2023-11-29:11:54:55,156 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
"2023-11-29 11:54:55.942051: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2023-11-29 11:54:55.942108: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2023-11-29 11:54:55.942142: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2023-11-29 11:54:57.066802: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"2023-11-29:11:55:00,954 INFO [__main__.py:132] Verbosity set to INFO\n",
"2023-11-29:11:55:11,038 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
"2023-11-29:11:55:11,038 INFO [__main__.py:143] Including path: ./\n",
"2023-11-29:11:55:11,046 INFO [__main__.py:205] Selected Tasks: ['demo_boolq']\n",
"2023-11-29:11:55:11,047 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
"2023-11-29:11:55:11,110 INFO [huggingface.py:120] Using device 'cuda'\n",
"config.json: 100% 571/571 [00:00<00:00, 2.87MB/s]\n",
"model.safetensors: 100% 5.68G/5.68G [00:32<00:00, 173MB/s]\n",
"tokenizer_config.json: 100% 396/396 [00:00<00:00, 2.06MB/s]\n",
"tokenizer.json: 100% 2.11M/2.11M [00:00<00:00, 11.6MB/s]\n",
"special_tokens_map.json: 100% 99.0/99.0 [00:00<00:00, 555kB/s]\n",
"2023-11-29:11:56:18,658 WARNING [task.py:614] [Task: demo_boolq] metric acc is defined, but aggregation is not. using default aggregation=mean\n",
"2023-11-29:11:56:18,658 WARNING [task.py:626] [Task: demo_boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True\n",
"Downloading builder script: 100% 30.7k/30.7k [00:00<00:00, 59.0MB/s]\n",
"Downloading metadata: 100% 38.7k/38.7k [00:00<00:00, 651kB/s]\n",
"Downloading readme: 100% 14.8k/14.8k [00:00<00:00, 37.3MB/s]\n",
"Downloading data: 100% 4.12M/4.12M [00:00<00:00, 55.1MB/s]\n",
"Generating train split: 100% 9427/9427 [00:00<00:00, 15630.89 examples/s]\n",
"Generating validation split: 100% 3270/3270 [00:00<00:00, 20002.56 examples/s]\n",
"Generating test split: 100% 3245/3245 [00:00<00:00, 20866.19 examples/s]\n",
"2023-11-29:11:56:22,315 INFO [task.py:355] Building contexts for task on rank 0...\n",
"2023-11-29:11:56:22,322 INFO [evaluator.py:319] Running loglikelihood requests\n",
"100% 20/20 [00:04<00:00, 4.37it/s]\n",
"fatal: not a git repository (or any of the parent directories): .git\n",
"hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
"| Tasks |Version|Filter|n-shot|Metric|Value| |Stderr|\n",
"|----------|-------|------|-----:|------|----:|---|-----:|\n",
"|demo_boolq|Yaml |none | 0|acc | 1|± | 0|\n",
"\n"
]
}
],
"source": [
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
" --include_path ./ \\\n",
" --tasks demo_boolq \\\n",
" --limit 10\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "LOUHK7PtQfq4"
},
"source": [
"Often, tasks are part of a larger group used to measure different capabilities. The dynamism of the field today means new dimensions of evaluation can come about which would mix and match new and older tasks alike. In LM-Eval, We can also group tasks and call that the group name to evaluate on a set of tasks easily. In this instance, let's evaluate the group `yes_or_no_tasks` which comprise of the tasks `demo_boolq` and `demo_cola`; tasks which are multiple choice tasks with options `yes` and `no` as the name suggests.\n",
"\n",
"<!-- making new groups is easier than ever, allowing user to work bottom-up by makiing individual tasks and linking them to a group or Top-Down, making a new group by listing existing tasks.\n",
"\n",
"We also show the aggregate across samples besides only showing the aggregation between subtasks. This may come in handy when certain groups want to be aggregated as a single task. -->\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"id": "fthNg3ywO-kA"
},
"outputs": [],
"source": [
"YAML_cola_string = '''\n",
"group: yes_or_no_tasks\n",
"task: demo_cola\n",
"dataset_path: glue\n",
"dataset_name: cola\n",
"output_type: multiple_choice\n",
"training_split: train\n",
"validation_split: validation\n",
"doc_to_text: \"{{sentence}}\\nQuestion: Does this sentence make sense?\\nAnswer:\"\n",
"doc_to_target: label\n",
"doc_to_choice: [\"no\", \"yes\"]\n",
"should_decontaminate: true\n",
"doc_to_decontamination_query: sentence\n",
"metric_list:\n",
" - metric: acc\n",
"'''\n",
"with open('cola.yaml', 'w') as f:\n",
" f.write(YAML_cola_string)"
]
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "Qw83KAePAhaS"
},
"source": [
"# Releasing LM-Evaluation-Harness v0.4.0"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Z7k2vq1iAdqr"
},
"source": [
"With the vast amount of work done in the field today, it helps to have a tool that people can use easily to share their results and use to check others to ensure reported numbers are valid. The LM Evaluation Harness is one such tool the community has used extensively. We want to continue to support the community and with that in mind, we’re excited to announce a major update on the LM Evaluation Harness to further our goal for open and accessible AI research."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0gDoM0AJAvEc"
},
"source": [
"Our refactor stems from our desires to make the following believed best practices easier to carry out. \n",
"\n",
"1. Never copy results from other papers\n",
"2. Always share your exact prompts\n",
"3. Always provide model outputs\n",
"4. Qualitatively review a small batch of outputs before running evaluation jobs at scale\n",
"\n",
"We also wanted to make the library a better experience to use and to contribute or design evaluations within. New features in the new release that serve this purpose include:\n",
"\n",
"1. Faster Evaluation Runtimes (accelerated data-parallel inference with HF Transformers + Accelerate, and commonly used or faster inference libraries such as vLLM and Llama-CPP)\n",
"2. Easier addition and sharing of new tasks (YAML-based task config formats, allowing single-file sharing of custom tasks)\n",
"3. More configurability, for more advanced workflows and easier operation with modifying prompts\n",
"4. Better logging of data at runtime and post-hoc"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nnwsOpjda_YW"
},
"source": [
"In this notebook we will be going through a short tutorial on how things work."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "zAov81vTbL2K"
},
"source": [
"## Install LM-Eval"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8hiosGzq_qZg",
"outputId": "6ab73e5e-1f54-417e-a388-07e0d870b132"
},
"outputs": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "XceRKCuuDtbn"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2023-11-29:11:56:33,016 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
"2023-11-29 11:56:33.852995: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2023-11-29 11:56:33.853050: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2023-11-29 11:56:33.853087: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2023-11-29 11:56:35.129047: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"2023-11-29:11:56:38,546 INFO [__main__.py:132] Verbosity set to INFO\n",
"2023-11-29:11:56:47,509 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
"2023-11-29:11:56:47,509 INFO [__main__.py:143] Including path: ./\n",
"2023-11-29:11:56:47,517 INFO [__main__.py:205] Selected Tasks: ['yes_or_no_tasks']\n",
"2023-11-29:11:56:47,520 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
"2023-11-29:11:56:47,550 INFO [huggingface.py:120] Using device 'cuda'\n",
"2023-11-29:11:57:08,743 WARNING [task.py:614] [Task: demo_cola] metric acc is defined, but aggregation is not. using default aggregation=mean\n",
"2023-11-29:11:57:08,743 WARNING [task.py:626] [Task: demo_cola] metric acc is defined, but higher_is_better is not. using default higher_is_better=True\n",
"Downloading builder script: 100% 28.8k/28.8k [00:00<00:00, 52.7MB/s]\n",
"Downloading metadata: 100% 28.7k/28.7k [00:00<00:00, 51.9MB/s]\n",
"Downloading readme: 100% 27.9k/27.9k [00:00<00:00, 48.0MB/s]\n",
"Downloading data: 100% 377k/377k [00:00<00:00, 12.0MB/s]\n",
"Generating train split: 100% 8551/8551 [00:00<00:00, 19744.58 examples/s]\n",
"Generating validation split: 100% 1043/1043 [00:00<00:00, 27057.01 examples/s]\n",
"Generating test split: 100% 1063/1063 [00:00<00:00, 22705.17 examples/s]\n",
"2023-11-29:11:57:11,698 INFO [task.py:355] Building contexts for task on rank 0...\n",
"2023-11-29:11:57:11,704 INFO [evaluator.py:319] Running loglikelihood requests\n",
"100% 20/20 [00:03<00:00, 5.15it/s]\n",
"fatal: not a git repository (or any of the parent directories): .git\n",
"hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
"| Tasks |Version|Filter|n-shot|Metric|Value| |Stderr|\n",
"|---------------|-------|------|-----:|------|----:|---|-----:|\n",
"|yes_or_no_tasks|N/A |none | 0|acc | 0.7|± |0.1528|\n",
"| - demo_cola |Yaml |none | 0|acc | 0.7|± |0.1528|\n",
"\n",
"| Groups |Version|Filter|n-shot|Metric|Value| |Stderr|\n",
"|---------------|-------|------|-----:|------|----:|---|-----:|\n",
"|yes_or_no_tasks|N/A |none | 0|acc | 0.7|± |0.1528|\n",
"\n"
]
}
],
"source": [
"# !accelerate launch --no_python\n",
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
" --include_path ./ \\\n",
" --tasks yes_or_no_tasks \\\n",
" --limit 10 \\\n",
" --output output/yes_or_no_tasks/ \\\n",
" --log_samples\n"
]
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting git+https://github.com/EleutherAI/lm-evaluation-harness.git@big-refactor\n",
" Cloning https://github.com/EleutherAI/lm-evaluation-harness.git (to revision big-refactor) to /tmp/pip-req-build-tnssql5s\n",
" Running command git clone --filter=blob:none --quiet https://github.com/EleutherAI/lm-evaluation-harness.git /tmp/pip-req-build-tnssql5s\n",
" Running command git checkout -b big-refactor --track origin/big-refactor\n",
" Switched to a new branch 'big-refactor'\n",
" Branch 'big-refactor' set up to track remote branch 'big-refactor' from 'origin'.\n",
" Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit 42f486ee49b65926a444cb0620870a39a5b4b0a8\n",
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
"Collecting accelerate>=0.21.0 (from lm-eval==1.0.0)\n",
" Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.4/261.4 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting evaluate (from lm-eval==1.0.0)\n",
" Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting datasets>=2.0.0 (from lm-eval==1.0.0)\n",
" Downloading datasets-2.15.0-py3-none-any.whl (521 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m521.2/521.2 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting jsonlines (from lm-eval==1.0.0)\n",
" Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n",
"Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.8.7)\n",
"Collecting peft>=0.2.0 (from lm-eval==1.0.0)\n",
" Downloading peft-0.6.2-py3-none-any.whl (174 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m174.7/174.7 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting pybind11>=2.6.2 (from lm-eval==1.0.0)\n",
" Downloading pybind11-2.11.1-py3-none-any.whl (227 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.7/227.7 kB\u001b[0m \u001b[31m12.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting pytablewriter (from lm-eval==1.0.0)\n",
" Downloading pytablewriter-1.2.0-py3-none-any.whl (111 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m111.1/111.1 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting rouge-score>=0.0.4 (from lm-eval==1.0.0)\n",
" Downloading rouge_score-0.1.2.tar.gz (17 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Collecting sacrebleu>=1.5.0 (from lm-eval==1.0.0)\n",
" Downloading sacrebleu-2.3.2-py3-none-any.whl (119 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m119.7/119.7 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (1.2.2)\n",
"Collecting sqlitedict (from lm-eval==1.0.0)\n",
" Downloading sqlitedict-2.1.0.tar.gz (21 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: torch>=1.8 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.1.0+cu118)\n",
"Collecting tqdm-multiprocess (from lm-eval==1.0.0)\n",
" Downloading tqdm_multiprocess-0.0.11-py3-none-any.whl (9.8 kB)\n",
"Requirement already satisfied: transformers>=4.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (4.35.2)\n",
"Collecting zstandard (from lm-eval==1.0.0)\n",
" Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m29.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (1.23.5)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (23.2)\n",
"Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (5.9.5)\n",
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (6.0.1)\n",
"Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (0.19.4)\n",
"Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (9.0.0)\n",
"Collecting pyarrow-hotfix (from datasets>=2.0.0->lm-eval==1.0.0)\n",
" Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n",
"Collecting dill<0.3.8,>=0.3.0 (from datasets>=2.0.0->lm-eval==1.0.0)\n",
" Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (1.5.3)\n",
"Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2.31.0)\n",
"Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (4.66.1)\n",
"Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.4.1)\n",
"Collecting multiprocess (from datasets>=2.0.0->lm-eval==1.0.0)\n",
" Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m19.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2023.6.0)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.8.6)\n",
"Collecting responses<0.19 (from evaluate->lm-eval==1.0.0)\n",
" Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n",
"Requirement already satisfied: safetensors in /usr/local/lib/python3.10/dist-packages (from peft>=0.2.0->lm-eval==1.0.0) (0.4.0)\n",
"Requirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==1.0.0) (1.4.0)\n",
"Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==1.0.0) (3.8.1)\n",
"Requirement already satisfied: six>=1.14.0 in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==1.0.0) (1.16.0)\n",
"Collecting portalocker (from sacrebleu>=1.5.0->lm-eval==1.0.0)\n",
" Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)\n",
"Requirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm-eval==1.0.0) (2023.6.3)\n",
"Requirement already satisfied: tabulate>=0.8.9 in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm-eval==1.0.0) (0.9.0)\n",
"Collecting colorama (from sacrebleu>=1.5.0->lm-eval==1.0.0)\n",
" Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n",
"Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm-eval==1.0.0) (4.9.3)\n",
"Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==1.0.0) (1.11.3)\n",
"Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==1.0.0) (1.3.2)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==1.0.0) (3.2.0)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (3.13.1)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (4.5.0)\n",
"Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (1.12)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (3.2.1)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (3.1.2)\n",
"Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (2.1.0)\n",
"Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.1->lm-eval==1.0.0) (0.15.0)\n",
"Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines->lm-eval==1.0.0) (23.1.0)\n",
"Requirement already satisfied: setuptools>=38.3.0 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm-eval==1.0.0) (67.7.2)\n",
"Collecting DataProperty<2,>=1.0.1 (from pytablewriter->lm-eval==1.0.0)\n",
" Downloading DataProperty-1.0.1-py3-none-any.whl (27 kB)\n",
"Collecting mbstrdecoder<2,>=1.0.0 (from pytablewriter->lm-eval==1.0.0)\n",
" Downloading mbstrdecoder-1.1.3-py3-none-any.whl (7.8 kB)\n",
"Collecting pathvalidate<4,>=2.3.0 (from pytablewriter->lm-eval==1.0.0)\n",
" Downloading pathvalidate-3.2.0-py3-none-any.whl (23 kB)\n",
"Collecting tabledata<2,>=1.3.1 (from pytablewriter->lm-eval==1.0.0)\n",
" Downloading tabledata-1.3.3-py3-none-any.whl (11 kB)\n",
"Collecting tcolorpy<1,>=0.0.5 (from pytablewriter->lm-eval==1.0.0)\n",
" Downloading tcolorpy-0.1.4-py3-none-any.whl (7.9 kB)\n",
"Collecting typepy[datetime]<2,>=1.3.2 (from pytablewriter->lm-eval==1.0.0)\n",
" Downloading typepy-1.3.2-py3-none-any.whl (31 kB)\n",
"Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (3.3.2)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (6.0.4)\n",
"Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (4.0.3)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (1.9.2)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (1.4.0)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (1.3.1)\n",
"Requirement already satisfied: chardet<6,>=3.0.4 in /usr/local/lib/python3.10/dist-packages (from mbstrdecoder<2,>=1.0.0->pytablewriter->lm-eval==1.0.0) (5.2.0)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==1.0.0) (3.4)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==1.0.0) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==1.0.0) (2023.7.22)\n",
"Requirement already satisfied: python-dateutil<3.0.0,>=2.8.0 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm-eval==1.0.0) (2.8.2)\n",
"Requirement already satisfied: pytz>=2018.9 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm-eval==1.0.0) (2023.3.post1)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.8->lm-eval==1.0.0) (2.1.3)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge-score>=0.0.4->lm-eval==1.0.0) (8.1.7)\n",
"Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.8->lm-eval==1.0.0) (1.3.0)\n",
"Building wheels for collected packages: lm-eval, rouge-score, sqlitedict\n",
" Building wheel for lm-eval (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for lm-eval: filename=lm_eval-1.0.0-py3-none-any.whl size=994254 sha256=88356155b19f2891981ecef948326ad6ce8ca40a6009378410ec20d0e225995a\n",
" Stored in directory: /tmp/pip-ephem-wheel-cache-9v6ye7h3/wheels/17/01/26/599c0779e9858a70a73fa8a306699b5b9a868f820c225457b0\n",
" Building wheel for rouge-score (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=6bb0d44e4881972c43ce194e7cb65233d309758cb15f0dec54590d3d2efcfc36\n",
" Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n",
" Building wheel for sqlitedict (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for sqlitedict: filename=sqlitedict-2.1.0-py3-none-any.whl size=16863 sha256=5747f7dd73ddf3d8fbcebf51b5e4f718fabe1e94bccdf16d2f22a2e65ee7fdf4\n",
" Stored in directory: /root/.cache/pip/wheels/79/d6/e7/304e0e6cb2221022c26d8161f7c23cd4f259a9e41e8bbcfabd\n",
"Successfully built lm-eval rouge-score sqlitedict\n",
"Installing collected packages: sqlitedict, zstandard, tcolorpy, pybind11, pyarrow-hotfix, portalocker, pathvalidate, mbstrdecoder, jsonlines, dill, colorama, typepy, tqdm-multiprocess, sacrebleu, rouge-score, responses, multiprocess, accelerate, datasets, DataProperty, tabledata, peft, evaluate, pytablewriter, lm-eval\n",
"Successfully installed DataProperty-1.0.1 accelerate-0.24.1 colorama-0.4.6 datasets-2.15.0 dill-0.3.7 evaluate-0.4.1 jsonlines-4.0.0 lm-eval-1.0.0 mbstrdecoder-1.1.3 multiprocess-0.70.15 pathvalidate-3.2.0 peft-0.6.2 portalocker-2.8.2 pyarrow-hotfix-0.6 pybind11-2.11.1 pytablewriter-1.2.0 responses-0.18.0 rouge-score-0.1.2 sacrebleu-2.3.2 sqlitedict-2.1.0 tabledata-1.3.3 tcolorpy-0.1.4 tqdm-multiprocess-0.0.11 typepy-1.3.2 zstandard-0.22.0\n"
]
}
],
"source": [
"# Install LM-Eval\n",
"!pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 0,
"referenced_widgets": [
"a1d3a8aa016544a78e8821c8f6199e06",
"f61ed33fad754146bdd2ac9db1ba1c48",
"bfa0af6aeff344c6845e1080a878e92e",
"fd1ad9e0367d4004aae853b91c3a7617",
"6b2d90209ec14230b3d58a74ac9b83bf",
"a73f357065d34d7baf0453ae4a8d75e2",
"46f521b73fd943c081c648fd873ebc0a",
"7c5689bc13684db8a22681f41863dddd",
"48763b6233374554ae76035c0483066f",
"4986a21eb560448fa79f4b25cde48951",
"aed3acd2f2d74003b44079c333a0698e"
]
},
"id": "uyO5MaKkZyah",
"outputId": "d46e8096-5086-4e49-967e-ea33d4a2a335"
},
"outputs": [
{
"cell_type": "markdown",
"metadata": {
"id": "XceRKCuuDtbn"
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a1d3a8aa016544a78e8821c8f6199e06",
"version_major": 2,
"version_minor": 0
},
"source": [
"## Edit Prompt Templates Quickly\n",
"\n",
"The following is a yaml made to evaluate the specific subtask of `high_school_geography` from MMLU. It uses the standard prompt where the we choose the letters from the options with most likelihood as the model's prediction."
"text/plain": [
"Downloading builder script: 0%| | 0.00/5.67k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "8rfUeX6n_wkK"
},
"source": [
"## Create new evaluation tasks with config-based tasks\n",
"\n",
"Even within the same task, many works have reported numbers based on different choices of evaluation. Some report on the test sets, validation sets, or even subset of the training sets. Others have specialized prompts and verbalizers. We introduce YAMLs to allow users to easily make different variations. By leveraging the YAML configs to configure evaluations, the refactored LM-Eval takes the methods of the `Task` object and makes them configurable by setting the appropriate attributes in the config file. There, users can set the tasks they want by setting the name of the HF dataset (local tasks are also possible), the dataset splits used, and much more. Key configurations relating to prompting, such as `doc_to_text`, previously implemented as a method of the same name, are now configurable with jinja2 to allow high-level scripting to transform a HF dataset to text string as input to the model.\n",
"\n"
]
},
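{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick aside, here is a minimal sketch (not LM-Eval's internal code) of the kind of transformation a Jinja2 `doc_to_text` template performs: it renders one dataset row into the prompt string handed to the model. The example row below is made up for illustration, and we assume the `jinja2` package is available."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch (not LM-Eval internals): render a doc_to_text-style Jinja2\n",
"# template against one made-up dataset row to see the resulting prompt string.\n",
"from jinja2 import Template\n",
"\n",
"doc = {\n",
"    \"passage\": \"The harness renders prompts from dataset columns.\",\n",
"    \"question\": \"does doc_to_text use Jinja2 templating\",\n",
"}\n",
"template = Template(\"{{passage}}\\nQuestion: {{question}}?\\nAnswer:\")\n",
"print(template.render(**doc))"
]
},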
{
"cell_type": "markdown",
"metadata": {
"id": "HYFUhhfOSJKe"
},
"source": [
"A core-feature to LM-Eval is to configure tasks with YAML configs. With configs, you can fill preset fields to easily set up a task.\n",
"\n",
"Here, we write a demo YAML config for a multiple-choice evaluation of BoolQ:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "bg3dGROW-V39"
},
"outputs": [],
"source": [
"YAML_boolq_string = \"\"\"\n",
"task: demo_boolq\n",
"dataset_path: super_glue\n",
"dataset_name: boolq\n",
"output_type: multiple_choice\n",
"training_split: train\n",
"validation_split: validation\n",
"doc_to_text: \"{{passage}}\\nQuestion: {{question}}?\\nAnswer:\"\n",
"doc_to_target: label\n",
"doc_to_choice: [\"no\", \"yes\"]\n",
"should_decontaminate: true\n",
"doc_to_decontamination_query: passage\n",
"metric_list:\n",
" - metric: acc\n",
"\"\"\"\n",
"with open(\"boolq.yaml\", \"w\") as f:\n",
" f.write(YAML_boolq_string)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And we can now run evaluation on this task, by pointing to the config file we've just created:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"id": "LOUHK7PtQfq4"
},
"outputs": [
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"id": "GTFvdt9kSlBG"
},
"outputs": [],
"source": [
"YAML_mmlu_geo_string = '''\n",
"group: mmlu\n",
"task: demo_mmlu_high_school_geography\n",
"dataset_path: cais/mmlu\n",
"dataset_name: high_school_geography\n",
"description: \"The following are multiple choice questions (with answers) about high school geography.\\n\\n\"\n",
"test_split: test\n",
"fewshot_split: dev\n",
"fewshot_config:\n",
" sampler: first_n\n",
"output_type: multiple_choice\n",
"doc_to_text: \"{{question.strip()}}\\nA. {{choices[0]}}\\nB. {{choices[1]}}\\nC. {{choices[2]}}\\nD. {{choices[3]}}\\nAnswer:\"\n",
"doc_to_choice: [\"A\", \"B\", \"C\", \"D\"]\n",
"doc_to_target: answer\n",
"metric_list:\n",
" - metric: acc\n",
" aggregation: mean\n",
" higher_is_better: true\n",
" - metric: acc_norm\n",
" aggregation: mean\n",
" higher_is_better: true\n",
"'''\n",
"with open('mmlu_high_school_geography.yaml', 'w') as f:\n",
" f.write(YAML_mmlu_geo_string)\n"
]
},
"name": "stdout",
"output_type": "stream",
"text": [
"2023-11-29:11:54:55,156 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
"2023-11-29 11:54:55.942051: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2023-11-29 11:54:55.942108: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2023-11-29 11:54:55.942142: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2023-11-29 11:54:57.066802: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"2023-11-29:11:55:00,954 INFO [__main__.py:132] Verbosity set to INFO\n",
"2023-11-29:11:55:11,038 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
"2023-11-29:11:55:11,038 INFO [__main__.py:143] Including path: ./\n",
"2023-11-29:11:55:11,046 INFO [__main__.py:205] Selected Tasks: ['demo_boolq']\n",
"2023-11-29:11:55:11,047 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
"2023-11-29:11:55:11,110 INFO [huggingface.py:120] Using device 'cuda'\n",
"config.json: 100% 571/571 [00:00<00:00, 2.87MB/s]\n",
"model.safetensors: 100% 5.68G/5.68G [00:32<00:00, 173MB/s]\n",
"tokenizer_config.json: 100% 396/396 [00:00<00:00, 2.06MB/s]\n",
"tokenizer.json: 100% 2.11M/2.11M [00:00<00:00, 11.6MB/s]\n",
"special_tokens_map.json: 100% 99.0/99.0 [00:00<00:00, 555kB/s]\n",
"2023-11-29:11:56:18,658 WARNING [task.py:614] [Task: demo_boolq] metric acc is defined, but aggregation is not. using default aggregation=mean\n",
"2023-11-29:11:56:18,658 WARNING [task.py:626] [Task: demo_boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True\n",
"Downloading builder script: 100% 30.7k/30.7k [00:00<00:00, 59.0MB/s]\n",
"Downloading metadata: 100% 38.7k/38.7k [00:00<00:00, 651kB/s]\n",
"Downloading readme: 100% 14.8k/14.8k [00:00<00:00, 37.3MB/s]\n",
"Downloading data: 100% 4.12M/4.12M [00:00<00:00, 55.1MB/s]\n",
"Generating train split: 100% 9427/9427 [00:00<00:00, 15630.89 examples/s]\n",
"Generating validation split: 100% 3270/3270 [00:00<00:00, 20002.56 examples/s]\n",
"Generating test split: 100% 3245/3245 [00:00<00:00, 20866.19 examples/s]\n",
"2023-11-29:11:56:22,315 INFO [task.py:355] Building contexts for task on rank 0...\n",
"2023-11-29:11:56:22,322 INFO [evaluator.py:319] Running loglikelihood requests\n",
"100% 20/20 [00:04<00:00, 4.37it/s]\n",
"fatal: not a git repository (or any of the parent directories): .git\n",
"hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
"| Tasks |Version|Filter|n-shot|Metric|Value| |Stderr|\n",
"|----------|-------|------|-----:|------|----:|---|-----:|\n",
"|demo_boolq|Yaml |none | 0|acc | 1|± | 0|\n",
"\n"
]
}
],
"source": [
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
" --include_path ./ \\\n",
" --tasks demo_boolq \\\n",
" --limit 10"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "LOUHK7PtQfq4"
},
"source": [
"Often, tasks are part of a larger group used to measure different capabilities. The dynamism of the field today means new dimensions of evaluation can come about which would mix and match new and older tasks alike. In LM-Eval, We can also group tasks and call that the group name to evaluate on a set of tasks easily. In this instance, let's evaluate the tag `yes_or_no_tasks` which comprise of the tasks `demo_boolq` and `demo_cola`; tasks which are multiple choice tasks with options `yes` and `no` as the name suggests.\n",
"\n",
"<!-- making new groups is easier than ever, allowing user to work bottom-up by makiing individual tasks and linking them to a group or Top-Down, making a new group by listing existing tasks.\n",
"\n",
"We also show the aggregate across samples besides only showing the aggregation between subtasks. This may come in handy when certain groups want to be aggregated as a single task. -->\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"id": "fthNg3ywO-kA"
},
"outputs": [],
"source": [
"YAML_cola_string = \"\"\"\n",
"tag: yes_or_no_tasks\n",
"task: demo_cola\n",
"dataset_path: glue\n",
"dataset_name: cola\n",
"output_type: multiple_choice\n",
"training_split: train\n",
"validation_split: validation\n",
"doc_to_text: \"{{sentence}}\\nQuestion: Does this sentence make sense?\\nAnswer:\"\n",
"doc_to_target: label\n",
"doc_to_choice: [\"no\", \"yes\"]\n",
"should_decontaminate: true\n",
"doc_to_decontamination_query: sentence\n",
"metric_list:\n",
" - metric: acc\n",
"\"\"\"\n",
"with open(\"cola.yaml\", \"w\") as f:\n",
" f.write(YAML_cola_string)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "XceRKCuuDtbn"
},
"outputs": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"id": "jyKOfCsKb-xy"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2023-11-29:11:57:23,598 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
"2023-11-29 11:57:24.719750: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2023-11-29 11:57:24.719806: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2023-11-29 11:57:24.719847: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2023-11-29 11:57:26.656125: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"2023-11-29:11:57:31,563 INFO [__main__.py:132] Verbosity set to INFO\n",
"2023-11-29:11:57:40,541 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
"2023-11-29:11:57:40,541 INFO [__main__.py:143] Including path: ./\n",
"2023-11-29:11:57:40,558 INFO [__main__.py:205] Selected Tasks: ['demo_mmlu_high_school_geography']\n",
"2023-11-29:11:57:40,559 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
"2023-11-29:11:57:40,589 INFO [huggingface.py:120] Using device 'cuda'\n",
"Downloading builder script: 100% 5.84k/5.84k [00:00<00:00, 17.7MB/s]\n",
"Downloading metadata: 100% 106k/106k [00:00<00:00, 892kB/s] \n",
"Downloading readme: 100% 39.7k/39.7k [00:00<00:00, 631kB/s]\n",
"Downloading data: 100% 166M/166M [00:01<00:00, 89.0MB/s]\n",
"Generating auxiliary_train split: 100% 99842/99842 [00:07<00:00, 12536.83 examples/s]\n",
"Generating test split: 100% 198/198 [00:00<00:00, 1439.20 examples/s]\n",
"Generating validation split: 100% 22/22 [00:00<00:00, 4181.76 examples/s]\n",
"Generating dev split: 100% 5/5 [00:00<00:00, 36.25 examples/s]\n",
"2023-11-29:11:58:09,798 INFO [task.py:355] Building contexts for task on rank 0...\n",
"2023-11-29:11:58:09,822 INFO [evaluator.py:319] Running loglikelihood requests\n",
"100% 40/40 [00:05<00:00, 7.86it/s]\n",
"fatal: not a git repository (or any of the parent directories): .git\n",
"hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
"| Tasks |Version|Filter|n-shot| Metric |Value| |Stderr|\n",
"|-------------------------------|-------|------|-----:|--------|----:|---|-----:|\n",
"|demo_mmlu_high_school_geography|Yaml |none | 0|acc | 0.3|± |0.1528|\n",
"| | |none | 0|acc_norm| 0.3|± |0.1528|\n",
"\n"
]
}
],
"source": [
"# !accelerate launch --no_python\n",
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
" --include_path ./ \\\n",
" --tasks demo_mmlu_high_school_geography \\\n",
" --limit 10 \\\n",
" --output output/mmlu_high_school_geography/ \\\n",
" --log_samples"
]
},
"name": "stdout",
"output_type": "stream",
"text": [
"2023-11-29:11:56:33,016 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
"2023-11-29 11:56:33.852995: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2023-11-29 11:56:33.853050: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2023-11-29 11:56:33.853087: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2023-11-29 11:56:35.129047: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"2023-11-29:11:56:38,546 INFO [__main__.py:132] Verbosity set to INFO\n",
"2023-11-29:11:56:47,509 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
"2023-11-29:11:56:47,509 INFO [__main__.py:143] Including path: ./\n",
"2023-11-29:11:56:47,517 INFO [__main__.py:205] Selected Tasks: ['yes_or_no_tasks']\n",
"2023-11-29:11:56:47,520 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
"2023-11-29:11:56:47,550 INFO [huggingface.py:120] Using device 'cuda'\n",
"2023-11-29:11:57:08,743 WARNING [task.py:614] [Task: demo_cola] metric acc is defined, but aggregation is not. using default aggregation=mean\n",
"2023-11-29:11:57:08,743 WARNING [task.py:626] [Task: demo_cola] metric acc is defined, but higher_is_better is not. using default higher_is_better=True\n",
"Downloading builder script: 100% 28.8k/28.8k [00:00<00:00, 52.7MB/s]\n",
"Downloading metadata: 100% 28.7k/28.7k [00:00<00:00, 51.9MB/s]\n",
"Downloading readme: 100% 27.9k/27.9k [00:00<00:00, 48.0MB/s]\n",
"Downloading data: 100% 377k/377k [00:00<00:00, 12.0MB/s]\n",
"Generating train split: 100% 8551/8551 [00:00<00:00, 19744.58 examples/s]\n",
"Generating validation split: 100% 1043/1043 [00:00<00:00, 27057.01 examples/s]\n",
"Generating test split: 100% 1063/1063 [00:00<00:00, 22705.17 examples/s]\n",
"2023-11-29:11:57:11,698 INFO [task.py:355] Building contexts for task on rank 0...\n",
"2023-11-29:11:57:11,704 INFO [evaluator.py:319] Running loglikelihood requests\n",
"100% 20/20 [00:03<00:00, 5.15it/s]\n",
"fatal: not a git repository (or any of the parent directories): .git\n",
"hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
"| Tasks |Version|Filter|n-shot|Metric|Value| |Stderr|\n",
"|---------------|-------|------|-----:|------|----:|---|-----:|\n",
"|yes_or_no_tasks|N/A |none | 0|acc | 0.7|± |0.1528|\n",
"| - demo_cola |Yaml |none | 0|acc | 0.7|± |0.1528|\n",
"\n",
"| Groups |Version|Filter|n-shot|Metric|Value| |Stderr|\n",
"|---------------|-------|------|-----:|------|----:|---|-----:|\n",
"|yes_or_no_tasks|N/A |none | 0|acc | 0.7|± |0.1528|\n",
"\n"
]
}
],
"source": [
"# !accelerate launch --no_python\n",
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
" --include_path ./ \\\n",
" --tasks yes_or_no_tasks \\\n",
" --limit 10 \\\n",
" --output output/yes_or_no_tasks/ \\\n",
" --log_samples"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XceRKCuuDtbn"
},
"source": [
"## Edit Prompt Templates Quickly\n",
"\n",
"The following is a yaml made to evaluate the specific subtask of `high_school_geography` from MMLU. It uses the standard prompt where the we choose the letters from the options with most likelihood as the model's prediction."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"id": "GTFvdt9kSlBG"
},
"outputs": [],
"source": [
"YAML_mmlu_geo_string = \"\"\"\n",
"task: demo_mmlu_high_school_geography\n",
"dataset_path: cais/mmlu\n",
"dataset_name: high_school_geography\n",
"description: \"The following are multiple choice questions (with answers) about high school geography.\\n\\n\"\n",
"test_split: test\n",
"fewshot_split: dev\n",
"fewshot_config:\n",
" sampler: first_n\n",
"output_type: multiple_choice\n",
"doc_to_text: \"{{question.strip()}}\\nA. {{choices[0]}}\\nB. {{choices[1]}}\\nC. {{choices[2]}}\\nD. {{choices[3]}}\\nAnswer:\"\n",
"doc_to_choice: [\"A\", \"B\", \"C\", \"D\"]\n",
"doc_to_target: answer\n",
"metric_list:\n",
" - metric: acc\n",
" aggregation: mean\n",
" higher_is_better: true\n",
" - metric: acc_norm\n",
" aggregation: mean\n",
" higher_is_better: true\n",
"\"\"\"\n",
"with open(\"mmlu_high_school_geography.yaml\", \"w\") as f:\n",
" f.write(YAML_mmlu_geo_string)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"id": "jyKOfCsKb-xy"
},
"outputs": [
{
"cell_type": "markdown",
"metadata": {
"id": "jyKOfCsKb-xy"
},
"source": [
"We could also evaluate this task in a different way. For example, instead of observing the loglikelihood of the letters, we can instead evaluate on the choices themselves as the continuation. This is done by simply changing `doc_to_choice` from a list of letters to the corresponding `choices` field from the HF dataset. We write `\"{{choices}}\"` so that the string field is interpreted as jinja string that acquires the list from the HF dataset directly.\n",
"\n",
"Another convenient feature here is since we're only modifying the `doc_to_choice` and the rest of config is the same as the task above, we can use the above configuration as a template by using `include: mmlu_high_school_geography.yaml` to load the config from that file. We'll need to add a unique task name as to not colide with the existing yaml config we're including. For this case we'll simply name this one `mmlu_high_school_geography_continuation`. `doc_to_text` is added here just for sake of clarity."
]
},
"name": "stdout",
"output_type": "stream",
"text": [
"2023-11-29:11:57:23,598 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
"2023-11-29 11:57:24.719750: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2023-11-29 11:57:24.719806: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2023-11-29 11:57:24.719847: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2023-11-29 11:57:26.656125: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"2023-11-29:11:57:31,563 INFO [__main__.py:132] Verbosity set to INFO\n",
"2023-11-29:11:57:40,541 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
"2023-11-29:11:57:40,541 INFO [__main__.py:143] Including path: ./\n",
"2023-11-29:11:57:40,558 INFO [__main__.py:205] Selected Tasks: ['demo_mmlu_high_school_geography']\n",
"2023-11-29:11:57:40,559 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
"2023-11-29:11:57:40,589 INFO [huggingface.py:120] Using device 'cuda'\n",
"Downloading builder script: 100% 5.84k/5.84k [00:00<00:00, 17.7MB/s]\n",
"Downloading metadata: 100% 106k/106k [00:00<00:00, 892kB/s] \n",
"Downloading readme: 100% 39.7k/39.7k [00:00<00:00, 631kB/s]\n",
"Downloading data: 100% 166M/166M [00:01<00:00, 89.0MB/s]\n",
"Generating auxiliary_train split: 100% 99842/99842 [00:07<00:00, 12536.83 examples/s]\n",
"Generating test split: 100% 198/198 [00:00<00:00, 1439.20 examples/s]\n",
"Generating validation split: 100% 22/22 [00:00<00:00, 4181.76 examples/s]\n",
"Generating dev split: 100% 5/5 [00:00<00:00, 36.25 examples/s]\n",
"2023-11-29:11:58:09,798 INFO [task.py:355] Building contexts for task on rank 0...\n",
"2023-11-29:11:58:09,822 INFO [evaluator.py:319] Running loglikelihood requests\n",
"100% 40/40 [00:05<00:00, 7.86it/s]\n",
"fatal: not a git repository (or any of the parent directories): .git\n",
"hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
"| Tasks |Version|Filter|n-shot| Metric |Value| |Stderr|\n",
"|-------------------------------|-------|------|-----:|--------|----:|---|-----:|\n",
"|demo_mmlu_high_school_geography|Yaml |none | 0|acc | 0.3|± |0.1528|\n",
"| | |none | 0|acc_norm| 0.3|± |0.1528|\n",
"\n"
]
}
],
"source": [
"# !accelerate launch --no_python\n",
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
" --include_path ./ \\\n",
" --tasks demo_mmlu_high_school_geography \\\n",
" --limit 10 \\\n",
" --output output/mmlu_high_school_geography/ \\\n",
" --log_samples"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jyKOfCsKb-xy"
},
"source": [
"We could also evaluate this task in a different way. For example, instead of observing the loglikelihood of the letters, we can instead evaluate on the choices themselves as the continuation. This is done by simply changing `doc_to_choice` from a list of letters to the corresponding `choices` field from the HF dataset. We write `\"{{choices}}\"` so that the string field is interpreted as jinja string that acquires the list from the HF dataset directly.\n",
"\n",
"Another convenient feature here is since we're only modifying the `doc_to_choice` and the rest of config is the same as the task above, we can use the above configuration as a template by using `include: mmlu_high_school_geography.yaml` to load the config from that file. We'll need to add a unique task name as to not colide with the existing yaml config we're including. For this case we'll simply name this one `mmlu_high_school_geography_continuation`. `doc_to_text` is added here just for sake of clarity."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "lqElwU54TaK-"
},
"outputs": [],
"source": [
"YAML_mmlu_geo_string = \"\"\"\n",
"include: mmlu_high_school_geography.yaml\n",
"task: demo_mmlu_high_school_geography_continuation\n",
"doc_to_text: \"{{question.strip()}}\\nA. {{choices[0]}}\\nB. {{choices[1]}}\\nC. {{choices[2]}}\\nD. {{choices[3]}}\\nAnswer:\"\n",
"doc_to_choice: \"{{choices}}\"\n",
"\"\"\"\n",
"with open(\"mmlu_high_school_geography_continuation.yaml\", \"w\") as f:\n",
" f.write(YAML_mmlu_geo_string)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"id": "-_CVnDirdy7j"
},
"outputs": [
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "lqElwU54TaK-"
},
"outputs": [],
"source": [
"YAML_mmlu_geo_string = '''\n",
"include: mmlu_high_school_geography.yaml\n",
"task: demo_mmlu_high_school_geography_continuation\n",
"doc_to_text: \"{{question.strip()}}\\nA. {{choices[0]}}\\nB. {{choices[1]}}\\nC. {{choices[2]}}\\nD. {{choices[3]}}\\nAnswer:\"\n",
"doc_to_choice: \"{{choices}}\"\n",
"'''\n",
"with open('mmlu_high_school_geography_continuation.yaml', 'w') as f:\n",
" f.write(YAML_mmlu_geo_string)\n"
]
},
"name": "stdout",
"output_type": "stream",
"text": [
"2023-11-29:11:58:21,284 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
"2023-11-29 11:58:22.850159: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2023-11-29 11:58:22.850219: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2023-11-29 11:58:22.850254: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2023-11-29 11:58:24.948103: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"2023-11-29:11:58:28,460 INFO [__main__.py:132] Verbosity set to INFO\n",
"2023-11-29:11:58:37,935 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
"2023-11-29:11:58:37,935 INFO [__main__.py:143] Including path: ./\n",
"2023-11-29:11:58:37,969 INFO [__main__.py:205] Selected Tasks: ['demo_mmlu_high_school_geography_continuation']\n",
"2023-11-29:11:58:37,972 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
"2023-11-29:11:58:38,008 INFO [huggingface.py:120] Using device 'cuda'\n",
"2023-11-29:11:58:59,758 INFO [task.py:355] Building contexts for task on rank 0...\n",
"2023-11-29:11:58:59,777 INFO [evaluator.py:319] Running loglikelihood requests\n",
"100% 40/40 [00:02<00:00, 16.23it/s]\n",
"fatal: not a git repository (or any of the parent directories): .git\n",
"hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
"| Tasks |Version|Filter|n-shot| Metric |Value| |Stderr|\n",
"|--------------------------------------------|-------|------|-----:|--------|----:|---|-----:|\n",
"|demo_mmlu_high_school_geography_continuation|Yaml |none | 0|acc | 0.1|± |0.1000|\n",
"| | |none | 0|acc_norm| 0.2|± |0.1333|\n",
"\n"
]
}
],
"source": [
"# !accelerate launch --no_python\n",
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
" --include_path ./ \\\n",
" --tasks demo_mmlu_high_school_geography_continuation \\\n",
" --limit 10 \\\n",
" --output output/mmlu_high_school_geography_continuation/ \\\n",
" --log_samples"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-_CVnDirdy7j"
},
"source": [
"If we take a look at the samples, we can see that it is in fact evaluating the continuation based on the choices rather than the letters."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"id": "duBDqC6PAdjL"
},
"outputs": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"id": "-_CVnDirdy7j"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2023-11-29:11:58:21,284 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
"2023-11-29 11:58:22.850159: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2023-11-29 11:58:22.850219: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2023-11-29 11:58:22.850254: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2023-11-29 11:58:24.948103: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"2023-11-29:11:58:28,460 INFO [__main__.py:132] Verbosity set to INFO\n",
"2023-11-29:11:58:37,935 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
"2023-11-29:11:58:37,935 INFO [__main__.py:143] Including path: ./\n",
"2023-11-29:11:58:37,969 INFO [__main__.py:205] Selected Tasks: ['demo_mmlu_high_school_geography_continuation']\n",
"2023-11-29:11:58:37,972 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
"2023-11-29:11:58:38,008 INFO [huggingface.py:120] Using device 'cuda'\n",
"2023-11-29:11:58:59,758 INFO [task.py:355] Building contexts for task on rank 0...\n",
"2023-11-29:11:58:59,777 INFO [evaluator.py:319] Running loglikelihood requests\n",
"100% 40/40 [00:02<00:00, 16.23it/s]\n",
"fatal: not a git repository (or any of the parent directories): .git\n",
"hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
"| Tasks |Version|Filter|n-shot| Metric |Value| |Stderr|\n",
"|--------------------------------------------|-------|------|-----:|--------|----:|---|-----:|\n",
"|demo_mmlu_high_school_geography_continuation|Yaml |none | 0|acc | 0.1|± |0.1000|\n",
"| | |none | 0|acc_norm| 0.2|± |0.1333|\n",
"\n"
]
}
],
"source": [
"# !accelerate launch --no_python\n",
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
" --include_path ./ \\\n",
" --tasks demo_mmlu_high_school_geography_continuation \\\n",
" --limit 10 \\\n",
" --output output/mmlu_high_school_geography_continuation/ \\\n",
" --log_samples\n"
"data": {
"application/javascript": "\n ((filepath) => {{\n if (!google.colab.kernel.accessAllowed) {{\n return;\n }}\n google.colab.files.view(filepath);\n }})(\"/content/output/mmlu_high_school_geography_continuation/pretrained__EleutherAI__pythia-2.8b_demo_mmlu_high_school_geography_continuation.jsonl\")",
"text/plain": [
"<IPython.core.display.Javascript object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from google.colab import files\n",
"\n",
"\n",
"files.view(\n",
" \"output/mmlu_high_school_geography_continuation/pretrained__EleutherAI__pythia-2.8b_demo_mmlu_high_school_geography_continuation.jsonl\"\n",
")"
]
},
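{
"cell_type": "markdown",
"metadata": {},
"source": [
"Outside of Colab, you can peek at the same samples file with plain Python. This is just a sketch: it assumes the `--log_samples` output is standard JSON Lines (one JSON object per evaluated document) and reuses the path shown above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: inspect the --log_samples output without Colab, assuming the file is\n",
"# standard JSON Lines (one JSON object per evaluated document).\n",
"import json\n",
"\n",
"path = (\n",
"    \"output/mmlu_high_school_geography_continuation/\"\n",
"    \"pretrained__EleutherAI__pythia-2.8b_demo_mmlu_high_school_geography_continuation.jsonl\"\n",
")\n",
"with open(path) as f:\n",
"    first_record = json.loads(f.readline())\n",
"print(sorted(first_record.keys()))"
]
},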
{
"cell_type": "markdown",
"metadata": {
"id": "6p0-KPwAgK5j"
},
"source": [
"## Closer Look at YAML Fields\n",
"\n",
"To prepare a task we can simply fill in a YAML config with the relevant information.\n",
"\n",
"`output_type`\n",
"The current provided evaluation types comprise of the following:\n",
"1. `loglikelihood`: Evaluates the loglikelihood of a continuation, conditioned on some input string.\n",
"2. `loglikelihood_rolling`: evaluate the loglikelihood of producing a string, conditioned on the empty string. (Used for perplexity evaluations)\n",
"3. `multiple_choice`: Evaluates loglikelihood among the a number of choices predicted by the model.\n",
"4. `greedy_until`: Model outputs greedy generation (can be configured to to use beam search and other generation-related parameters)\n",
"\n",
"The core prompt revolves around 3 fields.\n",
"1. `doc_to_text`: Denotes the prompt template that will be used as input to the model.\n",
"2. `doc_to_choice`: Available choices that will be used as continuation for the model. This is used when the `output_type` is `multiple_choice`, and otherwise can be left as `None`.\n",
"3. `doc_to_target`: When `output_type` is `multiple_choice`, this can be an index that corresponds to the correct answer, or the answer string itself (must be a subset of `doc_to_choice`). For other tasks, this is expected to be a string. You can fill this field with a feature name from the HF dataset so long as the resulting feature follows the conditioned described.\n",
"\n",
"These three fields can be expressed as strings, column names from the source dataset, or as Jinja2 templates that can use fields from the source dataset as variables.\n"
]
},
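{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make point 3 above concrete, here is a small sketch. It reuses the CoLA config from earlier via `include`, but writes `doc_to_target` as the answer string itself rather than the label index: the Jinja expression maps the integer `label` to one of the strings in `doc_to_choice`. The task name `demo_cola_string_target` is made up for this example, and the cell only writes the YAML file; it is not run in this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: express doc_to_target as the answer string (an element of\n",
"# doc_to_choice) instead of an index. The task name is hypothetical and this\n",
"# cell only writes the YAML; run it with `--include_path ./` if desired.\n",
"YAML_cola_string_target = \"\"\"\n",
"include: cola.yaml\n",
"task: demo_cola_string_target\n",
"doc_to_target: \"{{['no', 'yes'][label]}}\"\n",
"\"\"\"\n",
"with open(\"cola_string_target.yaml\", \"w\") as f:\n",
"    f.write(YAML_cola_string_target)"
]
},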
{
"cell_type": "markdown",
"metadata": {
"id": "6p0-KPwAgK5j"
},
"source": [
"## What if Jinja is not Sufficient?\n",
"\n",
"There can be times where the Jinja2 templating language is not enough to make the prompt we had in mind. There are a few ways to circumvent this limitation:\n",
"\n",
"1. Use `!function` operator for the prompt-related fields to pass a python function that takes as input the dataset row, and will output the prompt template component.\n",
"2. Perform a transformation on the dataset beforehand."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below, we show an example of using `!function` to create `doc_to_text` from a python function:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "DYZ5c0JhR1lJ",
"outputId": "ca945235-fb9e-4f17-8bfa-78e7d6ec1490"
},
"outputs": [
{
"cell_type": "markdown",
"metadata": {
"id": "-_CVnDirdy7j"
},
"source": [
"If we take a look at the samples, we can see that it is in fact evaluating the continuation based on the choices rather than the letters."
]
"name": "stdout",
"output_type": "stream",
"text": [
"2023-11-29:11:59:08,312 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
"2023-11-29 11:59:09.348327: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2023-11-29 11:59:09.348387: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2023-11-29 11:59:09.348421: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2023-11-29 11:59:10.573752: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"2023-11-29:11:59:14,044 INFO [__main__.py:132] Verbosity set to INFO\n",
"2023-11-29:11:59:23,654 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
"2023-11-29:11:59:23,654 INFO [__main__.py:143] Including path: ./\n",
"2023-11-29:11:59:23,678 INFO [__main__.py:205] Selected Tasks: ['demo_mmlu_high_school_geography_function_prompt']\n",
"2023-11-29:11:59:23,679 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
"2023-11-29:11:59:23,708 INFO [huggingface.py:120] Using device 'cuda'\n",
"2023-11-29:11:59:44,516 INFO [task.py:355] Building contexts for task on rank 0...\n",
"2023-11-29:11:59:44,524 INFO [evaluator.py:319] Running loglikelihood requests\n",
"100% 40/40 [00:02<00:00, 15.41it/s]\n",
"fatal: not a git repository (or any of the parent directories): .git\n",
"hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
"| Tasks |Version|Filter|n-shot| Metric |Value| |Stderr|\n",
"|-----------------------------------------------|-------|------|-----:|--------|----:|---|-----:|\n",
"|demo_mmlu_high_school_geography_function_prompt|Yaml |none | 0|acc | 0.1|± |0.1000|\n",
"| | |none | 0|acc_norm| 0.2|± |0.1333|\n",
"\n"
]
}
],
"source": [
"YAML_mmlu_geo_string = \"\"\"\n",
"include: mmlu_high_school_geography.yaml\n",
"task: demo_mmlu_high_school_geography_function_prompt\n",
"doc_to_text: !function utils.doc_to_text\n",
"doc_to_choice: \"{{choices}}\"\n",
"\"\"\"\n",
"with open(\"demo_mmlu_high_school_geography_function_prompt.yaml\", \"w\") as f:\n",
" f.write(YAML_mmlu_geo_string)\n",
"\n",
"DOC_TO_TEXT = \"\"\"\n",
"def doc_to_text(x):\n",
" question = x[\"question\"].strip()\n",
" choices = x[\"choices\"]\n",
" option_a = choices[0]\n",
" option_b = choices[1]\n",
" option_c = choices[2]\n",
" option_d = choices[3]\n",
" return f\"{question}\\\\nA. {option_a}\\\\nB. {option_b}\\\\nC. {option_c}\\\\nD. {option_d}\\\\nAnswer:\"\n",
"\"\"\"\n",
"with open(\"utils.py\", \"w\") as f:\n",
" f.write(DOC_TO_TEXT)\n",
"\n",
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
" --include_path ./ \\\n",
" --tasks demo_mmlu_high_school_geography_function_prompt \\\n",
" --limit 10 \\\n",
" --output output/demo_mmlu_high_school_geography_function_prompt/ \\\n",
" --log_samples"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we'll also show how to do this via preprocessing the dataset as necessary using the `process_docs` config field:\n",
"\n",
"We will write a function that will modify each document in our evaluation dataset's split to add a field that is suitable for us to use in `doc_to_text`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"YAML_mmlu_geo_string = \"\"\"\n",
"include: mmlu_high_school_geography.yaml\n",
"task: demo_mmlu_high_school_geography_function_prompt_2\n",
"process_docs: !function utils_process_docs.process_docs\n",
"doc_to_text: \"{{input}}\"\n",
"doc_to_choice: \"{{choices}}\"\n",
"\"\"\"\n",
"with open(\"demo_mmlu_high_school_geography_process_docs.yaml\", \"w\") as f:\n",
" f.write(YAML_mmlu_geo_string)\n",
"\n",
"DOC_TO_TEXT = \"\"\"\n",
"def process_docs(dataset):\n",
" def _process_doc(x):\n",
" question = x[\"question\"].strip()\n",
" choices = x[\"choices\"]\n",
" option_a = choices[0]\n",
" option_b = choices[1]\n",
" option_c = choices[2]\n",
" option_d = choices[3]\n",
" doc[\"input\"] = f\"{question}\\\\nA. {option_a}\\\\nB. {option_b}\\\\nC. {option_c}\\\\nD. {option_d}\\\\nAnswer:\"\n",
" return out_doc\n",
"\n",
" return dataset.map(_process_doc)\n",
"\"\"\"\n",
"\n",
"with open(\"utils_process_docs.py\", \"w\") as f:\n",
" f.write(DOC_TO_TEXT)\n",
"\n",
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
" --include_path ./ \\\n",
" --tasks demo_mmlu_high_school_geography_function_prompt_2 \\\n",
" --limit 10 \\\n",
" --output output/demo_mmlu_high_school_geography_function_prompt_2/ \\\n",
" --log_samples"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We hope that this explainer gives you a sense of what can be done with and how to work with LM-Evaluation-Harnes v0.4.0 ! \n",
"\n",
"For more information, check out our documentation pages in the `docs/` folder, and if you have questions, please raise them in GitHub issues, or in #lm-thunderdome or #release-discussion on the EleutherAI discord server."
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [
"zAov81vTbL2K"
],
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"46f521b73fd943c081c648fd873ebc0a": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "DescriptionStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"id": "duBDqC6PAdjL"
},
"outputs": [
{
"data": {
"application/javascript": "\n ((filepath) => {{\n if (!google.colab.kernel.accessAllowed) {{\n return;\n }}\n google.colab.files.view(filepath);\n }})(\"/content/output/mmlu_high_school_geography_continuation/pretrained__EleutherAI__pythia-2.8b_demo_mmlu_high_school_geography_continuation.jsonl\")",
"text/plain": [
"<IPython.core.display.Javascript object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from google.colab import files\n",
"files.view(\"output/mmlu_high_school_geography_continuation/pretrained__EleutherAI__pythia-2.8b_demo_mmlu_high_school_geography_continuation.jsonl\")\n"
]
"48763b6233374554ae76035c0483066f": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "ProgressStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
{
"cell_type": "markdown",
"metadata": {
"id": "6p0-KPwAgK5j"
},
"source": [
"## Closer Look at YAML Fields\n",
"\n",
"To prepare a task we can simply fill in a YAML config with the relevant information.\n",
"\n",
"`output_type`\n",
"The current provided evaluation types comprise of the following:\n",
"1. `loglikelihood`: Evaluates the loglikelihood of a continuation, conditioned on some input string.\n",
"2. `loglikelihood_rolling`: evaluate the loglikelihood of producing a string, conditioned on the empty string. (Used for perplexity evaluations)\n",
"3. `multiple_choice`: Evaluates loglikelihood among the a number of choices predicted by the model.\n",
"4. `greedy_until`: Model outputs greedy generation (can be configured to to use beam search and other generation-related parameters)\n",
"\n",
"The core prompt revolves around 3 fields.\n",
"1. `doc_to_text`: Denotes the prompt template that will be used as input to the model.\n",
"2. `doc_to_choice`: Available choices that will be used as continuation for the model. This is used when the `output_type` is `multiple_choice`, and otherwise can be left as `None`.\n",
"3. `doc_to_target`: When `output_type` is `multiple_choice`, this can be an index that corresponds to the correct answer, or the answer string itself (must be a subset of `doc_to_choice`). For other tasks, this is expected to be a string. You can fill this field with a feature name from the HF dataset so long as the resulting feature follows the conditioned described.\n",
"\n",
"These three fields can be expressed as strings, column names from the source dataset, or as Jinja2 templates that can use fields from the source dataset as variables.\n"
]
"4986a21eb560448fa79f4b25cde48951": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
{
"cell_type": "markdown",
"metadata": {
"id": "6p0-KPwAgK5j"
},
"source": [
"## What if Jinja is not Sufficient?\n",
"\n",
"There can be times where the Jinja2 templating language is not enough to make the prompt we had in mind. There are a few ways to circumvent this limitation:\n",
"\n",
"1. Use `!function` operator for the prompt-related fields to pass a python function that takes as input the dataset row, and will output the prompt template component.\n",
"2. Perform a transformation on the dataset beforehand."
]
"6b2d90209ec14230b3d58a74ac9b83bf": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below, we show an example of using `!function` to create `doc_to_text` from a python function:"
]
"7c5689bc13684db8a22681f41863dddd": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "DYZ5c0JhR1lJ",
"outputId": "ca945235-fb9e-4f17-8bfa-78e7d6ec1490"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2023-11-29:11:59:08,312 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
"2023-11-29 11:59:09.348327: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2023-11-29 11:59:09.348387: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2023-11-29 11:59:09.348421: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2023-11-29 11:59:10.573752: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"2023-11-29:11:59:14,044 INFO [__main__.py:132] Verbosity set to INFO\n",
"2023-11-29:11:59:23,654 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
"2023-11-29:11:59:23,654 INFO [__main__.py:143] Including path: ./\n",
"2023-11-29:11:59:23,678 INFO [__main__.py:205] Selected Tasks: ['demo_mmlu_high_school_geography_function_prompt']\n",
"2023-11-29:11:59:23,679 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
"2023-11-29:11:59:23,708 INFO [huggingface.py:120] Using device 'cuda'\n",
"2023-11-29:11:59:44,516 INFO [task.py:355] Building contexts for task on rank 0...\n",
"2023-11-29:11:59:44,524 INFO [evaluator.py:319] Running loglikelihood requests\n",
"100% 40/40 [00:02<00:00, 15.41it/s]\n",
"fatal: not a git repository (or any of the parent directories): .git\n",
"hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
"| Tasks |Version|Filter|n-shot| Metric |Value| |Stderr|\n",
"|-----------------------------------------------|-------|------|-----:|--------|----:|---|-----:|\n",
"|demo_mmlu_high_school_geography_function_prompt|Yaml |none | 0|acc | 0.1|± |0.1000|\n",
"| | |none | 0|acc_norm| 0.2|± |0.1333|\n",
"\n"
]
}
"a1d3a8aa016544a78e8821c8f6199e06": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HBoxModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_f61ed33fad754146bdd2ac9db1ba1c48",
"IPY_MODEL_bfa0af6aeff344c6845e1080a878e92e",
"IPY_MODEL_fd1ad9e0367d4004aae853b91c3a7617"
],
"source": [
"YAML_mmlu_geo_string = '''\n",
"include: mmlu_high_school_geography.yaml\n",
"task: demo_mmlu_high_school_geography_function_prompt\n",
"doc_to_text: !function utils.doc_to_text\n",
"doc_to_choice: \"{{choices}}\"\n",
"'''\n",
"with open('demo_mmlu_high_school_geography_function_prompt.yaml', 'w') as f:\n",
" f.write(YAML_mmlu_geo_string)\n",
"\n",
"DOC_TO_TEXT = '''\n",
"def doc_to_text(x):\n",
" question = x[\"question\"].strip()\n",
" choices = x[\"choices\"]\n",
" option_a = choices[0]\n",
" option_b = choices[1]\n",
" option_c = choices[2]\n",
" option_d = choices[3]\n",
" return f\"{question}\\\\nA. {option_a}\\\\nB. {option_b}\\\\nC. {option_c}\\\\nD. {option_d}\\\\nAnswer:\"\n",
"'''\n",
"with open('utils.py', 'w') as f:\n",
" f.write(DOC_TO_TEXT)\n",
"\n",
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
" --include_path ./ \\\n",
" --tasks demo_mmlu_high_school_geography_function_prompt \\\n",
" --limit 10 \\\n",
" --output output/demo_mmlu_high_school_geography_function_prompt/ \\\n",
" --log_samples\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we'll also show how to do this via preprocessing the dataset as necessary using the `process_docs` config field:\n",
"\n",
"We will write a function that will modify each document in our evaluation dataset's split to add a field that is suitable for us to use in `doc_to_text`."
]
"layout": "IPY_MODEL_6b2d90209ec14230b3d58a74ac9b83bf"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"YAML_mmlu_geo_string = '''\n",
"include: mmlu_high_school_geography.yaml\n",
"task: demo_mmlu_high_school_geography_function_prompt_2\n",
"process_docs: !function utils_process_docs.process_docs\n",
"doc_to_text: \"{{input}}\"\n",
"doc_to_choice: \"{{choices}}\"\n",
"'''\n",
"with open('demo_mmlu_high_school_geography_process_docs.yaml', 'w') as f:\n",
" f.write(YAML_mmlu_geo_string)\n",
"\n",
"DOC_TO_TEXT = '''\n",
"def process_docs(dataset):\n",
" def _process_doc(x):\n",
" question = x[\"question\"].strip()\n",
" choices = x[\"choices\"]\n",
" option_a = choices[0]\n",
" option_b = choices[1]\n",
" option_c = choices[2]\n",
" option_d = choices[3]\n",
" doc[\"input\"] = f\"{question}\\\\nA. {option_a}\\\\nB. {option_b}\\\\nC. {option_c}\\\\nD. {option_d}\\\\nAnswer:\"\n",
" return out_doc\n",
"\n",
" return dataset.map(_process_doc)\n",
"'''\n",
"\n",
"with open('utils_process_docs.py', 'w') as f:\n",
" f.write(DOC_TO_TEXT)\n",
"\n",
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
" --include_path ./ \\\n",
" --tasks demo_mmlu_high_school_geography_function_prompt_2 \\\n",
" --limit 10 \\\n",
" --output output/demo_mmlu_high_school_geography_function_prompt_2/ \\\n",
" --log_samples\n"
]
"a73f357065d34d7baf0453ae4a8d75e2": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We hope that this explainer gives you a sense of what can be done with and how to work with LM-Evaluation-Harnes v0.4.0 ! \n",
"\n",
"For more information, check out our documentation pages in the `docs/` folder, and if you have questions, please raise them in GitHub issues, or in #lm-thunderdome or #release-discussion on the EleutherAI discord server."
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [
"zAov81vTbL2K"
],
"gpuType": "T4",
"provenance": []
"aed3acd2f2d74003b44079c333a0698e": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "DescriptionStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
"bfa0af6aeff344c6845e1080a878e92e": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "FloatProgressModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_7c5689bc13684db8a22681f41863dddd",
"max": 5669,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_48763b6233374554ae76035c0483066f",
"value": 5669
}
},
"language_info": {
"name": "python"
"f61ed33fad754146bdd2ac9db1ba1c48": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HTMLModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_a73f357065d34d7baf0453ae4a8d75e2",
"placeholder": "​",
"style": "IPY_MODEL_46f521b73fd943c081c648fd873ebc0a",
"value": "Downloading builder script: 100%"
}
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"46f521b73fd943c081c648fd873ebc0a": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "DescriptionStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"48763b6233374554ae76035c0483066f": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "ProgressStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"4986a21eb560448fa79f4b25cde48951": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"6b2d90209ec14230b3d58a74ac9b83bf": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"7c5689bc13684db8a22681f41863dddd": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"a1d3a8aa016544a78e8821c8f6199e06": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HBoxModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_f61ed33fad754146bdd2ac9db1ba1c48",
"IPY_MODEL_bfa0af6aeff344c6845e1080a878e92e",
"IPY_MODEL_fd1ad9e0367d4004aae853b91c3a7617"
],
"layout": "IPY_MODEL_6b2d90209ec14230b3d58a74ac9b83bf"
}
},
"a73f357065d34d7baf0453ae4a8d75e2": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"aed3acd2f2d74003b44079c333a0698e": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "DescriptionStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"bfa0af6aeff344c6845e1080a878e92e": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "FloatProgressModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_7c5689bc13684db8a22681f41863dddd",
"max": 5669,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_48763b6233374554ae76035c0483066f",
"value": 5669
}
},
"f61ed33fad754146bdd2ac9db1ba1c48": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HTMLModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_a73f357065d34d7baf0453ae4a8d75e2",
"placeholder": "​",
"style": "IPY_MODEL_46f521b73fd943c081c648fd873ebc0a",
"value": "Downloading builder script: 100%"
}
},
"fd1ad9e0367d4004aae853b91c3a7617": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HTMLModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_4986a21eb560448fa79f4b25cde48951",
"placeholder": "​",
"style": "IPY_MODEL_aed3acd2f2d74003b44079c333a0698e",
"value": " 5.67k/5.67k [00:00&lt;00:00, 205kB/s]"
}
}
}
"fd1ad9e0367d4004aae853b91c3a7617": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HTMLModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_4986a21eb560448fa79f4b25cde48951",
"placeholder": "​",
"style": "IPY_MODEL_aed3acd2f2d74003b44079c333a0698e",
"value": " 5.67k/5.67k [00:00&lt;00:00, 205kB/s]"
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}
......@@ -68,6 +68,7 @@
"source": [
"import wandb\n",
"\n",
"\n",
"wandb.login()"
]
},
......@@ -110,13 +111,15 @@
"cell_type": "markdown",
"id": "e974cabdbe70b667",
"metadata": {},
"source": ""
"source": []
},
{
"cell_type": "markdown",
"id": "5178ca9445b844e4",
"metadata": {},
"source": "W&B can also be initialized programmatically for use outside the CLI to parse and log the results."
"source": [
"W&B can also be initialized programmatically for use outside the CLI to parse and log the results."
]
},
{
"cell_type": "code",
......@@ -126,7 +129,8 @@
"outputs": [],
"source": [
"import lm_eval\n",
"from lm_eval.logging_utils import WandbLogger\n",
"from lm_eval.loggers import WandbLogger\n",
"\n",
"\n",
"results = lm_eval.simple_evaluate(\n",
" model=\"hf\",\n",
......
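The notebook cell above is truncated at the `simple_evaluate(` call. A minimal sketch of how the programmatic W&B logging flow typically continues is shown below; the model arguments, task choice, sample limit, and project name are illustrative assumptions, not the notebook's exact values.

import lm_eval
from lm_eval.loggers import WandbLogger

# Run a small evaluation; model_args, tasks and limit are placeholder values.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",  # assumed small model for illustration
    tasks=["hellaswag"],
    limit=10,
    log_samples=True,
)

# Hand the results dict to the W&B logger and push metrics plus per-sample logs.
wandb_logger = WandbLogger(project="lm-eval-harness-integration", job_type="eval")  # project name assumed
wandb_logger.post_init(results)
wandb_logger.log_eval_result()
wandb_logger.log_eval_samples(results["samples"])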
......@@ -73,7 +73,7 @@ def setup_parser() -> argparse.ArgumentParser:
default=None,
type=str,
metavar="task1,task2",
help="To get full list of tasks, use the command lm-eval --tasks list",
help="Comma-separated list of task names or task groupings to evaluate on.\nTo get full list of tasks, use one of the commands `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above",
)
parser.add_argument(
"--model_args",
......@@ -170,9 +170,16 @@ def setup_parser() -> argparse.ArgumentParser:
)
parser.add_argument(
"--apply_chat_template",
action="store_true",
type=str,
nargs="?",
const=True,
default=False,
help="If True, applies the chat template to the prompt",
help=(
"If True, apply chat template to the prompt. "
"Providing `--apply_chat_template` without an argument will apply the default chat template to the prompt. "
"To apply a specific template from the available list of templates, provide the template name as an argument. "
"E.g. `--apply_chat_template template_name`"
),
)
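Since `--apply_chat_template` now uses argparse's optional-argument pattern (`nargs="?"` with `const=True`), a small self-contained sketch of how that pattern parses may help; the parser below mirrors only this one flag, and `chatml` is a hypothetical template name.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--apply_chat_template",
    type=str,
    nargs="?",
    const=True,     # value used when the flag is given with no argument
    default=False,  # value used when the flag is absent
)

print(parser.parse_args([]).apply_chat_template)                                   # False
print(parser.parse_args(["--apply_chat_template"]).apply_chat_template)            # True
print(parser.parse_args(["--apply_chat_template", "chatml"]).apply_chat_template)  # "chatml"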
parser.add_argument(
"--fewshot_as_multiturn",
......@@ -289,14 +296,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
if args.fewshot_as_multiturn and args.apply_chat_template is False:
raise ValueError(
"If fewshot_as_multiturn is set, apply_chat_template must be set to True."
)
if (
args.num_fewshot is None or args.num_fewshot == 0
) and args.fewshot_as_multiturn:
raise ValueError(
"If fewshot_as_multiturn is set, num_fewshot must be greater than 0."
"When `fewshot_as_multiturn` is selected, `apply_chat_template` must be set (either to `True` or to the chosen template name)."
)
if args.include_path is not None:
......@@ -318,9 +318,16 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
eval_logger.error("Need to specify task to evaluate.")
sys.exit()
elif args.tasks == "list":
eval_logger.info(
"Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks))
)
print(task_manager.list_all_tasks())
sys.exit()
elif args.tasks == "list_groups":
print(task_manager.list_all_tasks(list_subtasks=False, list_tags=False))
sys.exit()
elif args.tasks == "list_tags":
print(task_manager.list_all_tasks(list_groups=False, list_subtasks=False))
sys.exit()
elif args.tasks == "list_subtasks":
print(task_manager.list_all_tasks(list_groups=False, list_tags=False))
sys.exit()
else:
if os.path.isdir(args.tasks):
......@@ -349,16 +356,22 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
)
raise ValueError(
f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues."
f"Tasks not found: {missing}. Try `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above, or pass '--verbosity DEBUG' to troubleshoot task registration issues."
)
# Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
if args.trust_remote_code:
os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = str(args.trust_remote_code)
args.model_args = (
args.model_args
+ f",trust_remote_code={os.environ['HF_DATASETS_TRUST_REMOTE_CODE']}"
eval_logger.info(
"Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true`"
)
# HACK: import datasets and override its HF_DATASETS_TRUST_REMOTE_CODE value internally,
# because it's already been determined based on the prior env var before launching our
# script--`datasets` gets imported by lm_eval internally before these lines can update the env.
import datasets
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
args.model_args = args.model_args + ",trust_remote_code=True"
eval_logger.info(f"Selected Tasks: {task_names}")
......
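The `--tasks list*` branches above delegate to `TaskManager.list_all_tasks`. For reference, a hedged sketch of the equivalent programmatic calls, assuming `TaskManager` is importable from `lm_eval.tasks`:

from lm_eval.tasks import TaskManager

task_manager = TaskManager()
# Everything: groups, tags and (sub)tasks, mirroring `lm-eval --tasks list`.
print(task_manager.list_all_tasks())
# Groups only, mirroring `lm-eval --tasks list_groups`.
print(task_manager.list_all_tasks(list_subtasks=False, list_tags=False))
# Tags only, mirroring `lm-eval --tasks list_tags`.
print(task_manager.list_all_tasks(list_groups=False, list_subtasks=False))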
import abc
from dataclasses import asdict, dataclass
from inspect import getsource
from typing import Any, Callable, List, Optional, Union
@dataclass
class AggMetricConfig(dict):
metric: Optional[str] = None
aggregation: Optional[str] = "mean"
weight_by_size: Optional[bool] = False
# list of filter names which should be incorporated into the aggregated metric.
filter_list: Optional[Union[str, list]] = "none"
def __post_init__(self):
if self.aggregation != "mean" and not callable(self.aggregation):
raise ValueError(
f"Currently, 'mean' is the only pre-defined aggregation across groups' subtasks. Got '{self.aggregation}'."
)
if isinstance(self.filter_list, str):
self.filter_list = [self.filter_list]
@dataclass
class GroupConfig(dict):
group: Optional[str] = None
group_alias: Optional[str] = None
task: Optional[Union[str, list]] = None
aggregate_metric_list: Optional[
Union[List[AggMetricConfig], AggMetricConfig, dict]
] = None
metadata: Optional[dict] = (
None # by default, not used in the code. allows for users to pass arbitrary info to tasks
)
def __getitem__(self, item):
return getattr(self, item)
def __setitem__(self, item, value):
return setattr(self, item, value)
def __post_init__(self):
if self.aggregate_metric_list is not None:
if isinstance(self.aggregate_metric_list, dict):
self.aggregate_metric_list = [self.aggregate_metric_list]
self.aggregate_metric_list = [
AggMetricConfig(**item) if isinstance(item, dict) else item
for item in self.aggregate_metric_list
]
def to_dict(self, keep_callable: bool = False) -> dict:
"""dumps the current config as a dictionary object, as a printable format.
null fields will not be printed.
Used for dumping results alongside full task configuration
:return: dict
A printable dictionary version of the GroupConfig object.
# TODO: should any default value in the GroupConfig not be printed?
"""
cfg_dict = asdict(self)
# serialize callable fields (e.g. custom aggregation functions) so they can be printed
for k, v in list(cfg_dict.items()):
if callable(v):
cfg_dict[k] = self.serialize_function(v, keep_callable=keep_callable)
return cfg_dict
def serialize_function(
self, value: Union[Callable, str], keep_callable=False
) -> Union[Callable, str]:
"""Serializes a given function or string.
If 'keep_callable' is True, the original callable is returned.
Otherwise, attempts to return the source code of the callable using 'getsource'.
"""
if keep_callable:
return value
else:
try:
return getsource(value)
except (TypeError, OSError):
return str(value)
class ConfigurableGroup(abc.ABC):
def __init__(
self,
config: Optional[dict] = None,
) -> None:
self._config = GroupConfig(**config)
@property
def group(self):
return self._config.group
@property
def group_alias(self):
return self._config.group_alias
@property
def version(self):
return self._config.version
@property
def config(self):
return self._config.to_dict()
@property
def group_name(self) -> Any:
return self._config.group
def __repr__(self):
return (
f"ConfigurableGroup(group={self.group}," f"group_alias={self.group_alias})"
)
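A brief usage sketch for the classes above, showing how a plain dict is normalized by `__post_init__`; the group and task names are hypothetical.

group = ConfigurableGroup(
    config={
        "group": "my_group",           # hypothetical group name
        "task": ["task_a", "task_b"],  # hypothetical subtask names
        "aggregate_metric_list": {"metric": "acc", "weight_by_size": True},
    }
)
print(group)
# ConfigurableGroup(group=my_group,group_alias=None)
print(group.config["aggregate_metric_list"])
# [{'metric': 'acc', 'aggregation': 'mean', 'weight_by_size': True, 'filter_list': ['none']}]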
import logging
import math
import random
import re
import string
from collections.abc import Iterable
from typing import List
import evaluate as hf_evaluate
import numpy as np
import sacrebleu
import sklearn.metrics
from lm_eval.api.registry import register_aggregation, register_metric
......@@ -50,21 +50,24 @@ def bits_per_byte(items):
@register_aggregation("f1")
def f1_score(items):
from sklearn.metrics import f1_score
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = sklearn.metrics.f1_score(golds, preds)
fscore = f1_score(golds, preds)
return np.max(fscore)
@register_aggregation("matthews_corrcoef")
def matthews_corrcoef(items):
from sklearn.metrics import matthews_corrcoef
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
# print(preds)
return sklearn.metrics.matthews_corrcoef(golds, preds)
return matthews_corrcoef(golds, preds)
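Both aggregations above consume the same input shape: a list of (gold, pred) pairs that gets unzipped into parallel label sequences. A toy illustration with made-up labels:

# Toy data: four (gold, pred) pairs.
items = [(1, 1), (0, 0), (1, 0), (0, 0)]
golds, preds = zip(*items)  # golds = (1, 0, 1, 0), preds = (1, 0, 0, 0)
# f1_score(items) reduces to sklearn.metrics.f1_score(golds, preds) ~= 0.667,
# and matthews_corrcoef(items) to sklearn.metrics.matthews_corrcoef(golds, preds).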
@register_aggregation("bleu")
......@@ -166,7 +169,60 @@ def acc_mutual_info_fn(items): # This is a passthrough function
return items
exact_match = hf_evaluate.load("exact_match")
### the code used in the `exact_match_hf_evaluate` function is ported from
### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py
### which is under the apache license.
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def exact_match_hf_evaluate(
predictions,
references,
regexes_to_ignore=None,
ignore_case=False,
ignore_punctuation=False,
ignore_numbers=False,
):
if regexes_to_ignore is not None:
for s in regexes_to_ignore:
predictions = np.array([re.sub(s, "", x) for x in predictions])
references = np.array([re.sub(s, "", x) for x in references])
else:
predictions = np.asarray(predictions)
references = np.asarray(references)
if ignore_case:
predictions = np.char.lower(predictions)
references = np.char.lower(references)
if ignore_punctuation:
repl_table = string.punctuation.maketrans("", "", string.punctuation)
predictions = np.char.translate(predictions, table=repl_table)
references = np.char.translate(references, table=repl_table)
if ignore_numbers:
repl_table = string.digits.maketrans("", "", string.digits)
predictions = np.char.translate(predictions, table=repl_table)
references = np.char.translate(references, table=repl_table)
score_list = predictions == references
return {"exact_match": np.mean(score_list)}
###
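A quick illustration of the ported helper on toy strings (the values are made up for the example):

preds = ["The Cat", "answer: 42", "hello!"]
refs = ["the cat", "42", "hello"]

# Strict comparison: no pair matches exactly.
print(exact_match_hf_evaluate(preds, refs))  # exact_match == 0.0

# After stripping the "answer: " prefix, lowercasing and dropping punctuation,
# every pair matches.
print(
    exact_match_hf_evaluate(
        preds,
        refs,
        regexes_to_ignore=[r"answer: "],
        ignore_case=True,
        ignore_punctuation=True,
    )
)  # exact_match == 1.0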
@register_metric(
......@@ -176,7 +232,7 @@ exact_match = hf_evaluate.load("exact_match")
aggregation="mean",
)
def exact_match_fn(**kwargs):
return exact_match.compute(**kwargs)
return exact_match_hf_evaluate(**kwargs)
@register_metric(
......