Commit 0348ed97 authored by lintangsutawika

merged latest update

parents 451a1873 6769119f
......@@ -3,10 +3,10 @@ name: Tasks Modified
on:
push:
branches:
- big-refactor
- 'big-refactor*'
pull_request:
branches:
- big-refactor
- 'big-refactor*'
workflow_dispatch:
# comment/edit out the above to stop/change the triggers
jobs:
......@@ -18,7 +18,7 @@ jobs:
- name: checkout
uses: actions/checkout@v3
with:
fetch-depth: 0 # OR "2" -> To retrieve the preceding commit.
fetch-depth: 2 # OR "2" -> To retrieve the preceding commit.
# Uses the tj-actions/changed-files@v37 action to check for changes.
# Outputs provided here: https://github.com/tj-actions/changed-files#outputs
......@@ -51,6 +51,7 @@ jobs:
with:
python-version: 3.9
cache: 'pip'
cache-dependency-path: setup.py
- name: Install dependencies
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
run: |
......@@ -62,10 +63,10 @@ jobs:
- name: Test with pytest
# if new tasks are added, run tests on them
if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
run: python -m pytest tests/test_tasks.py -s -vv
# if api is modified, run tests on it
- name: Test more tasks with pytest
env:
API: true
if: steps.changed-tasks.outputs.api_any_modified == 'true'
run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
run: python -m pytest tests/test_tasks.py -s -vv
......@@ -22,11 +22,12 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.9
- name: Set up Python 3.8
uses: actions/setup-python@v4
with:
python-version: 3.9
cache: 'pip'
python-version: 3.8
cache: pip
cache-dependency-path: setup.py
- name: Install dependencies
run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Pre-Commit
......@@ -39,29 +40,38 @@ jobs:
flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
# mypy turned off for now
# # mypy turned off for now
# - name: Lint with mypy
# run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
# Job 2
# Job 2
testcpu:
name: CPU Tests
runs-on: ubuntu-latest
timeout-minutes: 20
strategy:
matrix:
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
timeout-minutes: 30
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.9
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: 3.9
cache: 'pip'
python-version: ${{ matrix.python-version }}
cache: pip
cache-dependency-path: setup.py
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
# Install optional git dependencies
# Install optional git dependencies
# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Test with pytest
run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/tests_master --ignore=tests/extra
- name: Archive artifacts
uses: actions/upload-artifact@v3
with:
name: output_results
path: |
test_logs/*
......@@ -33,13 +33,19 @@ repos:
rev: 22.3.0
hooks:
- id: black
language_version: python3.9
language_version: python3.8
- repo: https://github.com/codespell-project/codespell
rev: v2.1.0
hooks:
- id: codespell
exclude: >
(?x)^(
.*\.json|ignore.txt
.*\.json|ignore.txt|lm_eval/tasks/.*|.*yaml
)$
args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.5.1
hooks:
- id: mypy
additional_dependencies: [".[sentencepiece,multilingual,promptsource,gptq]", "types-PyYAML", "types-requests"]
exclude: ^tests/.*$
......@@ -9,8 +9,8 @@ We’d like your help to test it out! You can help by:
2. Porting tasks supported in the previous version of the harness to the new YAML configuration format. Please check out our [task implementation guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md) for more information.
If you choose to port a task not yet completed according to [our checklist](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/README.md), then you can contribute it by opening a PR containing [Refactor] in the name with:
- A command of the form `python main.py --model hf --model_args ..... --tasks <task name> ...` which will run the task in the `master` branch, and what the score is
- A command of the form `python main.py --model hf --model_args ..... --tasks <task name> ...` to run the task in your PR branch to `big-refactor`, and what the resulting score is, to show that we achieve equality between the two implementations.
- A command of the form `python -m lm_eval --model hf --model_args ..... --tasks <task name> ...` which will run the task in the `master` branch, and what the score is
- A command of the form `python -m lm_eval --model hf --model_args ..... --tasks <task name> ...` to run the task in your PR branch to `big-refactor`, and what the resulting score is, to show that we achieve equality between the two implementations.
Lastly, we'll no longer be accepting new feature requests beyond those that are already open to the master branch as we carry out this switch to the new version over the next week, though we will be accepting bugfixes to `master` branch and PRs to `big-refactor`. Feel free to reach out in the #lm-thunderdome channel of the EAI discord for more information.
......@@ -67,7 +67,7 @@ To evaluate a model hosted on the [HuggingFace Hub](https://huggingface.co/model
```bash
python main.py \
python -m lm_eval \
--model hf \
--model_args pretrained=EleutherAI/gpt-j-6B \
--tasks hellaswag \
......@@ -78,7 +78,7 @@ python main.py \
Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partially trained checkpoints, or to specify the datatype for running a model:
```bash
python main.py \
python -m lm_eval \
--model hf \
--model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \
--tasks lambada_openai,hellaswag \
......@@ -91,7 +91,7 @@ Models that are loaded via either `transformers.AutoModelForCausalLM` (autoregre
Batch size selection can be automated by setting the ```--batch_size``` flag to ```auto```. This will perform automatic detection of the largest batch size that will fit on your device. On tasks where there is a large difference between the longest and shortest example, it can be helpful to periodically recompute the largest batch size, to gain a further speedup. To do this, append ```:N``` to the above flag to automatically recompute the largest batch size ```N``` times. For example, to recompute the batch size 4 times, the command would be:
```bash
python main.py \
python -m lm_eval \
--model hf \
--model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \
--tasks lambada_openai,hellaswag \
......@@ -99,7 +99,7 @@ python main.py \
--batch_size auto:4
```
Alternatively, you can use `lm-eval` instead of `python main.py` to call lm eval from anywhere.
Alternatively, you can use `lm-eval` or `lm_eval` instead of `python -m lm_eval` to call lm eval from anywhere.
### Multi-GPU Evaluation with Hugging Face `accelerate`
......@@ -108,7 +108,7 @@ To parallelize evaluation of HuggingFace models across multiple GPUs, we allow f
The first is performed by launching evaluation via the `accelerate` library as follows:
```
accelerate launch main.py \
accelerate launch -m lm_eval \
--model hf \
--tasks lambada_openai,arc_easy \
--batch_size 16 \
......@@ -116,10 +116,12 @@ accelerate launch main.py \
This will perform *data-parallel evaluation*: that is, placing a **single full copy** of your model onto each available GPU and *splitting batches across GPUs* to evaluate on K GPUs K times faster than on one.
However, if your model *is too large to be run on a single one of your GPUs*, then we provide an alternative method to run these large models: use of the `parallelize` argument.
If your model *is too large to be run on a single one of your GPUs*, then you can use `accelerate` with Fully Sharded Data Parallel (FSDP), which splits the weights of the model across your data parallel ranks. To enable this, ensure you select `YES` when asked ```Do you want to use FullyShardedDataParallel?``` when running `accelerate config`. To enable memory-efficient loading, select `YES` when asked `Do you want each individually wrapped FSDP unit to broadcast module parameters from rank 0 at the start?`. This ensures that only the rank 0 process loads the model and then broadcasts the parameters to the other ranks, instead of having each rank load all parameters, which can lead to large RAM usage spikes around the start of the script and may cause errors.
We also provide a second method to run these large models: use of the `parallelize` argument.
```
python main.py \
python -m lm_eval \
--model hf \
--model_args pretrained=EleutherAI/pythia-12b,parallelize=True
--tasks lambada_openai,arc_easy \
......@@ -132,9 +134,9 @@ To pass even more advanced keyword arguments to `accelerate`, we allow for the f
- `max_cpu_memory`: the max amount of CPU memory to use when offloading the model weights to RAM.
- `offload_folder`: a folder where model weights will be offloaded to disk if needed.
Using this setting helps for massive models like BLOOM which require, or to avoid exceeding your total system RAM (by default, with `accelerate launch` one copy of the model for each GPU is initialized in RAM before moving it to GPU, resulting in large RAM usage spikes around the start of the script that may cause errors such as `Killed`.) However, it naively splits models across GPUs, resulting in only a single GPU performing work at any point in time, and so is much slower than launching with `accelerate launch`, possibly by a factor of the total # of GPUs.
Note that this method naively splits models across GPUs, resulting in only a single GPU performing work at any point in time, and so is much slower than launching with `accelerate launch`, possibly by a factor of the total # of GPUs.
**Note that this option requires launching evaluation via `python main.py` rather than `accelerate launch main.py`.**
**Note that this option requires launching evaluation via `python -m lm_eval` rather than `accelerate launch -m lm_eval`.**
To use `accelerate` with the `lm-eval` command, use
```
......@@ -165,7 +167,7 @@ Our library supports language models served via the OpenAI Completions API as fo
```bash
export OPENAI_API_SECRET_KEY=YOUR_KEY_HERE
python main.py \
python -m lm_eval \
--model openai-completions \
--model_args engine=davinci \
--tasks lambada_openai,hellaswag
......@@ -196,7 +198,7 @@ This will write out one text file for each task.
To verify the data integrity of the tasks you're performing in addition to running the tasks themselves, you can use the `--check_integrity` flag:
```bash
python main.py \
python -m lm_eval \
--model openai \
--model_args engine=davinci \
--tasks lambada_openai,hellaswag \
......@@ -207,7 +209,7 @@ python main.py \
For models loaded with the HuggingFace `transformers` library, any arguments provided via `--model_args` get passed to the relevant constructor directly. This means that anything you can do with `AutoModel` can be done with our library. For example, you can pass a local path via `pretrained=` or use models finetuned with [PEFT](https://github.com/huggingface/peft) by taking the call you would run to evaluate the base model and add `,peft=PATH` to the `model_args` argument:
```bash
python main.py \
python -m lm_eval \
--model hf \
--model_args pretrained=EleutherAI/gpt-j-6b,parallelize=True,load_in_4bit=True,peft=nomic-ai/gpt4all-j-lora \
--tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \
......@@ -217,7 +219,7 @@ python main.py \
[GPTQ](https://github.com/PanQiWei/AutoGPTQ) quantized models can be loaded by specifying their file names in `,gptq=NAME` (or `,gptq=True` for default names) in the `model_args` argument:
```bash
python main.py \
python -m lm_eval \
--model hf \
--model_args pretrained=model-name-or-path,gptq=model.safetensors,gptq_use_triton=True \
--tasks hellaswag
......
......@@ -4,6 +4,7 @@ Welcome to the docs for the LM Evaluation Harness!
## Table of Contents
* To learn about the public interface of the library, as well as how to evaluate via the commandline or as integrated into an external library, see the [Interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/user_guide.md)
* To learn how to add a new library, API, or model type to the library, as well as a quick explainer on the types of ways to evaluate an LM, see the [Model Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/model_guide.md).
* For a crash course on adding new tasks to the library, see our [New Task Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md).
* To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Advanced Task Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/advanced_task_guide.md).
......
......@@ -2,11 +2,11 @@
## Usage
Simply add a "--decontamination_ngrams_path" when running main.py. The provided directory should contain
Simply add a "--decontamination_ngrams_path" when running \__main\__.py. The provided directory should contain
the ngram files and info.json produced in "Pile Ngram Generation" further down.
```bash
python main.py \
python -m lm_eval \
--model gpt2 \
--device 0 \
--tasks sciq \
......
# User Guide
This document details the interface exposed by `lm-eval` and provides details on what flags are available to users.
## Command-line Interface
A majority of users run the library by cloning it from Github, installing the package as editable, and running the `python -m lm_eval` script.
Equivalently, running the library can be done via the `lm-eval` entrypoint at the command line.
This mode supports a number of command-line arguments, the details of which can also be seen by running with `-h` or `--help`:
* `--model` : Selects which model type or provider is evaluated. Must be a string corresponding to the name of the model type/provider being used. See [the main README](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor#commercial-apis) for a full list of enabled model names and supported libraries or APIs.
* `--model_args` : Controls parameters passed to the model constructor. Accepts a string containing comma-separated keyword arguments to the model class of the format `"arg1=val1,arg2=val2,..."`, such as, for example, `--model_args pretrained=EleutherAI/pythia-160m,dtype=float32`. For a full list of supported keyword arguments, see the initialization of the corresponding `lm_eval.api.model.LM` subclass, e.g. [`HFLM`](https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/models/huggingface.py#L66)
* `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names. Must be solely comprised of valid tasks/groups.
* `--num_fewshot` : Sets the number of few-shot examples to place in context. Must be an integer.
* `--batch_size` : Sets the batch size used for evaluation. Can be a positive integer or `"auto"` to automatically select the largest batch size that will fit in memory, speeding up evaluation. One can pass `--batch_size auto:N` to re-select the maximum batch size `N` times during evaluation. This can help accelerate evaluation further, since `lm-eval` sorts documents in descending order of context length.
* `--max_batch_size` : Sets the maximum batch size to try to fit in memory, if `--batch_size auto` is passed.
* `--device` : Sets which device to place the model onto. Must be a string, for example, `"cuda", "cuda:0", "cpu", "mps"`. Defaults to "cuda", and can be ignored if running multi-GPU or running a non-local model type.
* `--output_path` : A string of the form `dir/file.jsonl` or `dir/`. Provides a path where high-level results will be saved, either into the file named or into the directory named. If `--log_samples` is passed as well, then per-document outputs and metrics will be saved into the directory as well.
* `--log_samples` : If this flag is passed, then the model's outputs, and the text fed into the model, will be saved at per-document granularity. Must be used with `--output_path`.
* `--limit` : Accepts an integer, or a float between 0.0 and 1.0 . If passed, will limit the number of documents to evaluate to the first X documents (if an integer) per task or first X% of documents per task. Useful for debugging, especially on costly API models.
* `--use_cache` : Should be a path where a sqlite db file can be written to. Takes a string of format `/path/to/sqlite_cache_` in order to create a cache db at `/path/to/sqlite_cache_rank{i}.db` for each process (0-NUM_GPUS). This allows results of prior runs to be cached, so that a given (model, task) pair does not need to be re-run in order to re-score or re-examine its results.
* `--decontamination_ngrams_path` : Deprecated, see [this commit](https://github.com/EleutherAI/lm-evaluation-harness/commit/00209e10f6e27edf5d766145afaf894079b5fe10) or older for a working decontamination-checker tool.
* `--check_integrity` : If this flag is used, the library tests for each task selected are run to confirm task integrity.
* `--write_out` : Used for diagnostic purposes to observe the format of task documents passed to a model. If this flag is used, then prints the prompt and gold target string for the first document of each task.
* `--show_config` : If used, prints the full `lm_eval.api.task.TaskConfig` contents (non-default settings of the task's YAML file) for each task which was run, at the completion of an evaluation. Useful when one is modifying a task's configuration YAML locally and wants to record the exact configuration used, for debugging or for reproducibility purposes.
* `--include_path` : Accepts a path to a folder. If passed, then all YAML files containing `lm-eval`-compatible task configurations in that folder will be added to the task registry as available tasks. Used when one is writing config files for their own task in a folder other than `lm_eval/tasks/`.
## External Library Usage
We also support using the library's external API for use within model training loops or other scripts.
`lm_eval` supplies two functions for external import and use: `lm_eval.evaluate()` and `lm_eval.simple_evaluate()`.
`simple_evaluate()` can be used by simply creating an `lm_eval.api.model.LM` subclass that implements the methods described in the [Model Guide](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor/docs/model_guide.md), and wrapping your custom model in that class as follows:
```python
import lm_eval
...
my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code)
...
lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.greedy_until()`
results = lm_eval.simple_evaluate( # call simple_evaluate
model=lm_obj,
tasks=["taskname1", "taskname2"],
num_fewshot=0,
...
)
```
See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L35 for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously.
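For example, here is a minimal hedged sketch that continues the snippet above (re-using `lm_obj`) and passes CLI-equivalent options as keyword arguments; names such as `limit` are assumed to mirror the flags listed earlier:
```python
# Hedged sketch, continuing the example above: keyword arguments are assumed to
# mirror the CLI flags, e.g. `limit` behaves like `--limit`.
import lm_eval

results = lm_eval.simple_evaluate(
    model=lm_obj,
    tasks=["taskname1"],
    num_fewshot=0,
    limit=10,  # only the first 10 documents per task, handy for smoke tests
)
```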
Additionally, the `evaluate()` function offers the core evaluation functionality provided by the library, but without some of the special handling and simplification + abstraction provided by `simple_evaluate()`.
See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L173 for more details.
As a brief example usage of `evaluate()`:
```python
import lm_eval
from my_tasks import MyTask1 # suppose you've defined a custom lm_eval.api.Task subclass in your own external codebase
...
my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code)
...
lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.greedy_until()`
results = lm_eval.evaluate(
    lm=lm_obj,
    task_dict={"mytask1": MyTask1},
    ...
)
```
......@@ -70,9 +70,9 @@ smth smth tokenizer-agnostic
Congrats on implementing your model! Now it's time to test it out.
To make your model usable via the command line interface to `lm-eval` using `main.py`, you'll need to tell `lm-eval` what your model's name is.
To make your model usable via the command line interface to `lm-eval` using `python -m lm_eval`, you'll need to tell `lm-eval` what your model's name is.
This is done via a *decorator*, `lm_eval.api.registry.register_model`. Using `register_model()`, one can both tell the package what the model's name(s) to be used are when invoking it with `python main.py --model <name>` and alert `lm-eval` to the model's existence.
This is done via a *decorator*, `lm_eval.api.registry.register_model`. Using `register_model()`, one can both tell the package what the model's name(s) to be used are when invoking it with `python -m lm_eval --model <name>` and alert `lm-eval` to the model's existence.
```python
from lm_eval.api.registry import register_model
......
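# (Hedged sketch continuing the truncated snippet above; `MyCustomLM` and "mymodel"
#  are placeholder names chosen for illustration, not identifiers defined by the library.)
from lm_eval.api.model import LM

@register_model("mymodel")  # afterwards selectable via `python -m lm_eval --model mymodel`
class MyCustomLM(LM):
    # implement loglikelihood(), loglikelihood_rolling(), greedy_until() here
    ...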
......@@ -214,7 +214,7 @@ metric_list:
```
`aggregation` and `higher_is_better` can optionally be left out to default to the manually-set defaults if using a natively supported metric, otherwise it must be defined explicitly (for example, when using a custom metric implemented as a function).
For a full list of natively supported metrics and aggregation functions see `docs/advanced_task_guide.md`. All metrics supported in [HuggingFace Evaluate](https://github.com/huggingface/evaluate/tree/main/metrics) can also be used, and will be loaded if a given metric name is not one natively supported in `lm-eval`.
For a full list of natively supported metrics and aggregation functions see `docs/advanced_task_guide.md`. All metrics supported in [HuggingFace Evaluate](https://github.com/huggingface/evaluate/tree/main/metrics) can also be used, and will be loaded if a given metric name is not one natively supported in `lm-eval` or `hf_evaluate` is set to `true`.
### Optional, More Advanced Setup
......@@ -258,7 +258,7 @@ You can do this via adding the Python snippet
from lm_eval.tasks import include_task_folder
include_task_folder("/path/to/yaml/parent/folder")
```
to the top of any Python file that is run or imported when performing evaluation, such as `main.py`.
to the top of any Python file that is run or imported when performing evaluation, such as `__main__.py`.
Passing `--tasks /path/to/yaml/file` is also accepted.
......
from .evaluator import evaluate, simple_evaluate
......@@ -9,23 +9,25 @@ from pathlib import Path
from lm_eval import evaluator, utils
from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger
from lm_eval.tasks import include_task_folder
from lm_eval.logger import eval_logger, SPACING
from lm_eval.tasks import include_path
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from typing import Union
def parse_args():
parser = argparse.ArgumentParser()
def parse_eval_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("--model", required=True, help="Name of model e.g. `hf`")
parser.add_argument(
"--tasks",
default=None,
help="Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))),
)
parser.add_argument(
"--model_args",
default="",
help="String arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
)
parser.add_argument(
"--tasks", default=None # , choices=utils.MultiChoice(sorted(ALL_TASKS))
)
parser.add_argument(
"--num_fewshot",
type=int,
......@@ -98,8 +100,13 @@ def parse_args():
return parser.parse_args()
def main():
args = parse_args()
def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
if not args:
# we allow for args to be passed externally, else we parse them ourselves
args = parse_eval_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
if args.limit:
eval_logger.warning(
......@@ -109,7 +116,7 @@ def main():
if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}")
include_task_folder(args.include_path)
include_path(args.include_path)
if args.tasks is None:
task_names = ALL_TASKS
......@@ -125,11 +132,20 @@ def main():
else:
tasks_list = args.tasks.split(",")
task_names = utils.pattern_match(tasks_list, ALL_TASKS)
task_missing = []
        for task in [task for task in tasks_list if task not in task_names]:
            if os.path.isfile(task):
                config = utils.load_yaml_config(task)
                task_names.append(config)
            else:
                task_missing.append(task)
if task_missing != []:
missing = ", ".join(task_missing)
eval_logger.error(
f"Tasks were not found: {missing}\n"
f"{SPACING}Try `lm-eval -h` for list of available tasks",
)
raise ValueError(f"Tasks {missing} were not found.")
if args.output_path:
path = Path(args.output_path)
# check if file or 'dir/results.json' exists
......@@ -195,9 +211,9 @@ def main():
f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
)
print(evaluator.make_table(results))
if "aggregate" in results:
print(evaluator.make_table(results, "aggregate"))
if "groups" in results:
print(evaluator.make_table(results, "groups"))
if __name__ == "__main__":
main()
cli_evaluate()
......@@ -2,6 +2,7 @@ from dataclasses import dataclass
from typing import List
from lm_eval.api.instance import Instance
from datasets import Dataset
class Filter:
......@@ -13,12 +14,12 @@ class Filter:
"""
def __init__(self, *args, **kwargs):
def __init__(self, *args, **kwargs) -> None:
"""
Can define custom behavior here, if an individual instantiation of a Filter class should have state.
"""
def apply(self, resps):
def apply(self, resps, docs):
"""
Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects.
Should return the list of (filtered) response lists *in the same order as they were input*, e.g.
......@@ -40,14 +41,14 @@ class FilterEnsemble:
name: str
filters: List[Filter]
def apply(self, instances: List[Instance]):
def apply(self, instances: List[Instance], docs: List[Dataset]) -> None:
resps = [
inst.resps for inst in instances
] # operate just on the model responses
for f in self.filters:
# apply filters in sequence
resps = f.apply(resps)
resps = f.apply(resps, docs)
# add the end results after filtering to filtered_requests of their respective source instances.
# has key `self.name`: each FilterEnsemble applied in a given run should use a different name.
......
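To illustrate the updated `apply(resps, docs)` signature above, here is a minimal hedged sketch of a custom filter; `TakeFirstFilter` is an invented example name, and the `lm_eval.api.filter` import path is assumed from the file being modified:
```python
from lm_eval.api.filter import Filter


class TakeFirstFilter(Filter):
    """Hypothetical filter: keep only the first response for each document."""

    def apply(self, resps, docs):
        # resps holds one list of model responses per document; docs carries the
        # corresponding documents for doc-aware filtering (unused in this sketch).
        return [r[:1] for r in resps]
```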
......@@ -19,7 +19,7 @@ class Instance:
doc_id: str = None
repeats: str = None
def __post_init__(self):
def __post_init__(self) -> None:
# unpack metadata field
self.task_name, self.doc_id, self.repeats = self.metadata
......
......@@ -56,6 +56,55 @@ def matthews_corrcoef(items):
return sklearn.metrics.matthews_corrcoef(golds, preds)
@register_aggregation("bleu")
def bleu(items):
"""The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
for evaluating a generated sentence to a reference sentence. It counts matching
n-grams in the candidate translation to n-grams in the reference text, where
1-gram or unigram would be each token and a bigram comparison would be each
word pair. The comparison is made regardless of word order
Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
Paper: https://www.aclweb.org/anthology/P02-1040/
Higher is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_bleu(preds, refs).score
@register_aggregation("chrf")
def chrf(items):
"""chrF++ is a tool for automatic evaluation of machine translation output
based on character n-gram precision and recall enhanced with word n-grams.
Source: https://github.com/m-popovic/chrF
Paper: https://www.aclweb.org/anthology/W15-3049.pdf
Higher is better # TODO I think
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_chrf(preds, refs).score
@register_aggregation("ter")
def ter(items):
"""Translation Error Rate is an error metric for machine translation that
measures the number of edits required to change a system output into one
of the references
Source: http://www.cs.umd.edu/~snover/tercom/
Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
Lower is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_ter(preds, refs).score
@register_metric(
metric="acc",
higher_is_better=True,
......@@ -160,6 +209,36 @@ def f1_fn(items): # This is a passthrough function
return items
@register_metric(
metric="bleu",
higher_is_better=True,
output_type="greedy_until",
aggregation="bleu",
)
def bleu_fn(items): # This is a passthrough function
return items
@register_metric(
metric="chrf",
higher_is_better=True,
output_type="greedy_until",
aggregation="chrf",
)
def chrf_fn(items): # This is a passthrough function
return items
@register_metric(
metric="ter",
higher_is_better=True,
output_type="greedy_until",
aggregation="ter",
)
def ter_fn(items): # This is a passthrough function
return items
@register_metric(
metric="acc_all",
higher_is_better=True,
......@@ -217,55 +296,6 @@ def weighted_mean(items):
return sum(a) / sum(b)
@register_metric(metric="bleu", higher_is_better=True, aggregation="mean")
def bleu(items):
"""The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
for evaluating a generated sentence to a reference sentence. It counts matching
n-grams in the candidate translation to n-grams in the reference text, where
1-gram or unigram would be each token and a bigram comparison would be each
word pair. The comparison is made regardless of word order
Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
Paper: https://www.aclweb.org/anthology/P02-1040/
Higher is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_bleu(preds, refs).score
@register_metric(metric="chrf", higher_is_better=True, aggregation="mean")
def chrf(items):
"""chrF++ is a tool for automatic evaluation of machine translation output
based on character n-gram precision and recall enhanced with word n-grams.
Source: https://github.com/m-popovic/chrF
Paper: https://www.aclweb.org/anthology/W15-3049.pdf
Higher is better # TODO I think
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_chrf(preds, refs).score
@register_metric(metric="ter", higher_is_better=True, aggregation="mean")
def ter(items):
"""Translation Error Rate is an error metric for machine translation that
measures the number of edits required to change a system output into one
of the references
Source: http://www.cs.umd.edu/~snover/tercom/
Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
Lower is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_ter(preds, refs).score
def is_non_str_iterable(obj):
return isinstance(obj, Iterable) and not isinstance(obj, str)
......@@ -302,7 +332,7 @@ def _sacreformat(refs, preds):
class _bootstrap_internal:
def __init__(self, f, n):
def __init__(self, f, n) -> None:
self.f = f
self.n = n
......
import abc
import os
from typing import Union, List, Tuple
import torch
from typing import Union, List, Tuple, Optional, Type, TypeVar
from sqlitedict import SqliteDict
import json
import hashlib
......@@ -11,9 +12,11 @@ from tqdm import tqdm
from lm_eval import utils
from lm_eval.logger import eval_logger
T = TypeVar("T", bound="LM")
class LM(abc.ABC):
def __init__(self):
def __init__(self) -> None:
"""Defines the interface that should be implemented by all LM subclasses.
LMs are assumed to take text (strings) as input and yield strings as output
(inputs/outputs should be tokenization-agnostic.)
......@@ -111,11 +114,28 @@ class LM(abc.ABC):
pass
@classmethod
def create_from_arg_string(cls, arg_string, additional_config=None):
def create_from_arg_string(
cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
) -> T:
"""
Creates an instance of the LM class using the given argument string and additional config.
Parameters:
- arg_string: A string containing arguments in the format key1=value1,key2=value2.
- additional_config: Optional dictionary containing additional configuration parameters.
Returns:
- Instance of the LM class.
"""
additional_config = {} if additional_config is None else additional_config
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
if args2.get("device") == "mps" or args.get("device") == "mps":
# TODO: delete once float16 MPS is fixed in torch stable
if (
args2.get("device") in ("mps", "mps:0")
or args.get("device") in ("mps", "mps:0")
and "dev" not in torch.__version__
):
args["dtype"] = "float32"
return cls(**args, **args2)
......@@ -133,7 +153,7 @@ class LM(abc.ABC):
# not support multi-device parallelism nor expect it.
return self._world_size
def set_cache_hook(self, cache_hook):
def set_cache_hook(self, cache_hook) -> None:
self.cache_hook = cache_hook
......@@ -144,14 +164,14 @@ def hash_args(attr, args):
class CacheHook:
def __init__(self, cachinglm):
def __init__(self, cachinglm) -> None:
if cachinglm is None:
self.dbdict = None
return
self.dbdict = cachinglm.dbdict
def add_partial(self, attr, req, res):
def add_partial(self, attr, req, res) -> None:
if self.dbdict is None:
return
hsh = hash_args(attr, req)
......@@ -159,7 +179,7 @@ class CacheHook:
class CachingLM:
def __init__(self, lm, cache_db):
def __init__(self, lm, cache_db) -> None:
"""LM wrapper that returns cached results if they exist, and uses the underlying LM if not.
:param lm: LM
......
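As a hedged illustration of the `create_from_arg_string` classmethod documented earlier in this file's diff (the `HFLM` import path and the argument values are assumptions made for the example, not part of this commit):
```python
from lm_eval.models.huggingface import HFLM

# arg_string is parsed into key=value pairs; additional_config entries whose value is
# None are dropped, and both dicts are forwarded to the constructor as keyword arguments.
lm = HFLM.create_from_arg_string(
    "pretrained=EleutherAI/pythia-160m,dtype=float32",
    {"batch_size": 8, "device": "cuda:0"},
)
```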
......@@ -117,24 +117,23 @@ def register_metric(**args):
return decorate
def get_metric(name):
def get_metric(name, hf_evaluate_metric=False):
if not hf_evaluate_metric:
if name in METRIC_REGISTRY:
return METRIC_REGISTRY[name]
else:
eval_logger.warning(
f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library..."
)
try:
return METRIC_REGISTRY[name]
except KeyError:
# TODO: change this print to logging?
print(
f"Could not find registered metric '{name}' in lm-eval, \
searching in HF Evaluate library..."
metric_object = evaluate.load(name)
return metric_object.compute
except Exception:
eval_logger.error(
f"{name} not found in the evaluate library! Please check https://huggingface.co/evaluate-metric",
)
try:
metric_object = evaluate.load(name)
return metric_object.compute
except Exception:
eval_logger.error(
"{} not found in the evaluate library!".format(name),
"Please check https://huggingface.co/evaluate-metric",
)
def register_aggregation(name):
......
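A hedged sketch of the reworked `get_metric` above in use; the metric names are illustrative, with `rouge` assumed to be resolvable through HF Evaluate rather than the internal registry:
```python
from lm_eval.api.registry import get_metric

acc_fn = get_metric("acc")  # found in lm-eval's METRIC_REGISTRY, returned directly
rouge_fn = get_metric("rouge", hf_evaluate_metric=True)  # loaded via evaluate.load(), returns .compute
```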
class Sampler:
def __init__(self, docs, task, fewshot_indices=None, rnd=None):
class ContextSampler:
def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
self.rnd = rnd
assert self.rnd, "must pass rnd to FewShotSampler!"
......@@ -19,7 +18,6 @@ class Sampler:
self.docs = self.docs.select(fewshot_indices)
def get_context(self, doc, num_fewshot):
# draw an extra fewshot sample if using same split as evaluating on
n_samples = (
num_fewshot + 1
......@@ -48,14 +46,14 @@ class Sampler:
)
+ self.target_delimiter
+ (
self.doc_to_target(doc)[0]
str(self.doc_to_target(doc)[0])
if type(self.doc_to_target(doc)) is list
else self.doc_to_target(doc)
if (
self.config.doc_to_choice is None
or type(self.doc_to_target(doc)) is str
)
else self.doc_to_choice(doc)[self.doc_to_target(doc)]
else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])
)
for doc in selected_docs
]
......@@ -73,8 +71,20 @@ class Sampler:
return self.rnd.sample(self.docs, n)
class BalancedSampler(Sampler):
def sample(self, n):
class FirstNSampler(ContextSampler):
def sample(self, n) -> None:
"""
Draw the first `n` samples in order from the specified split.
Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU.
"""
assert n <= len(
self.docs
), f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available."
return self.docs[:n]
class BalancedSampler(ContextSampler):
def sample(self, n) -> None:
"""
TODO: this should return approximately class-balanced samples from our fewshot examples.
TODO: what order should they be in? maybe random?
......@@ -83,12 +93,27 @@ class BalancedSampler(Sampler):
pass
class ManualSampler(Sampler):
def sample(self, n):
class ManualSampler(ContextSampler):
def sample(self, n) -> None:
""" """
pass
SAMPLER_REGISTRY = {
"default": ContextSampler,
"first_n": FirstNSampler,
}
def get_sampler(name):
try:
return SAMPLER_REGISTRY[name]
except KeyError:
raise ValueError(
f"Attempted to use contextsampler '{name}', but no sampling strategy for this name found! Supported model names: {', '.join(SAMPLER_REGISTRY.keys())}"
)
# TODO: how should we do design here? might be better to have a single sampler and pass more kwargs at init.
# Depends what's easier for new user to add own functionality on top of
......
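And a hedged sketch of the new sampler registry in use; the `lm_eval.api.samplers` import path is assumed from the file shown above:
```python
from lm_eval.api.samplers import get_sampler

sampler_cls = get_sampler("first_n")  # -> FirstNSampler; "default" resolves to ContextSampler
# Constructing a sampler needs a task and its fewshot docs, so it is only sketched here:
# sampler = sampler_cls(docs, task, rnd=random.Random(1234))
# fewshot_docs = sampler.sample(5)    # FirstNSampler returns the first 5 docs, in order
```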
This diff is collapsed.
......@@ -11,10 +11,9 @@ from lm_eval.api.registry import (
)
def include_benchmarks(task_dir):
def include_benchmarks(task_dir: str) -> None:
for root, subdirs, file_list in os.walk(task_dir):
if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
if (subdirs == [] or "__pycache__" in subdirs) and (len(file_list) > 0):
for f in file_list:
if f.endswith(".yaml"):
try:
......@@ -23,6 +22,9 @@ def include_benchmarks(task_dir):
with open(benchmark_path, "rb") as file:
yaml_config = yaml.full_load(file)
if "prompts" in yaml_config:
continue # Skip it
assert "group" in yaml_config
group = yaml_config["group"]
all_task_list = yaml_config["task"]
......@@ -34,6 +36,16 @@ def include_benchmarks(task_dir):
]
for task_config in config_list:
yaml_dir = os.path.dirname(benchmark_path)
task_config = utils.load_yaml_config(
yaml_config=task_config, yaml_dir=yaml_dir
)
if "use_prompt" in task_config:
if "yaml" in task_config["use_prompt"]:
task_config["use_prompt"] = os.path.join(
root, task_config["use_prompt"]
)
var_configs = check_prompt_config(
{
**task_config,
......
group: minerva_math
task:
- minerva_math_algebra
- minerva_math_counting_and_prob
- minerva_math_geometry
- minerva_math_intermediate_algebra
- minerva_math_num_theory
- minerva_math_prealgebra
- minerva_math_precalc