Commit a6b358ca authored by Rayyyyy's avatar Rayyyyy

update version

parent ed53d51c
 env
 *.pyc
+output/
 data/
 lm_cache
 .idea
+build
+dist
+*.egg-info
+venv
+.vscode/
+temp
+__pycache__
+.ipynb_checkpoints
+temp
+# IPython
+profile_default/
+ipython_config.py
+# don't track (the default location of) the cached requests
+lm_eval/caching/.cache
+# don't track files created by wandb
+wandb
+examples/wandb
@@ -2,7 +2,7 @@
 exclude: ^tests/testdata/
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.1.0
+    rev: v4.5.0
     hooks:
       - id: check-added-large-files
       - id: check-ast
@@ -12,33 +12,43 @@ repos:
       - id: check-merge-conflict
       - id: check-symlinks
       - id: check-yaml
+        args: ["--unsafe"]
       - id: destroyed-symlinks
       - id: detect-private-key
       - id: end-of-file-fixer
       - id: no-commit-to-branch
+        always_run: false
       - id: requirements-txt-fixer
       - id: trailing-whitespace
+        args: [--markdown-linebreak-ext=md]
       - id: fix-byte-order-marker
         exclude: docs/CNAME
       - id: fix-encoding-pragma
         args: [--remove]
       - id: mixed-line-ending
         args: [--fix=lf]
-  - repo: https://github.com/pycqa/flake8
-    rev: 3.7.9
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.2.2
     hooks:
-      - id: flake8
-  - repo: https://github.com/psf/black
-    rev: 22.3.0
-    hooks:
-      - id: black
-        language_version: python3.9
+      # Run the linter.
+      - id: ruff
+        args:
+          - --fix
+      # Run the formatter.
+      - id: ruff-format
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.1.0
+    rev: v2.2.6
    hooks:
       - id: codespell
         exclude: >
           (?x)^(
-              .*\.json|ignore.txt
+              .*\.json|ignore.txt|lm_eval/tasks/.*|.*yaml|.*\.ipynb
           )$
         args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.5.1
+    hooks:
+      - id: mypy
+        additional_dependencies: [".[sentencepiece,multilingual,promptsource,gptq]", "types-PyYAML", "types-requests"]
+        exclude: ^tests/.*$
-@software{eval-harness,
-  author       = {Gao, Leo and
-                  Tow, Jonathan and
-                  Biderman, Stella and
-                  Black, Sid and
-                  DiPofi, Anthony and
-                  Foster, Charles and
-                  Golding, Laurence and
-                  Hsu, Jeffrey and
-                  McDonell, Kyle and
-                  Muennighoff, Niklas and
-                  Phang, Jason and
-                  Reynolds, Laria and
-                  Tang, Eric and
-                  Thite, Anish and
-                  Wang, Ben and
-                  Wang, Kevin and
-                  Zou, Andy},
+@misc{eval-harness,
+  author       = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
   title        = {A framework for few-shot language model evaluation},
-  month        = sep,
-  year         = 2021,
+  month        = 12,
+  year         = 2023,
   publisher    = {Zenodo},
-  version      = {v0.0.1},
-  doi          = {10.5281/zenodo.5371628},
-  url          = {https://doi.org/10.5281/zenodo.5371628}
+  version      = {v0.4.0},
+  doi          = {10.5281/zenodo.10256836},
+  url          = {https://zenodo.org/records/10256836}
 }
# Contributing to LM Evaluation Harness
Welcome, and thank you for your interest in the LM Evaluation Harness! We welcome contributions and feedback, appreciate the time you spend with our library, and hope you find it useful!
We intend LM Evaluation Harness to be a broadly useful and accessible tool for evaluating language models.
## Important Resources
Information about LM Evaluation Harness is located in several places:
- Our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs)
- We occasionally use [GitHub Milestones](https://github.com/EleutherAI/lm-evaluation-harness/milestones) to track progress toward specific near-term version releases.
- We maintain a [Project Board](https://github.com/orgs/EleutherAI/projects/25) for tracking current work items and PRs, and for future roadmap items or feature requests.
- Further discussion and support conversations are located in the #lm-thunderdome channel of the [EleutherAI discord](https://discord.gg/eleutherai).
## Code Style
LM Evaluation Harness uses [ruff](https://github.com/astral-sh/ruff) for linting via [pre-commit](https://pre-commit.com/).
You can install linters and dev tools via
```pip install lm_eval[dev]``` or ```pip install -e ".[dev]"```
Then, run
```pre-commit install```
in order to ensure linters and other checks will be run upon committing.
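You can also run all hooks against every file in the repository at any time (a standard pre-commit command) via ```pre-commit run --all-files```.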
## Testing
We use [pytest](https://docs.pytest.org/en/latest/) for running unit tests. All library unit tests can be run via:
```
python -m pytest --ignore=tests/tests_master --ignore=tests/extra
```
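To iterate on a single test module or test case during development, pytest's standard selection flags work as usual; the test path and name here are placeholders:
```
python -m pytest tests/test_evaluator.py -k "name_of_test" -v
```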
## Contributor License Agreement
We ask that new contributors agree to a Contributor License Agreement affirming that EleutherAI holds the rights to use their contributions to our library.
First-time pull requests will have a reply added by @CLAassistant containing instructions for how to confirm this, and we require it before merging your PR.
## Contribution Best Practices
We recommend a few best practices to make your contributions or reported errors easier to assist with.
**For Pull Requests:**
- PRs should be titled descriptively, and be opened with a brief description of the scope and intent of the new contribution.
- New features should have appropriate documentation added alongside them.
- Aim for code maintainability, and minimize code copying.
- If opening a task, try to share test results on the task using a publicly-available model, and if any public results are available on the task, compare to them.
**For Feature Requests:**
- Provide a short paragraph's worth of description. What is the feature you are requesting? What is its motivation, and an example use case of it? How does this differ from what is currently supported?
**For Bug Reports**:
- Provide a short description of the bug.
- Provide a *reproducible example*: what is the command you run with our library that results in this error? Have you tried any other steps to resolve it?
- Provide a *full error traceback* of the error that occurs, if applicable. A one-line error message or small screenshot snippet is unhelpful without the surrounding context.
- Note what version of the codebase you are using, and any specifics of your environment and setup that may be relevant.
**For Requesting New Tasks**:
- Provide a 1-2 sentence description of what the task is and what it evaluates.
- Provide a link to the paper introducing the task.
- Provide a link to where the dataset can be found.
- Provide a link to a paper containing results on an open-source model on the task, for use in comparisons and implementation validation.
- If applicable, link to any codebase that has implemented the task (especially the original publication's codebase, if existent).
## How Can I Get Involved?
To quickly get started, we maintain a list of good first issues, which can be found [on our project board](https://github.com/orgs/EleutherAI/projects/25/views/8) or by [filtering GH Issues](https://github.com/EleutherAI/lm-evaluation-harness/issues?q=is%3Aopen+label%3A%22good+first+issue%22+label%3A%22help+wanted%22). These are typically smaller code changes or self-contained features which can be added without extensive familiarity with library internals, and we recommend new contributors consider taking a stab at one of these first if they are feeling uncertain where to begin.
There are a number of distinct ways to contribute to LM Evaluation Harness, and all are extremely helpful! A sampling of ways to contribute include:
- **Implementing and verifying new evaluation tasks**: Is there a task you'd like to see LM Evaluation Harness support? Consider opening an issue requesting it, or helping add it! Verifying and cross-checking task implementations with their original versions is also a very valuable form of assistance in ensuring standardized evaluation.
- **Improving documentation**: Improvements to the documentation, or notes on pain points and gaps in it, help us improve the library's user experience and the clarity and coverage of our docs.
- **Testing and devops**: We are very grateful for any assistance in adding tests for the library that can be run for new PRs, and other devops workflows.
- **Adding new modeling / inference library integrations**: We hope to support a broad range of commonly-used inference libraries popular among the community, and welcome PRs for new integrations, so long as they are documented properly and maintainable.
- **Proposing or contributing new features**: We want LM Evaluation Harness to support a broad range of evaluation use cases. If you have a feature that is not currently supported but desired, feel free to open an issue describing the feature and, if applicable, how you intend to implement it. We would be happy to give feedback on the cleanest way to implement new functionality, and are happy to coordinate with interested contributors via GitHub discussions or via Discord.
We hope that this has been helpful, and appreciate your interest in contributing! Further questions can be directed to [our Discord](https://discord.gg/eleutherai).
# Eval Harness Documentation
Welcome to the docs for the LM Evaluation Harness!
## Table of Contents
* To learn about the public interface of the library, as well as how to evaluate via the command line or as integrated into an external library, see the [Interface](./interface.md) guide.
* To learn how to add a new inference library, API, or model type, as well as for a quick explainer on the ways an LM can be evaluated, see the [Model Guide](./model_guide.md).
* For a crash course on adding new tasks to the library, see our [New Task Guide](./new_task_guide.md).
* To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Task Configuration Guide](./task_guide.md).
@@ -2,15 +2,14 @@
 ## Usage
-Simply add a "--decontamination_ngrams_path" when running main.py. The provided directory should contain
+The provided directory should contain
 the ngram files and info.json produced in "Pile Ngram Generation" further down.
 ```bash
-python main.py \
+python -m lm_eval \
     --model gpt2 \
     --device 0 \
-    --tasks sciq \
-    --decontamination_ngrams_path path/containing/training/set/ngrams
+    --tasks sciq
 ```
 ## Background
@@ -70,5 +69,3 @@ python -m scripts/clean_training_data/compress_and_package \
     -output path/to/final/directory \
     -procs 8
 ```
-Congratulations, the final directory can now be passed to lm-evaluation-harness with the "--decontamination_ngrams_path" argument.
# Description Guide
![fewshot-example](./img/fewshot_example_gpt3.png)
(Figure from [Brown et al., 2020](https://arxiv.org/pdf/2005.14165.pdf))
Task descriptions provide in-context task instruction for your language model. If you'd like to prepend a natural language description to your few-shot examples and prompt, you can do so on a per-task basis via the `description_dict` arg of [`evaluator.evaluate`](../lm_eval/evaluator.py). This `description_dict` must adhere to the following key-value structure:
- **key**: the task name (`str`) as specified in the lm-eval-harness [task registry](../lm_eval/tasks/__init__.py).
- **value**: the corresponding (`str`) description/prompt for the task identified by **key**.
```python
description_dict = {
"task_name_1": "description",
"task_name_2": "description",
...
}
```
Note that a task's description will be separated from its following few-shot examples and prompt by a newline, as follows:
```python
"""
<description>
<examples>
<prompt>
"""
```
## Descriptions in File
One can also interface with the aforementioned [`evaluator.evaluate`](../lm_eval/evaluator.py) (or `evaluator.simple_evaluate`) method from a higher level by simply passing a JSON file path to the `description_dict_path` arg of the command-line interface (CLI) program, `main.py`. The JSON file pointed to should be structured the same as the `description_dict`. E.g. for some file at `/your/path/descriptions.json` you may have:
```json
{
"cycle_letters": "Please unscramble the letters into a word, and write that word:",
"copa": "Given a premise and one alternative with a causal relation to the premise and another without, choose the more plausible alternative"
}
```
which can then be supplied to the CLI as:
```bash
python main.py \
--tasks cycle_letters,copa \
--description_dict_path /your/path/descriptions.json \
...
```
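The same descriptions can also be supplied programmatically. A minimal sketch, assuming the `description_dict` keyword on `evaluator.simple_evaluate` described above (the model and few-shot settings shown are illustrative):
```python
from lm_eval import evaluator

description_dict = {
    "cycle_letters": "Please unscramble the letters into a word, and write that word:",
}

# `description_dict` is forwarded to each task when building few-shot prompts
results = evaluator.simple_evaluate(
    model="gpt2",  # illustrative model name
    tasks=["cycle_letters"],
    num_fewshot=2,
    description_dict=description_dict,
)
```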
# User Guide
This document details the interface exposed by `lm-eval` and provides details on what flags are available to users.
## Command-line Interface
A majority of users run the library by cloning it from GitHub, installing the package as editable, and running the `python -m lm_eval` script.
Equivalently, the library can be run via the `lm-eval` entrypoint at the command line.
This mode supports a number of command-line arguments, the details of which can also be seen by running with `-h` or `--help`:
- `--model` : Selects which model type or provider is evaluated. Must be a string corresponding to the name of the model type/provider being used. See [the main README](https://github.com/EleutherAI/lm-evaluation-harness/tree/main#commercial-apis) for a full list of enabled model names and supported libraries or APIs.
- `--model_args` : Controls parameters passed to the model constructor. Accepts a string containing comma-separated keyword arguments to the model class of the format `"arg1=val1,arg2=val2,..."`, for example `--model_args pretrained=EleutherAI/pythia-160m,dtype=float32`. For a full list of supported keyword arguments, see the initialization of the `lm_eval.api.model.LM` subclass, e.g. [`HFLM`](https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/models/huggingface.py#L66).
- `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names. Must be solely comprised of valid tasks/groups. A list of supported tasks can be viewed with `--tasks list`.
- `--num_fewshot` : Sets the number of few-shot examples to place in context. Must be an integer.
- `--gen_kwargs` : Takes an arg string in the same format as `--model_args` and creates a dictionary of keyword arguments. These will be passed to the models for all `generate_until` (free-form or greedy generation) tasks, to set options such as the sampling temperature or `top_p` / `top_k`. For a list of what args are supported for each model type, reference the respective library's documentation (for example, the documentation for `transformers.AutoModelForCausalLM.generate()`). These kwargs will be applied to all `generate_until` tasks called; we do not currently support unique gen_kwargs or batch_size values per task in a single run of the library. To control these on a per-task level, set them in that task's YAML file; see also the combined example invocation after this list.
- `--batch_size` : Sets the batch size used for evaluation. Can be a positive integer or `"auto"` to automatically select the largest batch size that will fit in memory, speeding up evaluation. One can pass `--batch_size auto:N` to re-select the maximum batch size `N` times during evaluation. This can help accelerate evaluation further, since `lm-eval` sorts documents in descending order of context length.
- `--max_batch_size` : Sets the maximum batch size to try to fit in memory, if `--batch_size auto` is passed.
- `--device` : Sets which device to place the model onto. Must be a string, for example, `"cuda", "cuda:0", "cpu", "mps"`. Defaults to "cuda", and can be ignored if running multi-GPU or running a non-local model type.
- `--output_path` : A string of the form `dir/file.jsonl` or `dir/`. Provides a path where high-level results will be saved, either into the file named or into the directory named. If `--log_samples` is passed as well, then per-document outputs and metrics will be saved into the directory as well.
- `--log_samples` : If this flag is passed, then the model's outputs, and the text fed into the model, will be saved at per-document granularity. Must be used with `--output_path`.
- `--limit` : Accepts an integer, or a float between 0.0 and 1.0 . If passed, will limit the number of documents to evaluate to the first X documents (if an integer) per task or first X% of documents per task. Useful for debugging, especially on costly API models.
- `--use_cache` : Should be a path where a sqlite db file can be written to. Takes a string of format `/path/to/sqlite_cache_` in order to create a cache db at `/path/to/sqlite_cache_rank{i}.db` for each process (0-NUM_GPUS). This allows results of prior runs to be cached, so that a given (model, task) pair does not need to be re-run in order to re-score it.
- `--cache_requests` : Can be "true", "refresh", or "delete". "true" means that the cache should be used. "refresh" means that you wish to regenerate the cache, which you should run if you change your dataset configuration for a given task. "delete" will delete the cache. Cached files are stored under `lm_eval/caching/.cache` unless you specify a different path via the environment variable `LM_HARNESS_CACHE_PATH`, e.g. `LM_HARNESS_CACHE_PATH=~/Documents/cache_for_lm_harness`.
- `--check_integrity` : If this flag is used, the library tests for each task selected are run to confirm task integrity.
- `--write_out` : Used for diagnostic purposes to observe the format of task documents passed to a model. If this flag is used, then prints the prompt and gold target string for the first document of each task.
- `--show_config` : If used, prints the full `lm_eval.api.task.TaskConfig` contents (the non-default settings in the task YAML file) for each task which was run, at the completion of an evaluation. Useful when one is modifying a task's configuration YAML locally, for transmitting the exact configurations used for debugging or for reproducibility purposes.
- `--include_path` : Accepts a path to a folder. If passed, then all YAML files containing `lm-eval` compatible task configurations will be added to the task registry as available tasks. Used for when one is writing config files for their own task in a folder other than `lm_eval/tasks/`.
- `--predict_only`: Generates the model outputs without computing metrics. Use with `--log_samples` to retrieve decoded results.
- `--seed`: Set seed for python's random, numpy, torch, and fewshot sampling. Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, respectively, or a single integer to set the same seed for all four. The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234,1234` (for backward compatibility). E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and the fewshot sampling seed to 52; here numpy's seed is not set since the second value is `None`. E.g., `--seed 42` sets all four seeds to 42.
- `--wandb_args`: Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., ```--wandb_args project=test-project,name=test-run```
- `--hf_hub_log_args` : Logs evaluation results to Hugging Face Hub. Accepts a string with the arguments separated by commas. Available arguments:
* `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`,
* `hub_repo_name` - repository name on Hugging Face Hub, e.g., `lm-eval-results`,
* `push_results_to_hub` - whether to push results to Hugging Face Hub, can be `True` or `False`,
* `push_samples_to_hub` - whether to push sample results to Hugging Face Hub, can be `True` or `False`. Requires `--log_samples` to be set,
* `public_repo` - whether the repository is public, can be `True` or `False`.
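Putting several of these flags together, a representative invocation might look like the following (the model, tasks, and output path are illustrative):
```bash
lm-eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-160m,dtype=float32 \
    --tasks hellaswag,arc_easy \
    --num_fewshot 5 \
    --batch_size auto \
    --device cuda:0 \
    --output_path results/pythia-160m/ \
    --log_samples \
    --seed 42
```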
## External Library Usage
We also support using the library's external API for use within model training loops or other scripts.
`lm_eval` supplies two functions for external import and use: `lm_eval.evaluate()` and `lm_eval.simple_evaluate()`.
`simple_evaluate()` can be used by simply creating an `lm_eval.api.model.LM` subclass that implements the methods described in the [Model Guide](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs/model_guide.md), and wrapping your custom model in that class as follows:
```python
import lm_eval
...
my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code)
...
# instantiate an LM subclass that takes your initialized model and can run
# - `Your_LM.loglikelihood()`
# - `Your_LM.loglikelihood_rolling()`
# - `Your_LM.generate_until()`
lm_obj = Your_LM(model=my_model, batch_size=16)
# indexes all tasks from the `lm_eval/tasks` subdirectory.
# Alternatively, you can set `TaskManager(include_path="path/to/my/custom/task/configs")`
# to include a set of tasks in a separate directory.
task_manager = lm_eval.tasks.TaskManager()
# Setting `task_manager` to the one above is optional and should generally be done
# if you want to include tasks from paths other than ones in `lm_eval/tasks`.
# `simple_evaluate` will instantiate its own task_manager if it is set to None here.
results = lm_eval.simple_evaluate( # call simple_evaluate
    model=lm_obj,
    tasks=["taskname1", "taskname2"],
    num_fewshot=0,
    task_manager=task_manager,
    ...
)
```
See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L35 for a full description of all available arguments. All keyword arguments to `simple_evaluate` share the same role as the command-line flags described previously.
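For a quick human-readable summary of the returned `results` dictionary, you can reuse the same table helper the CLI uses for its final printout (a small sketch, assuming the `results` object from the call above):
```python
from lm_eval.utils import make_table

# aggregated metrics for each task
print(make_table(results))
# aggregated metrics for each group of tasks, if any groups were run
if "groups" in results:
    print(make_table(results, "groups"))
```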
Additionally, the `evaluate()` function offers the core evaluation functionality provided by the library, but without some of the special handling and simplification + abstraction provided by `simple_evaluate()`.
See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L173 for more details.
As a brief example usage of `evaluate()`:
```python
import lm_eval
# suppose you've defined a custom lm_eval.api.Task subclass in your own external codebase
from my_tasks import MyTask1
...
# create your model (could be running finetuning with some custom modeling code)
my_model = initialize_my_model()
...
# instantiate an LM subclass that takes your initialized model and can run
# - `Your_LM.loglikelihood()`
# - `Your_LM.loglikelihood_rolling()`
# - `Your_LM.generate_until()`
lm_obj = Your_LM(model=my_model, batch_size=16)
# optional: the task_manager indexes tasks including ones
# specified by the user through `include_path`.
task_manager = lm_eval.tasks.TaskManager(
include_path="/path/to/custom/yaml"
)
# To get a task dict for `evaluate`
task_dict = lm_eval.tasks.get_task_dict(
    [
        "mmlu",  # A stock task
        "my_custom_task",  # A custom task
        {
            "task": ...,  # A dict that configures a task
            "doc_to_text": ...,
        },
        MyTask1,  # A task object from `lm_eval.api.task.Task`
    ],
    task_manager,  # A task manager that allows lm_eval to
                   # load the task during evaluation.
                   # If none is provided, `get_task_dict`
                   # will instantiate one itself, but this
                   # only includes the stock tasks, so users
                   # will need to set this if including
                   # custom paths is required.
)

results = lm_eval.evaluate(
    lm=lm_obj,
    task_dict=task_dict,
    ...
)
```
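Because the returned dictionary can contain values that are not JSON-serializable out of the box, the CLI serializes it with a helper; a sketch of persisting results to disk in the same way (the output filename is illustrative):
```python
import json

from lm_eval.utils import handle_non_serializable

# `default=` converts otherwise non-serializable values (e.g. numpy types)
dumped = json.dumps(
    results, indent=2, default=handle_non_serializable, ensure_ascii=False
)
with open("results.json", "w") as f:
    f.write(dumped)
```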
# New Model Guide
This guide may be of special interest to users who are using the library outside of the repository, e.g. those who have installed it from PyPI and are calling `lm_eval.evaluator.evaluate()` to evaluate an existing model.
In order to properly evaluate a given LM, we require implementation of a wrapper class subclassing the `lm_eval.api.model.LM` class that defines how the Evaluation Harness should interface with your model. This guide walks through how to write this `LM` subclass by adding it to the library!
## Setup
To get started contributing, go ahead and fork the main repo, clone it, create a branch with the name of your model, and install the project requirements in your environment:
```sh
# After forking...
git clone https://github.com/<YOUR-USERNAME>/lm-evaluation-harness.git
cd lm-evaluation-harness
git checkout -b <model-type>
pip install -e ".[dev]"
```
Now, we'll create a new file where we'll be adding our model:
```sh
touch lm_eval/models/<my_model_filename>.py
```
**Tip: this filename should not shadow package names! For example, naming your file `anthropic.py` is disallowed since the API's name on pypi is `anthropic`, but naming it `anthropic_llms.py` works with no problems.**
## Interface
All models must subclass the `lm_eval.api.model.LM` class.
The LM class enforces a common interface via which we can extract responses from a model:
```python
class MyCustomLM(LM):
    #...
    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        #...

    def loglikelihood_rolling(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        #...

    def generate_until(self, requests: list[Instance]) -> list[str]:
        #...
    #...
```
Where `Instance` is a dataclass defined in [`lm_eval.api.instance`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/api/instance.py) with property `args` of request-dependent type signature described below.
We support three types of requests, consisting of different interactions / measurements with an autoregressive LM.
All three request types take as input `requests` of type `list[Instance]` that have a matching `Instance.request_type` to the method name.
- `generate_until`
- Each request contains `Instance.args : Tuple[str, dict]` containing 1. an input string to the LM and 2. a dictionary of keyword arguments used to control generation parameters.
- Using this input and these generation parameters, text will be sampled from the language model (typically until a maximum output length or specific stopping string sequences are reached, for example, `{"until": ["\n\n", "."], "max_gen_toks": 128}`).
- The generated input+output text from the model will then be returned.
- `loglikelihood`
- Each request contains `Instance.args : Tuple[str, str]` containing 1. an input string to the LM and 2. a target string on which the loglikelihood of the LM producing this target, conditioned on the input, will be returned.
- Each request will have, as result, `(ll, is_greedy): Tuple[float, int]` returned, where `ll` is a floating point number representing the log probability of generating the target string conditioned on the input, and `is_greedy` is either `0` or `1`, with it being `1` if and only if the target string *would be generated by greedy sampling from the LM* (that is, if the target string is the *most likely* N-token string to be output by the LM given the input).
- `loglikelihood_rolling`
- Each request contains `Instance.args : Tuple[str]`, which is an input string to the model whose *entire* loglikelihood, conditioned on purely the EOT token, will be calculated.
- This is used to evaluate *perplexity* on a data distribution.
- It should return `(ll,) : Tuple[float]` , a.k.a. solely the *loglikelihood* of producing each piece of text given no starting input.
To allow a model to be evaluated on all types of tasks, you will need to implement these three types of measurements (note that `loglikelihood_rolling` is a special case of `loglikelihood`). For a reference implementation, check out `lm_eval/models/huggingface.py`! Additionally, check out `lm_eval.api.model.TemplateLM` for a class that abstracts away some commonly used functions across LM subclasses, or see if your model would lend itself well to subclassing the `lm_eval.models.huggingface.HFLM` class and overriding just the initialization or a couple of methods!
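To make the request-handling pattern concrete, here is a minimal toy sketch of such a subclass; the `backend` object and its `score` / `generate` methods are hypothetical stand-ins for your model's real scoring and generation code:
```python
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM


class MyCustomLM(LM):
    def __init__(self, backend):
        super().__init__()
        self.backend = backend  # hypothetical handle to your actual model

    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        results = []
        for request in requests:
            context, continuation = request.args  # (input string, target string)
            # hypothetical call returning (logprob, is_greedy)
            results.append(self.backend.score(context, continuation))
        return results

    def loglikelihood_rolling(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        # score each full string conditioned on nothing but the EOT token
        return [self.backend.score("", request.args[0]) for request in requests]

    def generate_until(self, requests: list[Instance]) -> list[str]:
        outputs = []
        for request in requests:
            context, gen_kwargs = request.args  # (input string, generation kwargs)
            outputs.append(self.backend.generate(context, **gen_kwargs))
        return outputs
```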
**Tip: be careful of indexing in loglikelihood!**
LMs take in tokens in position `[0 1 2 ... N]` and output a probability distribution for token position `N+1`. We provide a simplified graphic here, excerpted from `huggingface.py`:
```
# how this all works (illustrated on a causal decoder-only setup):
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# model \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the
# cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
```
The final token of the target is not passed into the LM, because we want the LM's predictions *up to but not past* that final target token. For more information, check out https://github.com/EleutherAI/lm-evaluation-harness/issues/942.
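To make the indexing concrete, here is a simplified sketch of scoring a single (context, continuation) pair with a HuggingFace-style causal model; it assumes pre-tokenized inputs and a `model` whose output exposes `.logits`, and omits batching, truncation, and device handling:
```python
import torch


@torch.no_grad()
def continuation_logprob(model, context_enc: list, continuation_enc: list):
    # full sequence minus the final target token: its prediction is the last logit kept
    inp = torch.tensor([context_enc + continuation_enc])[:, :-1]
    logprobs = model(inp).logits.log_softmax(dim=-1)  # [1, seq_len - 1, vocab]
    # keep only the positions whose logits predict continuation tokens
    cont_logprobs = logprobs[:, -len(continuation_enc):, :]
    cont_toks = torch.tensor([continuation_enc])  # [1, len(continuation)]
    ll = torch.gather(cont_logprobs, 2, cont_toks.unsqueeze(-1)).squeeze(-1).sum()
    # greedy iff every continuation token is the argmax at its position
    is_greedy = bool((cont_logprobs.argmax(dim=-1) == cont_toks).all())
    return ll.item(), is_greedy
```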
## Registration
Congrats on implementing your model! Now it's time to test it out.
To make your model usable via the command line interface to `lm-eval` using `python -m lm_eval`, you'll need to tell `lm-eval` what your model's name is.
This is done via a *decorator*, `lm_eval.api.registry.register_model`. Using `register_model()`, one can both tell the package which name(s) can be used to invoke the model with `python -m lm_eval --model <name>` and alert `lm-eval` to the model's existence.
```python
from lm_eval.api.registry import register_model
@register_model("<name1>", "<name2>")
class MyCustomLM(LM):
    ...
```
Using this decorator results in the class being added to the registry of usable LM types maintained internally by the library at `lm_eval.api.registry.MODEL_REGISTRY`. See `lm_eval.api.registry` for more detail on what sorts of registries and decorators exist in the library!
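A quick way to sanity-check the registration, using the `MODEL_REGISTRY` mapping mentioned above (run after importing the module that defines your class):
```python
from lm_eval.api.registry import MODEL_REGISTRY

assert "<name1>" in MODEL_REGISTRY  # your chosen names should now appear here
print(MODEL_REGISTRY["<name1>"])    # resolves back to MyCustomLM
```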
**Tip: be sure to import your model in `lm_eval/models/__init__.py`!**
## Testing
We also recommend that new model contributions be accompanied by short tests of their 3 core functionalities, at minimum. To see an example of such tests, look at https://github.com/EleutherAI/lm-evaluation-harness/blob/35bdecd379c0cefad6897e67db892f4a6026a128/tests/test_ggml.py .
## Other
**Pro tip**: In order to make the Evaluation Harness overestimate total runtimes rather than underestimate them, HuggingFace models come in-built with the ability to provide responses on data points in *descending order by total input length* via `lm_eval.utils.Reorderer`. Take a look at `lm_eval.models.huggingface.HFLM` to see how this is done, and see if you can implement it in your own model!
## Conclusion
After reading this guide, you should be able to add new model APIs or implementations to the Eval Harness library!
{
"cells": [
{
"cell_type": "markdown",
"id": "fc477b96-adee-4829-a9d7-a5eb990df358",
"metadata": {},
"source": [
"# Visualizing Results in Weights and Biases\n",
"\n",
"With the Weights and Biases integration, you can now spend more time extracting deeper insights into your evaluation results. The integration is designed to streamline the process of logging and visualizing experiment results using the Weights & Biases (W&B) platform.\n",
"\n",
"The integration provide functionalities\n",
"\n",
"- to automatically log the evaluation results,\n",
"- log the samples as W&B Tables for easy visualization,\n",
"- log the `results.json` file as an artifact for version control,\n",
"- log the `<task_name>_eval_samples.json` file if the samples are logged,\n",
"- generate a comprehensive report for analysis and visualization with all the important metric,\n",
"- log task and cli configs,\n",
"- and more out of the box like the command used to run the evaluation, GPU/CPU counts, timestamp, etc.\n",
"\n",
"The integration is super easy to use with the eval harness. Let's see how!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3851439a-bff4-41f2-bf21-1b3d8704913b",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Install this project if you did not already have it.\n",
"# This is all that is needed to be installed to start using Weights and Biases\n",
"\n",
"!pip -qq install -e ..[wandb]"
]
},
{
"cell_type": "markdown",
"id": "8507fd7e-3b99-4a92-89fa-9eaada74ba91",
"metadata": {},
"source": [
"# Run the Eval Harness\n",
"\n",
"Run the eval harness as usual with a `wandb_args` flag. This flag is used to provide arguments for initializing a wandb run ([wandb.init](https://docs.wandb.ai/ref/python/init)) as comma separated string arguments.\n",
"\n",
"If `wandb_args` flag is used, the metrics and all other goodness will be automatically logged to Weights and Biases. In the stdout, you will find the link to the W&B run page as well as link to the generated report."
]
},
{
"cell_type": "markdown",
"id": "eec5866e-f01e-42f8-8803-9d77472ef991",
"metadata": {},
"source": [
"## Set your API Key\n",
"\n",
"Before you can use W&B, you need to authenticate your machine with an authentication key. Visit https://wandb.ai/authorize to get one."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d824d163-71a9-4313-935d-f1d56397841c",
"metadata": {},
"outputs": [],
"source": [
"import wandb\n",
"\n",
"wandb.login()"
]
},
{
"cell_type": "markdown",
"id": "124e4a34-1547-4bed-bc09-db012bacbda6",
"metadata": {},
"source": [
"> Note that if you are using command line you can simply authenticate your machine by doing `wandb login` in your terminal. For more info check out the [documentation](https://docs.wandb.ai/quickstart#2-log-in-to-wb)."
]
},
{
"cell_type": "markdown",
"id": "abc6f6b6-179a-4aff-ada9-f380fb74df6e",
"metadata": {},
"source": [
"## Run and log to W&B"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bd0a8130-a97b-451a-acd2-3f9885b88643",
"metadata": {},
"outputs": [],
"source": [
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=microsoft/phi-2,trust_remote_code=True \\\n",
" --tasks hellaswag,mmlu_abstract_algebra \\\n",
" --device cuda:0 \\\n",
" --batch_size 8 \\\n",
" --output_path output/phi-2 \\\n",
" --limit 10 \\\n",
" --wandb_args project=lm-eval-harness-integration \\\n",
" --log_samples"
]
},
{
"cell_type": "markdown",
"id": "5178ca9445b844e4",
"metadata": {},
"source": "W&B can also be initialized programmatically for use outside the CLI to parse and log the results."
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6a421b2cf3ddac5",
"metadata": {},
"outputs": [],
"source": [
"import lm_eval\n",
"from lm_eval.logging_utils import WandbLogger\n",
"\n",
"results = lm_eval.simple_evaluate(\n",
" model=\"hf\",\n",
" model_args=\"pretrained=microsoft/phi-2,trust_remote_code=True\",\n",
" tasks=\"hellaswag,mmlu_abstract_algebra\",\n",
" log_samples=True,\n",
")\n",
"\n",
"wandb_logger = WandbLogger(\n",
" project=\"lm-eval-harness-integration\", job_type=\"eval\"\n",
") # or empty if wandb.init(...) already called before\n",
"wandb_logger.post_init(results)\n",
"wandb_logger.log_eval_result()\n",
"wandb_logger.log_eval_samples(results[\"samples\"]) # if log_samples"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Visualizing Results in Zeno\n",
"\n",
"Benchmarking your models is the first step towards making sure your model performs well.\n",
"However, looking at the data behind the benchmark, slicing the data into subsets, and comparing models on individual instances can help you even more in evaluating and quantifying the behavior of your AI system.\n",
"\n",
"All of this can be done in [Zeno](https://zenoml.com)!\n",
"Zeno is super easy to use with the eval harness, let's explore how you can easily upload and visualize your eval results.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install this project if you did not already do that. This is all that needs to be installed for you to be able to visualize your data in Zeno!\n",
"!pip install -e ..\n",
"!pip install -e ..[zeno]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Run the Eval Harness\n",
"\n",
"To visualize the results, run the eval harness with the `log_samples` and `output_path` flags. We expect `output_path` to contain multiple folders that represent individual model names. You can thus run your evaluation on any number of tasks and models and upload all of the results as projects on Zeno.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=EleutherAI/gpt-neo-2.7B \\\n",
" --tasks hellaswag,wikitext \\\n",
" --batch_size 8 \\\n",
" --device mps \\\n",
" --log_samples \\\n",
" --output_path output/gpt-neo-2.7B \\\n",
" --limit 10"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Set your API Key\n",
"\n",
"This is so you can be authenticated with Zeno.\n",
"If you don't already have a Zeno account, first create an account on [Zeno Hub](https://hub.zenoml.com).\n",
"After logging in to Zeno Hub, generate your API key by clicking on your profile at the bottom left to navigate to your account page.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%env ZENO_API_KEY=YOUR_API_KEY"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Visualize Eval Results\n",
"\n",
"You can now use the `zeno_visualize` script to upload the results to Zeno.\n",
"\n",
"This will use all subfolders in `data_path` as different models and upload all tasks within these model folders to Zeno. If you run the eval harness on multiple tasks, the `project_name` will be used as a prefix and one project will be created per task.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!python ../scripts/zeno_visualize.py --data_path output --project_name \"Zeno Upload Test\""
]
}
],
"metadata": {
"kernelspec": {
"display_name": "zeno_projects",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -4,3 +4,5 @@ nin
 maka
 mor
 te
+ond
+extraversion
from .evaluator import evaluate, simple_evaluate
\ No newline at end of file
import argparse
import json
import logging
import os
import sys
from functools import partial
from typing import Union

from lm_eval import evaluator, utils
from lm_eval.evaluator import request_caching_arg_to_dict
from lm_eval.logging import EvaluationTracker, WandbLogger
from lm_eval.tasks import TaskManager
from lm_eval.utils import handle_non_serializable, make_table, simple_parse_args_string


def _int_or_none_list_arg_type(
    min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","
):
    def parse_value(item):
        item = item.strip().lower()
        if item == "none":
            return None
        try:
            return int(item)
        except ValueError:
            raise argparse.ArgumentTypeError(f"{item} is not an integer or None")

    items = [parse_value(v) for v in value.split(split_char)]
    num_items = len(items)

    if num_items == 1:
        # Makes downstream handling the same for single and multiple values
        items = items * max_len
    elif num_items < min_len or num_items > max_len:
        raise argparse.ArgumentTypeError(
            f"Argument requires {max_len} integers or None, separated by '{split_char}'"
        )
    elif num_items != max_len:
        logging.warning(
            f"Argument requires {max_len} integers or None, separated by '{split_char}'. "
            "Missing values will be filled with defaults."
        )
        default_items = [parse_value(v) for v in defaults.split(split_char)]
        items.extend(
            default_items[num_items:]
        )  # extend items list with missing defaults

    return items


def check_argument_types(parser: argparse.ArgumentParser):
    """
    Check to make sure all CLI args are typed, raises error if not
    """
    for action in parser._actions:
        if action.dest != "help" and not action.const:
            if action.type is None:
                raise ValueError(
                    f"Argument '{action.dest}' doesn't have a type specified."
                )
            else:
                continue


def setup_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`"
    )
    parser.add_argument(
        "--tasks",
        "-t",
        default=None,
        type=str,
        metavar="task1,task2",
        help="To get full list of tasks, use the command lm-eval --tasks list",
    )
    parser.add_argument(
        "--model_args",
        "-a",
        default="",
        type=str,
        help="Comma separated string arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
    )
    parser.add_argument(
        "--num_fewshot",
        "-f",
        type=int,
        default=None,
        metavar="N",
        help="Number of examples in few-shot context",
    )
    parser.add_argument(
        "--batch_size",
        "-b",
        type=str,
        default=1,
        metavar="auto|auto:N|N",
        help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
    )
    parser.add_argument(
        "--max_batch_size",
        type=int,
        default=None,
        metavar="N",
        help="Maximal batch size to try with --batch_size auto.",
    )
    parser.add_argument(
        "--device",
        type=str,
        default=None,
        help="Device to use (e.g. cuda, cuda:0, cpu).",
    )
    parser.add_argument(
        "--output_path",
        "-o",
        default=None,
        type=str,
        metavar="DIR|DIR/file.json",
        help="The path to the output file where the result metrics will be saved. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
    )
    parser.add_argument(
        "--limit",
        "-L",
        type=float,
        default=None,
        metavar="N|0<N<1",
        help="Limit the number of examples per task. "
        "If <1, limit is a percentage of the total number of examples.",
    )
    parser.add_argument(
        "--use_cache",
        "-c",
        type=str,
        default=None,
        metavar="DIR",
        help="A path to a sqlite db file for caching model responses. `None` if not caching.",
    )
    parser.add_argument(
        "--cache_requests",
        type=str,
        default=None,
        choices=["true", "refresh", "delete"],
        help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
    )
    parser.add_argument(
        "--check_integrity",
        action="store_true",
        help="Whether to run the relevant part of the test suite for the tasks.",
    )
    parser.add_argument(
        "--write_out",
        "-w",
        action="store_true",
        default=False,
        help="Prints the prompt for the first few documents.",
    )
    parser.add_argument(
        "--log_samples",
        "-s",
        action="store_true",
        default=False,
        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
    )
    parser.add_argument(
        "--show_config",
        action="store_true",
        default=False,
        help="If True, shows the full config of all tasks at the end of the evaluation.",
    )
    parser.add_argument(
        "--include_path",
        type=str,
        default=None,
        metavar="DIR",
        help="Additional path to include if there are external tasks to include.",
    )
    parser.add_argument(
        "--gen_kwargs",
        type=str,
        default=None,
        help=(
            "String arguments for model generation on greedy_until tasks,"
            " e.g. `temperature=0,top_k=0,top_p=0`."
        ),
    )
    parser.add_argument(
        "--verbosity",
        "-v",
        type=str.upper,
        default="INFO",
        metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
        help="Controls the reported logging error level. Set to DEBUG when testing + adding new task configurations for comprehensive log output.",
    )
    parser.add_argument(
        "--wandb_args",
        type=str,
        default="",
        help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval`",
    )
    parser.add_argument(
        "--hf_hub_log_args",
        type=str,
        default="",
        help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`",
    )
    parser.add_argument(
        "--predict_only",
        "-x",
        action="store_true",
        default=False,
        help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
    )
    default_seed_string = "0,1234,1234,1234"
    parser.add_argument(
        "--seed",
        type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string),
        default=default_seed_string,  # for backward compatibility
        help=(
            "Set seed for python's random, numpy, torch, and fewshot sampling.\n"
            "Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, "
            "respectively, or a single integer to set the same seed for all four.\n"
            f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` "
            "(for backward compatibility).\n"
            "E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. "
            "Here numpy's seed is not set since the second value is `None`.\n"
            "E.g, `--seed 42` sets all four seeds to 42."
        ),
    )
    parser.add_argument(
        "--trust_remote_code",
        action="store_true",
        help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
    )
    return parser


def parse_eval_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
    check_argument_types(parser)
    return parser.parse_args()


def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
    if not args:
        # we allow for args to be passed externally, else we parse them ourselves
        parser = setup_parser()
        args = parse_eval_args(parser)

    if args.wandb_args:
        wandb_logger = WandbLogger(**simple_parse_args_string(args.wandb_args))

    eval_logger = utils.eval_logger
    eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
    eval_logger.info(f"Verbosity set to {args.verbosity}")
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # update the evaluation tracker args with the output path and the HF token
    args.hf_hub_log_args = f"output_path={args.output_path},token={os.environ.get('HF_TOKEN')},{args.hf_hub_log_args}"
    evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args)
    evaluation_tracker = EvaluationTracker(**evaluation_tracker_args)
    evaluation_tracker.general_config_tracker.log_experiment_args(
        model_source=args.model,
        model_args=args.model_args,
    )

    if args.predict_only:
        args.log_samples = True
    if (args.log_samples or args.predict_only) and not args.output_path:
        raise ValueError(
            "Specify --output_path if providing --log_samples or --predict_only"
        )

    if args.include_path is not None:
        eval_logger.info(f"Including path: {args.include_path}")
    task_manager = TaskManager(args.verbosity, include_path=args.include_path)

    if (
        "push_results_to_hub" in evaluation_tracker_args
        or "push_samples_to_hub" in evaluation_tracker_args
    ) and "hub_results_org" not in evaluation_tracker_args:
        raise ValueError(
            "If push_results_to_hub or push_samples_to_hub is set, hub_results_org must be specified."
        )
    if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
        eval_logger.warning(
            "Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub."
        )

    if args.limit:
        eval_logger.warning(
            " --limit SHOULD ONLY BE USED FOR TESTING. "
            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if args.tasks is None:
        eval_logger.error("Need to specify task to evaluate.")
        sys.exit()
    elif args.tasks == "list":
        eval_logger.info(
            "Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks))
        )
        sys.exit()
    else:
        if os.path.isdir(args.tasks):
            import glob

            task_names = []
            yaml_path = os.path.join(args.tasks, "*.yaml")
            for yaml_file in glob.glob(yaml_path):
                config = utils.load_yaml_config(yaml_file)
                task_names.append(config)
        else:
            task_list = args.tasks.split(",")
            task_names = task_manager.match_tasks(task_list)
            for task in [task for task in task_list if task not in task_names]:
                if os.path.isfile(task):
                    config = utils.load_yaml_config(task)
                    task_names.append(config)
            task_missing = [
                task for task in task_list if task not in task_names and "*" not in task
            ]  # we don't want errors if a wildcard ("*") task name was used

            if task_missing:
                missing = ", ".join(task_missing)
                eval_logger.error(
                    f"Tasks were not found: {missing}\n"
                    f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
                )
                raise ValueError(
                    f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues."
                )

    # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
    if args.trust_remote_code:
        os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = str(args.trust_remote_code)
        args.model_args = (
            args.model_args
            + f",trust_remote_code={os.environ['HF_DATASETS_TRUST_REMOTE_CODE']}"
        )

    eval_logger.info(f"Selected Tasks: {task_names}")

    request_caching_args = request_caching_arg_to_dict(
        cache_requests=args.cache_requests
    )

    results = evaluator.simple_evaluate(
        model=args.model,
        model_args=args.model_args,
        tasks=task_names,
        num_fewshot=args.num_fewshot,
        batch_size=args.batch_size,
        max_batch_size=args.max_batch_size,
        device=args.device,
        use_cache=args.use_cache,
        limit=args.limit,
        check_integrity=args.check_integrity,
        write_out=args.write_out,
        log_samples=args.log_samples,
        gen_kwargs=args.gen_kwargs,
        task_manager=task_manager,
        verbosity=args.verbosity,
        predict_only=args.predict_only,
        random_seed=args.seed[0],
        numpy_random_seed=args.seed[1],
        torch_random_seed=args.seed[2],
        fewshot_random_seed=args.seed[3],
        **request_caching_args,
    )

    if results is not None:
        if args.log_samples:
            samples = results.pop("samples")
        dumped = json.dumps(
            results, indent=2, default=handle_non_serializable, ensure_ascii=False
        )
        if args.show_config:
            print(dumped)

        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))

        # Add W&B logging
        if args.wandb_args:
            try:
                wandb_logger.post_init(results)
                wandb_logger.log_eval_result()
                if args.log_samples:
                    wandb_logger.log_eval_samples(samples)
            except Exception as e:
                eval_logger.info(f"Logging to Weights and Biases failed due to {e}")

        evaluation_tracker.save_results_aggregated(
            results=results, samples=samples if args.log_samples else None
        )

        if args.log_samples:
            for task_name, config in results["configs"].items():
                evaluation_tracker.save_results_samples(
                    task_name=task_name, samples=samples[task_name]
                )

        print(
            f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
            f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
        )
        print(make_table(results))
        if "groups" in results:
            print(make_table(results, "groups"))

    if args.wandb_args:
        # Tear down wandb run once all the logging is done.
        wandb_logger.run.finish()


if __name__ == "__main__":
    cli_evaluate()