Unverified commit a2af2101, authored by Yen-Ting Lin, committed by GitHub

Merge branch 'EleutherAI:main' into main

parents 82cb25c1 d5f39bf8
@@ -20,13 +20,13 @@ jobs:
        with:
          fetch-depth: 2  # OR "2" -> To retrieve the preceding commit.
-       # Uses the tj-actions/changed-files@v37 action to check for changes.
+       # Uses the tj-actions/changed-files action to check for changes.
        # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
        # The `files_yaml` input optionally takes a yaml string to specify filters,
        # and prepends the filter name to the standard output names.
      - name: Check task folders
        id: changed-tasks
-       uses: tj-actions/changed-files@v37.1.2
+       uses: tj-actions/changed-files@v44.5.2
        with:
          # tasks checks the tasks folder and api checks the api folder for changes
          files_yaml: |
@@ -56,7 +56,7 @@ jobs:
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
          python -m pip install --upgrade pip
-         pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu
+         pip install -e '.[dev,ifeval]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
...
@@ -32,7 +32,7 @@ jobs:
        env:
          SKIP: "no-commit-to-branch,mypy"
-       uses: pre-commit/action@v3.0.0
+       uses: pre-commit/action@v3.0.1
      # # mypy turned off for now
      # - name: Lint with mypy
      #   run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
@@ -56,12 +56,37 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-         pip install -e '.[dev,anthropic,sentencepiece,optimum,deepsparse,sparseml]' --extra-index-url https://download.pytorch.org/whl/cpu
+         pip install -e '.[dev,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
-       run: python -m pytest --showlocals -s -vv -n=auto
+       run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py
+     - name: Archive artifacts
+       uses: actions/upload-artifact@v3
+       with:
+         name: output_results
+         path: |
+           test_logs/*
+ testmodels:
+   name: External LM Tests
+   runs-on: ubuntu-latest
+   timeout-minutes: 30
+   steps:
+     - name: Checkout Code
+       uses: actions/checkout@v4
+     - name: Set up Python 3.8
+       uses: actions/setup-python@v5
+       with:
+         python-version: 3.8
+         cache: pip
+         cache-dependency-path: pyproject.toml
+     - name: Install dependencies
+       run: |
+         python -m pip install --upgrade pip
+         pip install -e '.[dev,optimum,deepsparse,sparseml]' --extra-index-url https://download.pytorch.org/whl/cpu
+     - name: Test with pytest
+       run: python -m pytest tests/models --showlocals -s -vv
      - name: Archive artifacts
        uses: actions/upload-artifact@v3
        with:
...
@@ -10,6 +10,7 @@ repos:
      - id: check-case-conflict
      - id: check-json
      - id: check-merge-conflict
+       args: [--assume-in-merge]
      - id: check-symlinks
      - id: check-yaml
        args: ["--unsafe"]
@@ -28,8 +29,7 @@ repos:
      - id: mixed-line-ending
        args: [--fix=lf]
  - repo: https://github.com/astral-sh/ruff-pre-commit
-   # Ruff version.
-   rev: v0.2.2
+   rev: v0.4.8
    hooks:
      # Run the linter.
      - id: ruff
@@ -38,7 +38,7 @@ repos:
      # Run the formatter.
      - id: ruff-format
  - repo: https://github.com/codespell-project/codespell
-   rev: v2.2.6
+   rev: v2.3.0
    hooks:
      - id: codespell
        exclude: >
@@ -46,9 +46,9 @@ repos:
          .*\.json|ignore.txt|lm_eval/tasks/.*|.*yaml|.*\.ipynb
        )$
        args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
- - repo: https://github.com/pre-commit/mirrors-mypy
-   rev: v1.5.1
-   hooks:
-     - id: mypy
-       additional_dependencies: [".[sentencepiece,multilingual,promptsource,gptq]", "types-PyYAML", "types-requests"]
-       exclude: ^tests/.*$
+ # - repo: https://github.com/pre-commit/mirrors-mypy
+ #   rev: v1.5.1
+ #   hooks:
+ #     - id: mypy
+ #       additional_dependencies: [".[sentencepiece,multilingual,promptsource,gptq]", "types-PyYAML", "types-requests"]
+ #       exclude: ^tests/.*$
@@ -7,6 +7,7 @@
 New updates and features include:
+- **New Open LLM Leaderboard tasks have been added! You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group.**
 - Internal refactoring
 - Config-based task creation and configuration
 - Easier import and sharing of externally-defined task config YAMLs
@@ -49,6 +50,11 @@ pip install -e .
 We also provide a number of optional dependencies for extended functionality. A detailed table is available at the end of this document.
 ## Basic Usage
+### User Guide
+A user guide detailing the full list of supported arguments is provided [here](./docs/interface.md), and on the terminal by calling `lm_eval -h`. Alternatively, you can use `lm-eval` instead of `lm_eval`.
+A list of supported tasks (or groupings of tasks) can be viewed with `lm-eval --tasks list`. Task descriptions and links to corresponding subfolders are provided [here](./lm_eval/tasks/README.md).
 ### Hugging Face `transformers`
@@ -84,8 +90,6 @@ lm_eval --model hf \
     --batch_size auto:4
 ```
-The full list of supported arguments are provided [here](./docs/interface.md), and on the terminal by calling `lm_eval -h`. Alternatively, you can use `lm-eval` instead of `lm_eval`. A list of supported tasks can be viewed with `lm-eval --tasks list`.
 > [!Note]
 > Just like you can provide a local path to `transformers.AutoModel`, you can also provide a local path to `lm_eval` via `--model_args pretrained=/path/to/model`
@@ -307,7 +311,7 @@ To save evaluation results provide an `--output_path`. We also support logging m
 Additionally, one can provide a directory with `--use_cache` to cache the results of prior runs. This allows you to avoid repeated execution of the same (model, task) pairs for re-scoring.
-To push results and samples to the Hugging Face Hub, first ensure an access token with write access is set in the `HF_TOKEN` environment variable. Then, use the --hf_hub_log_args flag to specify the organization, repository name, repository visibility, and whether to push results and samples to the Hub. For example:
+To push results and samples to the Hugging Face Hub, first ensure an access token with write access is set in the `HF_TOKEN` environment variable. Then, use the `--hf_hub_log_args` flag to specify the organization, repository name, repository visibility, and whether to push results and samples to the Hub - [example dataset on the HF Hub](https://huggingface.co/datasets/KonradSzafer/lm-eval-results-demo). For instance:
 ```bash
 lm_eval --model hf \
@@ -318,6 +322,13 @@ lm_eval --model hf \
     --hf_hub_log_args hub_results_org=EleutherAI,hub_repo_name=lm-eval-results,push_results_to_hub=True,push_samples_to_hub=True,public_repo=False \
 ```
+This allows you to easily download the results and samples from the Hub, using:
+```python
+from datasets import load_dataset
+load_dataset("EleutherAI/lm-eval-results-private", "hellaswag", "latest")
+```
 For a full list of supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md) guide in our documentation!
 ## Visualizing Results
...
@@ -4,7 +4,7 @@ Welcome to the docs for the LM Evaluation Harness!
 ## Table of Contents
-* To learn about the public interface of the library, as well as how to evaluate via the commandline or as integrated into an external library, see the [Interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/interface.md)
-* To learn how to add a new library, API, or model type to the library, as well as a quick explainer on the types of ways to evaluate an LM, see the [Model Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/model_guide.md).
-* For a crash course on adding new tasks to the library, see our [New Task Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md).
-* To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Task Configuration Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/task_guide.md).
+* To learn about the public interface of the library, as well as how to evaluate via the commandline or as integrated into an external library, see the [Interface](./interface.md)
+* To learn how to add a new library, API, or model type to the library, as well as a quick explainer on the types of ways to evaluate an LM, see the [Model Guide](./model_guide.md).
+* For a crash course on adding new tasks to the library, see our [New Task Guide](./new_task_guide.md).
+* To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Task Configuration Guide](./task_guide.md).
@@ -10,7 +10,7 @@ Equivalently, running the library can be done via the `lm-eval` entrypoint at th
 This mode supports a number of command-line arguments, the details of which can also be seen via running with `-h` or `--help`:
-- `--model` : Selects which model type or provider is evaluated. Must be a string corresponding to the name of the model type/provider being used. See [the main README](https://github.com/EleutherAI/lm-evaluation-harness/tree/main#commercial-apis) for a full list of enabled model names and supported libraries or APIs.
+- `--model` : Selects which model type or provider is evaluated. Must be a string corresponding to the name of the model type/provider being used. See [the main README](https://github.com/EleutherAI/lm-evaluation-harness/tree/main#model-apis-and-inference-servers) for a full list of enabled model names and supported libraries or APIs.
 - `--model_args` : Controls parameters passed to the model constructor. Accepts a string containing comma-separated keyword arguments to the model class of the format `"arg1=val1,arg2=val2,..."`, such as, for example, `--model_args pretrained=EleutherAI/pythia-160m,dtype=float32`. For a full list of supported keyword arguments, see the initialization of the `lm_eval.api.model.LM` subclass, e.g. [`HFLM`](https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/models/huggingface.py#L66)
@@ -42,13 +42,31 @@ This mode supports a number of command-line arguments, the details of which can
 - `--show_config` : If used, prints the full `lm_eval.api.task.TaskConfig` contents (non-default settings of the task YAML file) for each task which was run, at the completion of an evaluation. Useful for when one is modifying a task's configuration YAML locally to transmit the exact configurations used for debugging or for reproducibility purposes.
-- `--include_path` : Accepts a path to a folder. If passed, then all YAML files containing ` lm-eval`` compatible task configurations will be added to the task registry as available tasks. Used for when one is writing config files for their own task in a folder other than `lm_eval/tasks/`
+- `--include_path` : Accepts a path to a folder. If passed, then all YAML files containing `lm-eval` compatible task configurations will be added to the task registry as available tasks. Used for when one is writing config files for their own task in a folder other than `lm_eval/tasks/`.
+- `--system_instruction`: Specifies a system instruction string to prepend to the prompt.
+- `--apply_chat_template` : If this flag is on, a chat template will be applied to the prompt. For Hugging Face models, the chat template is taken from the tokenizer; if the tokenizer does not have a chat template, a default one is applied. For other models, chat templating is not currently implemented.
+- `--fewshot_as_multiturn` : If this flag is on, the fewshot examples are treated as a multi-turn conversation. Questions are provided as user content and answers are provided as assistant responses. Requires `--num_fewshot` to be greater than 0 and `--apply_chat_template` to be on.
 - `--predict_only`: Generates the model outputs without computing metrics. Use with `--log_samples` to retrieve decoded results.
 * `--seed`: Set seed for python's random, numpy and torch. Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, or a single integer to set the same seed for all three. The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234` (for backward compatibility). E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`. Here numpy's seed is not set since the second value is `None`. E.g, `--seed 42` sets all three seeds to 42.
-* `--wandb_args`: Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list (here.)[https://docs.wandb.ai/ref/python/init]. e.g., ```--wandb_args project=test-project,name=test-run```
+* `--wandb_args`: Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., ```--wandb_args project=test-project,name=test-run```
+* `--hf_hub_log_args` : Logs evaluation results to Hugging Face Hub. Accepts a string with the arguments separated by commas. Available arguments:
+  * `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`. If not provided, the results will be pushed to the owner of the Hugging Face token,
+  * `hub_repo_name` - repository name on Hugging Face Hub (deprecated, `details_repo_name` and `results_repo_name` should be used instead), e.g., `lm-eval-results`,
+  * `details_repo_name` - repository name on Hugging Face Hub to store details, e.g., `lm-eval-results`,
+  * `results_repo_name` - repository name on Hugging Face Hub to store results, e.g., `lm-eval-results`,
+  * `push_results_to_hub` - whether to push results to Hugging Face Hub, can be `True` or `False`,
+  * `push_samples_to_hub` - whether to push samples results to Hugging Face Hub, can be `True` or `False`. Requires `--log_samples` to be set,
+  * `public_repo` - whether the repository is public, can be `True` or `False`,
+  * `leaderboard_url` - URL to the leaderboard, e.g., `https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard`.
+  * `point_of_contact` - Point of contact for the results dataset, e.g., `yourname@example.com`.
+  * `gated` - whether to gate the details dataset, can be `True` or `False`.
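For illustration, a minimal sketch of an invocation that combines the chat-template flags above with Hub logging (the model, organization, and repository names below are placeholders, not defaults):

```bash
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks hellaswag \
    --num_fewshot 5 \
    --apply_chat_template \
    --fewshot_as_multiturn \
    --system_instruction "You are a helpful assistant." \
    --log_samples \
    --output_path results \
    --hf_hub_log_args hub_results_org=my-org,details_repo_name=lm-eval-results,push_results_to_hub=True,push_samples_to_hub=True,public_repo=False
```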
 ## External Library Usage
@@ -77,7 +95,7 @@ task_manager = lm_eval.tasks.TaskManager()
 # Setting `task_manager` to the one above is optional and should generally be done
 # if you want to include tasks from paths other than ones in `lm_eval/tasks`.
-# `simple_evaluate` will instantiate its own task_manager is the it is set to None here.
+# `simple_evaluate` will instantiate its own task_manager if it is set to None here.
 results = lm_eval.simple_evaluate( # call simple_evaluate
     model=lm_obj,
     tasks=["taskname1", "taskname2"],
@@ -87,12 +105,10 @@ results = lm_eval.simple_evaluate( # call simple_evaluate
 )
 ```
-See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L35 for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously.
+See the `simple_evaluate()` and `evaluate()` functions in [lm_eval/evaluator.py](../lm_eval/evaluator.py#:~:text=simple_evaluate) for a full description of all arguments available. All keyword arguments to `simple_evaluate` share the same role as the command-line flags described previously.
 Additionally, the `evaluate()` function offers the core evaluation functionality provided by the library, but without some of the special handling and simplification + abstraction provided by `simple_evaluate()`.
-See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L173 for more details.
 As a brief example usage of `evaluate()`:
 ```python
@@ -132,7 +148,7 @@ task_dict = lm_eval.tasks.get_task_dict(
     task_manager # A task manager that allows lm_eval to
                  # load the task during evaluation.
                  # If none is provided, `get_task_dict`
-                 # will instantiated one itself, but this
+                 # will instantiate one itself, but this
                  # only includes the stock tasks so users
                  # will need to set this if including
                  # custom paths is required.
...
@@ -6,7 +6,7 @@ In order to properly evaluate a given LM, we require implementation of a wrapper
 ## Setup
-To get started contributing, go ahead and fork the main repo, clone it, create a branch with the name of your task, and install the project requirements in your environment:
+To get started contributing, go ahead and fork the main repo, clone it, create a branch with the name of your model, and install the project requirements in your environment:
 ```sh
 # After forking...
@@ -107,6 +107,53 @@ Using this decorator results in the class being added to an accounting of the us
 We also recommend that new model contributions be accompanied by short tests of their 3 core functionalities, at minimum. To see an example of such tests, look at https://github.com/EleutherAI/lm-evaluation-harness/blob/35bdecd379c0cefad6897e67db892f4a6026a128/tests/test_ggml.py .
+## Chat Templating
+Many models are fine-tuned with a [Chat Template](https://huggingface.co/docs/transformers/main/en/chat_templating) in order to enable back-and-forth interaction between a "User"'s queries and the model's (often called "Assistant") responses. It can be desirable to evaluate fine-tuned models on evaluation tasks while wrapped in the conversational format they expect.
+In order to make your model optionally compatible with a chat format, three additional methods must be implemented:
+```python
+class MyCustomLM(LM):
+    #...
+    @property
+    def tokenizer_name(self) -> str:
+        # should return a string denoting the name of the model's tokenizer and/or the accompanying chat template.
+
+    @property
+    def chat_template(self) -> str:
+        # should return a chat template formatting string that is used to build prompt from a user/assistant chat history.
+        # this will be saved in the evaluation results for reproducibility.
+
+    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
+        # responsible for taking as input a chat history that would be fed into the model, and
+        # rendering it as a string that can be then tokenized and input into the model.
+    #...
+```
+- `apply_chat_template`
+    - This method performs the bulk of the work required for chat-formatting.
+    - As input, a `chat_history: List[Dict[str, str]]` is passed in. This is a transcript of a conversation of a form similar to
+```
+[
+    {"system": <user-provided system message such as "You are a helpful math-focused chatbot">},
+    {"user": <task example - a few-shot example 'input'>},
+    {"assistant": <correct response to the above example>},
+    # ... more few-shot examples, potentially
+    {"user": <test set query--response on which we will evaluate>},
+]
+```
+      which can then be converted into a string input.
+    - The output is a string representing this conversation that can be fed into the model.
+    - For example, this consists of simply calling `tokenizer.apply_chat_template` for HFLM--see the implementation there for reference.
+- `tokenizer_name`
+    - LM Eval Harness supports [caching requests](https://github.com/EleutherAI/lm-evaluation-harness/blob/4902aaaf1f374682f95ac25fe2e13b23faddc91a/lm_eval/__main__.py#L140) that are sent to a model, for faster setup when repeating an already-performed evaluation.
+    - However, we don't want to use the cache of chat transcripts rendered using one chat template or system prompt to send to a model with a different template! So, we use this `lm.tokenizer_name` string to distinguish caches for a given model (and chat template) from one another.
+- `chat_template`
+    - Chat templates are typically provided as a Jinja template string or a string formatted with str.format to include user and assistant messages in a single prompt. This template string is saved in the evaluation results to ensure reproducibility.
+If not implemented for a given model type, the flags `--apply_chat_template`, `--fewshot_as_multiturn`, and `--system_instruction` cannot be used.
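As a rough sketch of how these three hooks might be implemented on top of a Hugging Face tokenizer (this assumes a `self.tokenizer` attribute and is illustrative, not the actual `HFLM` implementation):

```python
from typing import Dict, List

from lm_eval.api.model import LM


class MyCustomLM(LM):
    # ... loglikelihood / generate_until implementations elided ...

    @property
    def tokenizer_name(self) -> str:
        # Used to keep caches built with different tokenizers/templates separate.
        return self.tokenizer.name_or_path.replace("/", "__")

    @property
    def chat_template(self) -> str:
        # Saved into the evaluation results for reproducibility.
        return self.tokenizer.chat_template or ""

    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
        # Render the conversation into the single prompt string the model expects.
        return self.tokenizer.apply_chat_template(
            chat_history, tokenize=False, add_generation_prompt=True
        )
```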
 ## Other
 **Pro tip**: In order to make the Evaluation Harness overestimate total runtimes rather than underestimate it, HuggingFace models come in-built with the ability to provide responses on data points in *descending order by total input length* via `lm_eval.utils.Reorderer`. Take a look at `lm_eval.models.hf_causal.HFLM` to see how this is done, and see if you can implement it in your own model!
...
@@ -35,7 +35,7 @@ and rename the folders and YAML file(s) as desired.
 ### Selecting and configuring a dataset
-All data downloading and management is handled through the HuggingFace (**HF**) [`datasets`](https://github.com/huggingface/datasets) API. So, the first thing you should do is check to see if your task's dataset is already provided in their catalog [here](https://huggingface.co/datasets). If it's not in there, please consider adding it to their Hub to make it accessible to a wider user base by following their [new dataset guide](https://github.com/huggingface/datasets/blob/master/ADD_NEW_DATASET.md).
+All data downloading and management is handled through the HuggingFace (**HF**) [`datasets`](https://github.com/huggingface/datasets) API. So, the first thing you should do is check to see if your task's dataset is already provided in their catalog [here](https://huggingface.co/datasets). If it's not in there, please consider adding it to their Hub to make it accessible to a wider user base by following their [new dataset guide](https://github.com/huggingface/datasets/blob/main/ADD_NEW_DATASET.md).
 Once you have a HuggingFace dataset prepared for your task, we want to assign our new YAML to use this dataset:
@@ -59,7 +59,25 @@ We can also specify from which split the task should retrieve few-shot examples
 ```yaml
 fewshot_split: <split name to draw fewshot examples from, or `null`>
 ```
-though if this is not set, we will default to train/validation/test sets, in that order.
+or by hardcoding them, either by using the following in the yaml file:
+```yaml
+fewshot_config:
+  sampler: first_n
+  samples: [
+    {<sample 1>},
+    {<sample 2>},
+  ]
+```
+or by adding the function `list_fewshot_samples` in the associated utils.py file:
+```python
+def list_fewshot_samples() -> list[dict]:
+    return [{<sample 1>}, {<sample 2>}]
+```
+See `lm_eval/tasks/minerva_math/minerva_math_algebra.yaml` for an example of the latter, and `lm_eval/tasks/gsm8k/gsm8k-cot.yaml` for an example of the former.
+In this case, each sample must contain the same fields as the samples in the above sets--for example, if `doc_to_text` expects an `input` field when rendering input prompts, these provided samples must include an `input` key.
+If neither of the above options is set, we will default to train/validation/test sets, in that order.
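For instance, a hypothetical arithmetic task whose `doc_to_text` reads an `input` field and whose `doc_to_target` reads a `target` field might hardcode its few-shot examples like this (the field names are illustrative):

```yaml
fewshot_config:
  sampler: first_n
  samples:
    - input: "2 + 2 ="
      target: "4"
    - input: "3 + 5 ="
      target: "8"
```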
 Finally, our dataset may not be already in the exact format we want. Maybe we have to strip whitespace and special characters via a regex from our dataset's "question" field! Or maybe we just want to rename its columns to match a convention we'll be using for our prompts.
@@ -172,7 +190,7 @@ doc_to_target: "{{answer}}"
 ```
-**Important**: we now add `target_delimiter` between input and target which defaults to " ", such that the full input-output string is `doc_to_target(doc) + target_delimiter + doc_to_text(doc)`. doc_to_text and doc_to_target should not contain trailing right or left whitespace, respectively.
+**Important**: we now add `target_delimiter` between input and target, which defaults to `" "`, such that the full input-output string is `doc_to_text(doc) + target_delimiter + doc_to_target(doc)`. `doc_to_text` and `doc_to_target` should not contain trailing right or left whitespace, respectively.
 #### Multiple choice format
@@ -213,7 +231,7 @@ def wikitext_detokenizer(doc):
     return string
 ```
-We can load this function in `doc_to_target` by using a `!function` operator after `doc_to_target` and followed by `<file name>.<function name>`. In the file [wikitext.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/6ae376e3a43caa58b95bb8aa73054a94827bf560/lm_eval/tasks/wikitext/wikitext.yaml) we write:
+We can load this function in `doc_to_target` by using a `!function` operator after `doc_to_target` and followed by `<file name>.<function name>`. In the file [wikitext.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/wikitext/wikitext.yaml) we write:
 ```
 doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
 ```
@@ -267,7 +285,7 @@ As a heuristic check:
 For more detail on the task system and advanced features, see [`docs/task_guide.md`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md). If none of the above sound like they apply to your task, it's time to continue onto checking your task performance!
-### Task name + groups (registering a task)
+### Task name + tags (registering a task)
 To test a task conveniently, it helps to *register* the task--that is, to give it a name and make the `lm-eval` library aware it exists!
@@ -278,14 +296,14 @@ task: <name of the task>
 ```
 Including a task name is mandatory.
-It is often also convenient to label your task with several `groups`, or tags, though this field is optional:
+It is often also convenient to label your task with several `tag` values, though this field is optional:
 ```yaml
-group:
-  - group1
-  - group2
+tag:
+  - tag1
+  - tag2
 ```
-This will add your task to the `group1` and `group2` groups, enabling people to know how to categorize your task, and if desired run all tasks in one of these groups at once, your task along with them.
+This will add your task to the `tag1` and `tag2` tags, enabling people to know how to categorize your task and, if desired, run all tasks carrying one of these tags at once, your task along with them.
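Once registered, all tasks carrying one of these tags can then be run together; a minimal sketch, assuming a task tagged `tag1` exists:

```bash
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks tag1
```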
 If your task is not in the `lm_eval/tasks` folder, you'll need to tell the Eval Harness where to look for YAML files.
@@ -301,7 +319,48 @@ Passing `--tasks /path/to/yaml/file` is also accepted.
 ### Advanced Group Configs
-You can make more complete group config while also tailoring parameters for individual tasks.
+While `tag` values are helpful when you want to be able to quickly and conveniently run a set of related tasks via `--tasks my_tag_name`, often we wish to implement more complex logic. For example, the MMLU benchmark contains 57 *subtasks* that must all be *averaged* together in order to report a final 'MMLU score'.
+Groupings of tasks might also use particular variants of a task--for example, we might want to default to evaluating a task as 5-shot when called as part of a given grouping, but not have a preference for number of shots when evaluating it as a standalone.
+We implement this via **groups**, which are distinct from tags. Groups can be implemented via *group config* YAML files, which are laid out similarly, but slightly differently, to tasks' YAML configs.
+The most basic form of group can be defined via a YAML config similar to the following:
+```yaml
+group: nli_tasks
+task:
+  - cb
+  - anli_r1
+  - rte
+metadata:
+  version: 1.0
+```
+This will behave almost identically to a `tag` that includes these 3 tasks, but with one key distinction: we'll print the `nli_tasks` group as a row (with no associated metrics) in our table of outputs, and visually show that these 3 tasks appear under its subheader.
+Now, let's assume we actually want to report an aggregate score for `nli_tasks`. We would instead use a YAML config like the following:
+```yaml
+group: nli_tasks
+task:
+  - cb
+  - anli_r1
+  - rte
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: true # defaults to `true`. Set this to `false` to do a "macro" average (take each subtask's average accuracy, sum those, and divide by 3)--by default we do a "micro" average (retain all subtasks' per-document accuracies, and take the mean over all documents' accuracies to get our aggregate mean).
+metadata:
+  version: 1.0
+```
+Similar to our `metric_list` for listing out the metrics we want to calculate for a given task, we use an `aggregate_metric_list` field to specify which metric name to aggregate across subtasks, what aggregation function to use, and whether we should micro- or macro- average these metrics. See [./task_guide.md](./task_guide.md) for a full list of related sub-keys.
+**[!Tip]: currently, we predominantly only support the aggregation of group metrics that use `mean` (either micro- or macro- averaged) over their subtasks. If you require even more complex aggregation rules, you may want to perform aggregation offline.**
+Group configs can be fairly complex! We can do various operations, such as defining new subtask(s) inline in our group YAML, overriding an existing task's specific config value, or nesting existing groups within our new group.
@@ -313,33 +372,13 @@ task:
   - cb
   - anli_r1
   - rte
+ aggregate_metric_list:
+   - metric: acc
+     aggregation: mean
+     higher_is_better: true
  - task: mmlu
    num_fewshot: 2
 ```
-It's also important to note how you can basically insert a group config as a task. Here, to make a group of natural language inference tasks, you simply write like how you would normally write a group config but this time place that as part of a task list under the main group being built.
-### Duplicate Tasks in Group Configs
-There might be cases where you might want to evaluate prompts and how models perform over prompt variations. You can list an existing task (in the example below, `anli_r1`) with varying `doc_to_text` implementations. To differentiate each variation, we can utilize `task_alias`. LM-Eval will recognize that there are multiple variations of the same task and differentiate them.
-```yaml
-group: flan_held_in
-group_alias: Flan (Held-In)
-task:
-  # ANLI R1
-  - group: anli_r1_flan
-    group_alias: ANLI R1
-    task:
-      - task: anli_r1
-        task_alias: prompt-0
-        include: _held_in_template_yaml
-        doc_to_text: "{{premise}}\n\nChoose your answer ..."
-        ...
-      - task: anli_r1
-        task_alias: prompt-1
-        include: _held_in_template_yaml
-        doc_to_text: "{{premise}}\n\nBased on ..."
-        ...
-```
 ### Configuring python classes
@@ -364,23 +403,29 @@ task:
   ...
 ```
+You can also pass a custom argument to your class by accepting `config` in the custom class constructor.
+Here's how to do it:
+```yaml
+task: 20_newsgroups
+class: !function task.Unitxt
+recipe: card=cards.20_newsgroups,template=templates.classification.multi_class.title
+```
+In this example, `recipe` is the custom argument for the `Unitxt` class.
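On the consuming side, a custom class might read such extra keys from its config; a minimal sketch, assuming a constructor that accepts the parsed YAML as a `config` dict (this is illustrative, not the actual Unitxt integration):

```python
from lm_eval.api.task import ConfigurableTask


class Unitxt(ConfigurableTask):
    def __init__(self, config: dict = None):
        super().__init__(config=config)
        # Any non-standard key in the task YAML (here, `recipe`)
        # arrives through the config dict.
        self.recipe = (config or {}).get("recipe")
```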
 ## Beautifying Table Display
-To avoid conflict, each task needs to be registered with a unique name. Because of this, slight variations of task are still counted as unique tasks and need to be named uniquely. This could be done by appending an additional naming that may refer to the variation such as in MMLU where the template used to evaluated for flan are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation especially when you have a long list of tasks or are using a benchmark that comprises of many tasks. To make it more legible, you can use `task_alias` and `group_alias` to provide an alternative task name and group name that will be printed.
-``
-for example in `mmlu_abstract_algebra.yaml` we set `group_alias` to `stem` and `task_alias` to `abstract_algebra`.
+To avoid conflicts, each task needs to be registered with a unique name. Because of this, slight variations of a task are still counted as unique tasks and need unique names. This can be done by appending a qualifier that refers to the variation, as in MMLU, where the tasks evaluated with the Flan templates are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation, especially when you have a long list of tasks or are using a benchmark that comprises many tasks. To make the table more legible, you can use `task_alias` and `group_alias` to provide alternative task and group names to be printed. For example, in `mmlu_abstract_algebra.yaml` we set `task_alias` to `abstract_algebra`. In group configs, a `group_alias` for a group can also be set.
 ```
 "dataset_name": "abstract_algebra"
 "description": "The following are multiple choice questions (with answers) about abstract\
 \ algebra.\n\n"
-"group": "mmlu_stem"
-"group_alias": "stem"
 "include": "_default_template_yaml"
 "task": "mmlu_abstract_algebra"
 "task_alias": "abstract_algebra"
 ```
-Note: Even though `group` can be a list, for now, `group_alias` can only be a single string.
 ## Checking validity
@@ -400,9 +445,9 @@ a simple eye test.
 ## Versioning
-One key feature in LM Evaluation Harness is the ability to version tasks--that is, mark them with a specific version number that can be bumped whenever a breaking change is made.
+One key feature in LM Evaluation Harness is the ability to version tasks and groups--that is, mark them with a specific version number that can be bumped whenever a breaking change is made.
-This version info can be provided by adding the following to your new task config file:
+This version info can be provided by adding the following to your new task or group config file:
 ```
 metadata:
@@ -440,6 +485,8 @@ If other tasks on this dataset are already supported:
 It is recommended to include a filled-out copy of this checklist in the README.md for the subfolder you are creating, if you have created a new subfolder in `lm_eval/tasks`.
+**Finally, please add a short description of your task(s), along with a link to its subfolder in lm_eval/tasks, to [`lm_eval/tasks/README.md`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/README.md) so that users can discover your task in the library, and follow the link to your README for more information about the variants supported, their task names, and the original source of the dataset and/or evaluation setup.**
 ## Submitting your task
 You're all set! Now push your work and make a pull request to the `main` branch! Thanks for the contribution :). If there are any questions, please leave a message in the `#lm-thunderdome` channel on the EAI discord!
@@ -16,7 +16,8 @@ Tasks are configured via the `TaskConfig` object. Below, we describe all fields
 Task naming + registration:
 - **task** (`str`, defaults to None) — name of the task.
-- **group** (`str`, *optional*) — name of the task group(s) a task belongs to. Enables one to run all tasks with a specified tag or group name at once.
+- **task_alias** (`str`, defaults to None) - Alias of the task name that will be printed in the final table results.
+- **tag** (`str`, *optional*) — name of the task tag(s) a task belongs to. Enables one to run all tasks with a specified tag name at once.
 Dataset configuration options:
 - **dataset_path** (`str`) — The name of the dataset as listed by HF in the datasets Hub.
@@ -31,8 +32,8 @@ Dataset configuration options:
 Prompting / in-context formatting options:
 - **use_prompt** (`str`, *optional*) — Name of prompt in promptsource to use. If defined, will overwrite doc_to_text, doc_to_target, and doc_to_choice.
 - **description** (`str`, *optional*) — An optional prepended Jinja2 template or string which will be prepended to the few-shot examples passed into the model, often describing the task or providing instructions to a model, such as `"The following are questions (with answers) about {{subject}}.\n\n"`. No delimiters or spacing are inserted between the description and the first few-shot example.
-- **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate input for the model
-- **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into
+- **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate input for the model.
+- **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into the answer choice list of the correct answer.
 - **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
 - **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples.
 - **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.
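To make these fields concrete, here is a minimal sketch of a task config that uses several of them (the task and dataset names are illustrative, and not every listed field is required):

```yaml
task: demo_boolq
dataset_path: super_glue
dataset_name: boolq
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
doc_to_target: label
doc_to_choice: ["no", "yes"]
target_delimiter: " "
fewshot_delimiter: "\n\n"
metric_list:
  - metric: acc
```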
@@ -55,8 +56,6 @@ Other:
 ## Filters
-Explain: What are filters? What is their place in the pipeline?
 A key component of the `lm-evaluation-harness` library is the `Filter` object. In a typical evaluation run of the harness, we take the formatted inputs and run them through our LM, with the appropriate output type (greedy or free-form generation, or loglikelihood-based comparative scoring).
 After getting scores or output text from our LM on each `Instance` or document in the dataset, we then need to feed these responses into a metric or scoring function to return scores to a user.
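For instance, a generative task can declare a filter pipeline in its YAML that post-processes the raw model output before scoring; a sketch modeled on the GSM8k config (the regex pattern is illustrative):

```yaml
filter_list:
  - name: "strict-match"
    filter:
      - function: "regex"
        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
      - function: "take_first"
```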
@@ -190,7 +189,7 @@ You can base a YAML on another YAML file as a template. This can be handy when y
 include: <YAML filename or with full path>
 ...
 ```
-You can find an example of how to use this feature at [gsm8k-cot-self-consistency.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/3c07cc04a92fc467d7c9a94894aeddd58c93a5da/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml) where it is based off [gsm8k-cot.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/3c07cc04a92fc467d7c9a94894aeddd58c93a5da/lm_eval/tasks/gsm8k/gsm8k-cot.yaml)
+You can find an example of how to use this feature at [gsm8k-cot-self-consistency.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml), where it is based off [gsm8k-cot.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k-cot.yaml).
 ## Passing Arguments to Metrics
@@ -295,105 +294,24 @@ Generative tasks:
 Tasks using complex filtering:
 - GSM8k with CoT (+ with Self-Consistency): (`lm_eval/tasks/gsm8k/gsm8k-cot.yaml` ; `lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml`)
-## Benchmarks
+# Group Configuration
 When evaluating a language model, it is not unusual to test across a number of tasks that may not be related to one another in order to assess a variety of capabilities. To this end, it may be cumbersome to have to list the set of tasks or add a new group name to each yaml of each individual task.
-To solve this, we can create a benchmark yaml config. This is a config that contains the names of the tasks that should be included in a particular benchmark. The config consists of two main keys `group` which denotes the name of the benchmark and `task` which is where we can list the tasks. The tasks listed in `task` are the task names that have been registered. A good example would be the list of tasks used to evaluate the Pythia Suite.
+To solve this, we can create a **group** yaml config. This is a config that contains the names of the tasks that should be included in a particular group. The config consists of two main keys: a `group` key which denotes the name of the group (as it would be called from the command line, e.g. `mmlu`) and a `task` key which is where we can list the tasks. The tasks listed in `task` are the task names that have been registered. A good example of a group yaml config can be found at [`_mmlu.yaml`](../lm_eval/tasks/mmlu/default/_mmlu.yaml). See also the [New Task Guide](./new_task_guide.md) for a more in-depth and tutorial-esque explanation of how to write complex GroupConfigs.
-```yaml
-group: pythia
-task:
-  - lambada_openai
-  - wikitext
-  - piqa
-  - sciq
-  - wsc
-  - winogrande
-  - arc
-  - logiqa
-  - blimp
-  - hendrycksTest*
-```
-It is also possible to list an existing task in your benchmark configuration with some adjustments. For example, a few tasks from mmlu are included in `multimedqa`. There, the `task_alias` and `group_alias` (see [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#beautifying-table-display) for more details) are modified to suit the benchmark.
-```yaml
-group: multimedqa
-task:
-  - pubmedqa
-  - medmcqa
-  - medqa_4options
-  - task: mmlu_anatomy
-    task_alias: "anatomy (mmlu)"
-    group_alias: null
-  - task: mmlu_clinical_knowledge
-    task_alias: "clinical_knowledge (mmlu)"
-    group_alias: null
-  ...
-```
-Alternatively, benchmarks can have tasks that are customizable for each task. They can be defined like how a yaml task is usually set.
-```yaml
+## Configurations
+Groups are configured via the `GroupConfig` object. Below, we describe all fields usable within the object, and their role in defining a group.
-group: t0_eval
-task:
-  # Coreference Resolution
-  - dataset_path: super_glue
-    dataset_name: wsc.fixed
-    use_prompt: promptsource:*
-    training_split: train
-    validation_split: validation
-    metric_list:
-      - metric: exact_match
-        aggregation: mean
-        higher_is_better: true
-        ignore_case: true
-        ignore_punctuation: true
-  # Coreference Resolution
-  - dataset_path: winogrande
-    dataset_name: winogrande_xl
-    use_prompt: promptsource:*
-    training_split: train
-    validation_split: validation
-    metric_list:
-      - metric: exact_match
-        aggregation: mean
-        higher_is_better: true
-        ignore_case: true
-        ignore_punctuation: true
-  ...
-```
-If the benchmark contains the same dataset but with different configurations, use `task` to differentiate between them. For example, T0-Eval evaluates on 3 versions of ANLI but the huggingface dataset collects them in one dataset.
+### Parameters
-```YAML
-group: t0_eval
-task:
-  ...
-  - task: anli_r1
-    dataset_path: anli
-    use_prompt: promptsource:*
-    training_split: train_r1
-    validation_split: dev_r1
-    metric_list:
-      - metric: exact_match
-        aggregation: mean
-        higher_is_better: true
-        ignore_case: true
-        ignore_punctuation: true
-  - task: anli_r2
-    dataset_path: anli
-    use_prompt: promptsource:*
-    training_split: train_r2
-    validation_split: dev_r2
-    metric_list:
-      - metric: exact_match
-        aggregation: mean
-        higher_is_better: true
-        ignore_case: true
-        ignore_punctuation: true
-```
Calling the group is done the same way we would call any task with `--tasks`. Such group configs can be added in `lm_eval/tasks/benchmarks/`.

## Configurations

Groups are configured via the `GroupConfig` object. Below, we describe all fields usable within the object, and their role in defining a group.

### Parameters

- **group** (`str`, defaults to `None`) - Name of the group. Used to invoke it from the command line.
- **group_alias** (`str`, defaults to `None`) - Alternative name for the group that will be printed in the table output.
- **task** (`Union[str, list]`, defaults to `None`) - List of tasks that constitute the group.
- **aggregate_metric_list** (`list`, defaults to `None`) - Similar to `metric_list` in TaskConfigs, provides a list of configurations for metrics that should be aggregated across subtasks. Leaving this empty will result in no aggregation being performed for the group. Keys for each list entry are as follows (see the example config after this list):
  - `metric: str` - The name of the metric to aggregate over (all subtasks must report a metric by this name).
  - `aggregation: str` - The aggregation function to apply to the per-subtask metrics. **Currently, only `mean` is supported.**
  - `weight_by_size: bool = False` - Whether to perform micro-averaging (`True`) or macro-averaging (`False`) of the subtasks' scores when reporting the group's metric. MMLU, for example, averages over per-document accuracies (the *micro average*), resulting in the same accuracy as if one simply concatenated all 57 subjects into a single dataset and evaluated accuracy on that dataset.
  - `filter_list: Union[str, List[str]] = "none"` - Which filter keys to match on when aggregating results. For example, to aggregate over the `exact_match` metric using the `strict-match` filter for `bbh_cot_zeroshot`, set `filter_list: "strict-match"`.
- **metadata** (`dict`, *optional*) - As with TaskConfigs, a field where extra config metadata can be passed. Set the `num_fewshot` key within this to override the printed n-shot value in a results table for your group, for example.
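For instance, a minimal sketch of a group config that aggregates an `acc` metric across its subtasks might look as follows (the group and task names here are hypothetical placeholders, not configs shipped with the harness):

```yaml
group: demo_knowledge
task:
  - demo_task_a
  - demo_task_b
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: true
    filter_list: "none"
metadata:
  version: 1.0
```

With `weight_by_size: true`, the reported group `acc` is the micro average over all subtask documents; setting it to `false` would instead average the subtasks' scores with equal weight.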
@@ -377,7 +377,7 @@
"id": "LOUHK7PtQfq4"
},
"source": [
"Often, tasks are part of a larger group used to measure different capabilities. The dynamism of the field today means new dimensions of evaluation can come about, mixing and matching new and older tasks alike. In LM-Eval, we can also group tasks and call that group name to easily evaluate a set of tasks. In this instance, let's evaluate the tag `yes_or_no_tasks`, which comprises the tasks `demo_boolq` and `demo_cola`; as the name suggests, these are multiple-choice tasks with options `yes` and `no`.\n",
"\n",
"<!-- making new groups is easier than ever, allowing users to work bottom-up by making individual tasks and linking them to a group, or top-down, making a new group by listing existing tasks.\n",
"\n",
@@ -395,7 +395,7 @@
"outputs": [],
"source": [
"YAML_cola_string = '''\n",
"tag: yes_or_no_tasks\n",
"task: demo_cola\n",
"dataset_path: glue\n",
"dataset_name: cola\n",
@@ -494,7 +494,6 @@
"outputs": [],
"source": [
"YAML_mmlu_geo_string = '''\n",
"task: demo_mmlu_high_school_geography\n",
"dataset_path: cais/mmlu\n",
"dataset_name: high_school_geography\n",
......
@@ -110,13 +110,15 @@
"cell_type": "markdown",
"id": "e974cabdbe70b667",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"id": "5178ca9445b844e4",
"metadata": {},
"source": [
"W&B can also be initialized programmatically for use outside the CLI to parse and log the results."
]
},
{
"cell_type": "code",
@@ -126,7 +128,7 @@
"outputs": [],
"source": [
"import lm_eval\n",
"from lm_eval.loggers import WandbLogger\n",
"\n",
"results = lm_eval.simple_evaluate(\n",
"    model=\"hf\",\n",
@@ -8,7 +8,7 @@ from typing import Union
from lm_eval import evaluator, utils
from lm_eval.evaluator import request_caching_arg_to_dict
from lm_eval.loggers import EvaluationTracker, WandbLogger
from lm_eval.tasks import TaskManager
from lm_eval.utils import handle_non_serializable, make_table, simple_parse_args_string
@@ -73,7 +73,7 @@ def setup_parser() -> argparse.ArgumentParser:
default=None,
type=str,
metavar="task1,task2",
help="Comma-separated list of task names or task groupings to evaluate on.\nTo get full list of tasks, use one of the commands `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above",
)
parser.add_argument(
"--model_args",
@@ -162,6 +162,24 @@ def setup_parser() -> argparse.ArgumentParser:
default=False,
help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
)
parser.add_argument(
"--system_instruction",
type=str,
default=None,
help="System instruction to be used in the prompt",
)
parser.add_argument(
"--apply_chat_template",
action="store_true",
default=False,
help="If True, applies the chat template to the prompt",
)
parser.add_argument(
"--fewshot_as_multiturn",
action="store_true",
default=False,
help="If True, uses the fewshot as a multi-turn conversation",
)
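# Illustrative CLI usage of the three arguments above (an assumed example, not taken from the repo):
#   lm-eval --model hf --tasks demo_task --num_fewshot 2 \
#       --system_instruction "You are a helpful assistant." \
#       --apply_chat_template --fewshot_as_multiturn
# Note: --fewshot_as_multiturn requires --apply_chat_template and a nonzero num_fewshot
# (enforced below in cli_evaluate).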
parser.add_argument(
"--show_config",
action="store_true",
@@ -219,7 +237,7 @@ def setup_parser() -> argparse.ArgumentParser:
help=(
"Set seed for python's random, numpy, torch, and fewshot sampling.\n"
"Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, "
"respectively, or a single integer to set the same seed for all four.\n"
f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` "
"(for backward compatibility).\n"
"E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. "
@@ -255,13 +273,12 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# update the evaluation tracker args with the output path and the HF token
if args.output_path:
args.hf_hub_log_args += f",output_path={args.output_path}"
if os.environ.get("HF_TOKEN", None):
args.hf_hub_log_args += f",token={os.environ.get('HF_TOKEN')}"
evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args)
evaluation_tracker = EvaluationTracker(**evaluation_tracker_args)

if args.predict_only:
args.log_samples = True
@@ -270,17 +287,22 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
"Specify --output_path if providing --log_samples or --predict_only"
)

if args.fewshot_as_multiturn and args.apply_chat_template is False:
raise ValueError(
"If fewshot_as_multiturn is set, apply_chat_template must be set to True."
)

if (
args.num_fewshot is None or args.num_fewshot == 0
) and args.fewshot_as_multiturn:
raise ValueError(
"If fewshot_as_multiturn is set, num_fewshot must be greater than 0."
)

if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}")
task_manager = TaskManager(args.verbosity, include_path=args.include_path)

if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
eval_logger.warning(
"Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub."
)
@@ -296,9 +318,16 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
eval_logger.error("Need to specify task to evaluate.")
sys.exit()
elif args.tasks == "list":
print(task_manager.list_all_tasks())
sys.exit()
elif args.tasks == "list_groups":
print(task_manager.list_all_tasks(list_subtasks=False, list_tags=False))
sys.exit()
elif args.tasks == "list_tags":
print(task_manager.list_all_tasks(list_groups=False, list_subtasks=False))
sys.exit()
elif args.tasks == "list_subtasks":
print(task_manager.list_all_tasks(list_groups=False, list_tags=False))
sys.exit()
else:
if os.path.isdir(args.tasks):
@@ -327,16 +356,22 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
)
raise ValueError(
f"Tasks not found: {missing}. Try `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above, or pass '--verbosity DEBUG' to troubleshoot task registration issues."
)

# Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
if args.trust_remote_code:
eval_logger.info(
"Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true`"
)
# HACK: import datasets and override its HF_DATASETS_TRUST_REMOTE_CODE value internally,
# because it's already been determined based on the prior env var before launching our
# script--`datasets` gets imported by lm_eval internally before these lines can update the env.
import datasets

datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True

args.model_args = args.model_args + ",trust_remote_code=True"

eval_logger.info(f"Selected Tasks: {task_names}")
@@ -357,6 +392,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
check_integrity=args.check_integrity,
write_out=args.write_out,
log_samples=args.log_samples,
evaluation_tracker=evaluation_tracker,
system_instruction=args.system_instruction,
apply_chat_template=args.apply_chat_template,
fewshot_as_multiturn=args.fewshot_as_multiturn,
gen_kwargs=args.gen_kwargs,
task_manager=task_manager,
verbosity=args.verbosity,
@@ -399,6 +438,12 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
task_name=task_name, samples=samples[task_name]
)

if (
evaluation_tracker.push_results_to_hub
or evaluation_tracker.push_samples_to_hub
):
evaluation_tracker.recreate_metadata_card()

print(
f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
......
import abc
from dataclasses import asdict, dataclass
from inspect import getsource
from typing import Any, Callable, List, Optional, Union
@dataclass
class AggMetricConfig(dict):
metric: Optional[str] = None
aggregation: Optional[str] = "mean"
weight_by_size: Optional[bool] = False
# list of filter names which should be incorporated into the aggregated metric.
filter_list: Optional[Union[str, list]] = "none"
def __post_init__(self):
if self.aggregation != "mean":
raise ValueError(
f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{self.aggregation}'."
)
if isinstance(self.filter_list, str):
self.filter_list = [self.filter_list]
@dataclass
class GroupConfig(dict):
group: Optional[str] = None
group_alias: Optional[str] = None
task: Optional[Union[str, list]] = None
aggregate_metric_list: Optional[
Union[List[AggMetricConfig], AggMetricConfig, dict]
] = None
metadata: Optional[dict] = (
None # by default, not used in the code. allows for users to pass arbitrary info to tasks
)
def __getitem__(self, item):
return getattr(self, item)
def __setitem__(self, item, value):
return setattr(self, item, value)
def __post_init__(self):
if self.aggregate_metric_list is not None:
if isinstance(self.aggregate_metric_list, dict):
self.aggregate_metric_list = [self.aggregate_metric_list]
self.aggregate_metric_list = [
AggMetricConfig(**item) if isinstance(item, dict) else item
for item in self.aggregate_metric_list
]
def to_dict(self, keep_callable: bool = False) -> dict:
"""dumps the current config as a dictionary object, as a printable format.
Used for dumping results alongside full task configuration

:return: dict
A printable dictionary version of the GroupConfig object.

# TODO: should any default value in the GroupConfig not be printed?
"""
cfg_dict = asdict(self)
# serialize callable values (e.g. functions) into a printable form
for k, v in list(cfg_dict.items()):
if callable(v):
cfg_dict[k] = self.serialize_function(v, keep_callable=keep_callable)
return cfg_dict
def serialize_function(
self, value: Union[Callable, str], keep_callable=False
) -> Union[Callable, str]:
"""Serializes a given function or string.
If 'keep_callable' is True, the original callable is returned.
Otherwise, attempts to return the source code of the callable using 'getsource'.
"""
if keep_callable:
return value
else:
try:
return getsource(value)
except (TypeError, OSError):
return str(value)
class ConfigurableGroup(abc.ABC):
def __init__(
self,
config: Optional[dict] = None,
) -> None:
self._config = GroupConfig(**config)
@property
def group(self):
return self._config.group
@property
def group_alias(self):
return self._config.group_alias
@property
def version(self):
# GroupConfig defines no explicit `version` field; fall back to None rather than raising
return getattr(self._config, "version", None)
@property
def config(self):
return self._config.to_dict()
@property
def group_name(self) -> Any:
return self._config.group
def __repr__(self):
return (
f"ConfigurableGroup(group={self.group}," f"group_alias={self.group_alias})"
)
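# Illustrative usage (an assumed, minimal sketch; the names are placeholders, not configs
# shipped with the harness): a ConfigurableGroup can be built from a dict mirroring a group
# yaml file, with `aggregate_metric_list` given as a dict that __post_init__ normalizes
# into a list of AggMetricConfig objects.
#
# demo = ConfigurableGroup(
#     config={
#         "group": "demo_group",
#         "task": ["demo_task_a", "demo_task_b"],
#         "aggregate_metric_list": {"metric": "acc", "weight_by_size": True},
#     }
# )
# print(demo)  # ConfigurableGroup(group=demo_group,group_alias=None)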
import logging
import math
import random
import re
import string
from collections.abc import Iterable
from typing import List

import numpy as np
import sacrebleu
import sklearn.metrics
@@ -119,9 +120,10 @@ def ter(items):
@register_aggregation("brier_score")
def brier_score(items):  # This is a passthrough function
gold, predictions = list(zip(*items))
bs, num_class = np.array(predictions).shape
gold = list(gold)
gold_one_hot = np.eye(num_class)[gold]
return np.mean(np.sum((predictions - gold_one_hot) ** 2, axis=1))
@@ -165,7 +167,60 @@ def acc_mutual_info_fn(items):  # This is a passthrough function
return items

### the code used in the `exact_match_hf_evaluate` function is ported from
### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py
### which is under the apache license.
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def exact_match_hf_evaluate(
predictions,
references,
regexes_to_ignore=None,
ignore_case=False,
ignore_punctuation=False,
ignore_numbers=False,
):
if regexes_to_ignore is not None:
for s in regexes_to_ignore:
predictions = np.array([re.sub(s, "", x) for x in predictions])
references = np.array([re.sub(s, "", x) for x in references])
else:
predictions = np.asarray(predictions)
references = np.asarray(references)
if ignore_case:
predictions = np.char.lower(predictions)
references = np.char.lower(references)
if ignore_punctuation:
repl_table = string.punctuation.maketrans("", "", string.punctuation)
predictions = np.char.translate(predictions, table=repl_table)
references = np.char.translate(references, table=repl_table)
if ignore_numbers:
repl_table = string.digits.maketrans("", "", string.digits)
predictions = np.char.translate(predictions, table=repl_table)
references = np.char.translate(references, table=repl_table)
score_list = predictions == references
return {"exact_match": np.mean(score_list)}
###
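# Illustrative check (an assumed example, not part of the ported code): with
# ignore_case=True and ignore_punctuation=True, "It's here." matches "its here", so
# exact_match_hf_evaluate(["It's here."], ["its here"], ignore_case=True, ignore_punctuation=True)
# returns {"exact_match": 1.0}.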
@register_metric(
@@ -175,7 +230,7 @@ exact_match = hf_evaluate.load("exact_match")
aggregation="mean",
)
def exact_match_fn(**kwargs):
return exact_match_hf_evaluate(**kwargs)
@register_metric(
@@ -428,7 +483,11 @@ def bootstrap_stderr(f, xs, iters):
return sample_stddev(res)

def stderr_for_metric(metric, bootstrap_iters: int):
if bootstrap_iters <= 0:
# return no function (don't compute stderr) if bootstrap iters = 0
return None

bootstrappable = [
median,
matthews_corrcoef,
......
@@ -3,7 +3,7 @@ import hashlib
import json
import logging
import os
from typing import Dict, List, Optional, Tuple, Type, TypeVar

import transformers
from sqlitedict import SqliteDict
@@ -114,6 +114,20 @@ class LM(abc.ABC):
"""
pass
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
"""
Defines how to transform few-shot examples provided as chat history into a format that can be used as input to the LM.
:param chat_history: list[dict[str, str]]
A list of dictionaries with keys 'role' and 'content'.
Values are strings representing the role name and the content of the message, respectively.
:return: str
A string representing the chat history in a format that can be used as input to the LM.
"""
raise NotImplementedError(
"To use this model with chat templates, please implement the 'apply_chat_template' method for your model type."
)
@classmethod
def create_from_arg_string(
cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
@@ -169,6 +183,26 @@ class LM(abc.ABC):
# not support multi-device parallelism nor expect it.
return self._world_size
@property
def tokenizer_name(self) -> str:
"""Must be defined for LM subclasses which implement Chat Templating.
Should return the name of the tokenizer or chat template used.
Used only to properly fingerprint caches when requests are being cached with `--cache_requests`, otherwise not used.
"""
raise NotImplementedError(
"To use this model with chat templates, please implement the 'tokenizer_name' property."
)
@property
def chat_template(self) -> str:
"""Must be defined for LM subclasses that implement Chat Templating.
Should return the structure of the chat template applied to user/assistant messages.
This is used only to save in the experiment results for reproducibility.
"""
raise NotImplementedError(
"To use this model with chat templates, please implement the 'chat_template' property."
)
def set_cache_hook(self, cache_hook) -> None:
self.cache_hook = cache_hook
@@ -212,9 +246,10 @@ class CachingLM:
# add hook to lm
lm.set_cache_hook(self.get_cache_hook())

def __getattr__(self, attr: str):
lm_attr = getattr(self.lm, attr)
if attr not in ["loglikelihood", "loglikelihood_rolling", "generate_until"]:
eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM")
return lm_attr

def fn(requests):
......
import datasets
class ContextSampler:
def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
self.rnd = rnd
@@ -18,6 +21,10 @@ class ContextSampler:
self.docs = docs  # HF dataset split, provided by task._fewshot_docs()
if fewshot_indices:  # subset few-shot docs from
if not isinstance(self.docs, datasets.Dataset):
raise ValueError(
"Got `fewshot_indices` but fewshot_docs are not a HF dataset. Don't use both `fewshot_indices` and a user-defined few-shot sample list simultaneously"
)
self.docs = self.docs.select(fewshot_indices)

def get_context(self, doc, num_fewshot):
@@ -35,37 +42,79 @@ class ContextSampler:
# TODO: should we just stop people from using fewshot from same split as evaluating?
selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]
labeled_examples = ""
for doc in selected_docs:
doc_content = self.doc_to_text(doc)
doc_target = self.doc_to_target(doc)
labeled_examples += (
doc_content
if self.config.doc_to_choice is None or isinstance(doc_content, str)
else self.doc_to_choice(doc)[doc_content]
)
labeled_examples += self.target_delimiter
labeled_examples += (
str(doc_target[0])
if isinstance(doc_target, list)
else str(doc_target)
if self.config.doc_to_choice is None or isinstance(doc_target, str)
else str(self.doc_to_choice(doc)[doc_target])
)
labeled_examples += self.fewshot_delimiter

return labeled_examples
def get_chat_context(
self,
doc,
num_fewshot,
fewshot_as_multiturn: bool = False,
):
chat_history = []
# draw an extra fewshot sample if using same split as evaluating on
n_samples = (
num_fewshot + 1
if self.config.fewshot_split == self.config.test_split
else num_fewshot
)
# draw `n_samples` docs from fewshot_docs
fewshotex = self.sample(n_samples)
# get rid of the doc that's the one we're evaluating, if it's in the fewshot
# TODO: should we just stop people from using fewshot from same split as evaluating?
selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]
if fewshot_as_multiturn:
for doc in selected_docs:
doc_content = self.doc_to_text(doc)
doc_target = self.doc_to_target(doc)
chat_history.append(
{
"role": "user",
"content": doc_content
if self.config.doc_to_choice is None
or isinstance(doc_content, str)
else self.doc_to_choice(doc)[doc_content],
}
)
chat_history.append(
{
"role": "assistant",
"content": str(doc_target[0])
if isinstance(doc_target, list)
else doc_target
if self.config.doc_to_choice is None
or isinstance(doc_target, str)
else str(self.doc_to_choice(doc)[doc_target]),
}
)
else:
# get fewshot context as one user turn
chat_history.append(
{"role": "user", "content": self.get_context(doc, num_fewshot)}
)
return chat_history
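# For example (illustrative, shapes assumed): with num_fewshot=1 and
# fewshot_as_multiturn=True, get_chat_context returns something like
# [{"role": "user", "content": "<question 1>"},
#  {"role": "assistant", "content": "<answer 1>"}],
# whereas with fewshot_as_multiturn=False the entire fewshot context is packed
# into a single user turn.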
def sample(self, n):
"""
Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
......
@@ -56,8 +56,8 @@ class TaskConfig(dict):
# task naming/registry
task: Optional[str] = None
task_alias: Optional[str] = None
tag: Optional[Union[str, list]] = None
group: Optional[Union[str, list]] = None
# HF dataset options.
# which dataset to use,
# and what splits for what purpose
@@ -67,9 +67,9 @@ class TaskConfig(dict):
training_split: Optional[str] = None
validation_split: Optional[str] = None
test_split: Optional[str] = None
fewshot_split: Optional[str] = (
None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
)
# formatting / prompting options.
# see docs/advanced_task_guide.md for more info
process_docs: Optional[Callable] = None
@@ -92,11 +92,23 @@ class TaskConfig(dict):
filter_list: Optional[Union[str, list]] = None
should_decontaminate: bool = False
doc_to_decontamination_query: Optional[str] = None
metadata: Optional[dict] = (
None  # by default, not used in the code. allows for users to pass arbitrary info to tasks
)

def __post_init__(self) -> None:
if self.group is not None:
eval_logger.warning(
"A task YAML file was found to contain a `group` key. Groups which provide aggregate scores over several subtasks now require a separate config file--if not aggregating, you may want to use the `tag` config option instead within your config. Setting `group` within a TaskConfig will be deprecated in v0.4.4. Please see https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md for more information."
)
if self.tag is None:
self.tag = self.group
else:
raise ValueError(
"Got both a `group` and `tag` entry within a TaskConfig. Please use one or the other--`group` values will be deprecated in v0.4.4."
)
if self.generation_kwargs is not None:
if self.output_type != "generate_until":
eval_logger.warning(
@@ -229,9 +241,9 @@ class Task(abc.ABC):
self._config: TaskConfig = TaskConfig({**config}) if config else TaskConfig()
self._filters = [build_filter_ensemble("none", [["take_first", None]])]
self.fewshot_rnd: Optional[random.Random] = (
None  # purposely induce errors in case of improper usage
)
def download(
self,
@@ -368,11 +380,16 @@ class Task(abc.ABC):
def build_all_requests(
self,
*,
limit: Union[int, None] = None,
rank: int = 0,
world_size: int = 1,
cache_requests: bool = False,
rewrite_requests_cache: bool = False,
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
chat_template: Optional[Callable] = None,
tokenizer_name: str = "",
) -> None:
"""Build a set of Instances for a task, and store them in task.instances"""
@@ -380,6 +397,14 @@ class Task(abc.ABC):
og_limit = limit

cache_key = f"requests-{self._config.task}-{self.config.num_fewshot}shot-rank{rank}-world_size{world_size}"
cache_key += "-chat_template" if apply_chat_template else ""
cache_key += "-fewshot_as_multiturn" if fewshot_as_multiturn else ""
cache_key += (
f"-system_prompt_hash{utils.hash_string(system_instruction)}"
if system_instruction is not None
else ""
)
cache_key += f"-tokenizer{tokenizer_name}"
cached_instances = load_from_cache(file_name=cache_key)
@@ -421,6 +446,10 @@ class Task(abc.ABC):
fewshot_ctx = self.fewshot_context(
doc,
0 if self.config.num_fewshot is None else self.config.num_fewshot,
system_instruction,
apply_chat_template,
fewshot_as_multiturn,
chat_template,
)

# TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
@@ -948,17 +977,58 @@ class ConfigurableTask(Task):
if self.config.process_docs is not None:
return self.config.process_docs(self.dataset[self.config.fewshot_split])
return self.dataset[self.config.fewshot_split]
elif (
self.config.fewshot_config is not None
and self.config.fewshot_config.get("samples", None) is not None
):
if isinstance(self.config.fewshot_config["samples"], list):
return self.config.fewshot_config["samples"]
elif callable(self.config.fewshot_config["samples"]):
return self.config.fewshot_config["samples"]()
else:
raise Exception(
"`fewshot_config['samples']` was incorrectly defined in the configuration. It should be either a list of sample dicts, or a function returning such a list."
)
else:
if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0):
eval_logger.warning(
f"[Task: {self.config.task}] "
"num_fewshot > 0 but fewshot_split is None. "
"using preconfigured rule."
)
return super().fewshot_docs()
@staticmethod
def append_target_question(
labeled_examples: List[Dict[str, str]],
question: str,
fewshot_as_multiturn: bool = False,
) -> None:
"""Adds a target question to the labeled examples list.
If fewshot_as_multiturn is True, or labeled_examples is empty, or the last entry is a system turn, appends the question as a new user entry.
Otherwise, it is appended to the last user entry, ensuring that the conversation alternates between the user and the assistant.
"""
if not fewshot_as_multiturn:
# if no messages or last message is system, append as new user entry
if len(labeled_examples) == 0 or labeled_examples[-1]["role"] == "system":
labeled_examples.append({"role": "user", "content": question})
# if last message is user, append to it to avoid two user messages in a row
else:
labeled_examples[-1]["content"] += question
else:
# if fewshot_as_multiturn is True, append as next user entry (last is always assistant)
labeled_examples.append({"role": "user", "content": question})
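# Illustrative behavior (assumed shapes): with fewshot_as_multiturn=False and
# labeled_examples == [{"role": "user", "content": "Q1 A1\n\n"}], the target question
# is concatenated onto that final user turn; with fewshot_as_multiturn=True it is
# appended as a fresh user message instead.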
@utils.positional_deprecated
def fewshot_context(
self,
doc: str,
num_fewshot: int,
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
chat_template: Optional[Callable] = None,
) -> str:
"""Returns a fewshot context string that is made up of a prepended description """Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example. (if provided), the `num_fewshot` number of examples, and an appended prompt example.
...@@ -966,22 +1036,90 @@ class ConfigurableTask(Task): ...@@ -966,22 +1036,90 @@ class ConfigurableTask(Task):
The document as returned from training_docs, validation_docs, or test_docs. The document as returned from training_docs, validation_docs, or test_docs.
:param num_fewshot: int :param num_fewshot: int
The number of fewshot examples to provide in the returned context string. The number of fewshot examples to provide in the returned context string.
:param system_instruction: str
System instruction to be applied to the prompt.
:param apply_chat_template: bool
Whether to apply the chat template to the fewshot context.
:param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param chat_template: Callable
Chat template to be applied to the fewshot context.
:returns: str
The fewshot context.
"""
if apply_chat_template:
labeled_examples = []
else:
labeled_examples = ""
# get task description
if description := self.config.description:
description = utils.apply_template(self.config.description, doc)

# create system prompt based on the provided system instruction and description
if system_instruction is not None and description:
system_prompt = (
f"{system_instruction}{self.sampler.fewshot_delimiter}{description}"
)
elif system_instruction is not None:
system_prompt = system_instruction
elif description:
system_prompt = description
else:
system_prompt = ""
# add system prompt if specified
if system_prompt:
if apply_chat_template:
labeled_examples.append({"role": "system", "content": system_prompt})
else:
labeled_examples = system_prompt
# if few-shot - append examples after the system prompt
if num_fewshot > 0:
if apply_chat_template:
labeled_examples.extend(
self.sampler.get_chat_context(
doc, num_fewshot, fewshot_as_multiturn
)
)
else:
labeled_examples += self.sampler.get_context(doc, num_fewshot)
example = self.doc_to_text(doc)
if apply_chat_template:
if self.multiple_input:
return chat_template(labeled_examples)
if isinstance(example, str):
self.append_target_question(
labeled_examples, example, fewshot_as_multiturn
)
# for loglikelihood create a list of questions with appended choices
elif isinstance(example, list):
labeled_examples_list = []
# copy chat history for each example and append the answer
for ex in example:
chat = deepcopy(labeled_examples)
self.append_target_question(chat, ex, fewshot_as_multiturn)
labeled_examples_list.append(chat_template(chat))
return labeled_examples_list
# if example is an integer, append the choice or convert to string
elif isinstance(example, int):
if self.config.doc_to_choice is not None:
choices = self.doc_to_choice(doc)
self.append_target_question(
labeled_examples, choices[example], fewshot_as_multiturn
)
else:
self.append_target_question(
labeled_examples, str(example), fewshot_as_multiturn
)
# return lm.apply_chat_template(labeled_examples)
return chat_template(labeled_examples)
else:
if self.multiple_input:
return labeled_examples
if isinstance(example, str):
return labeled_examples + example
elif isinstance(example, list):
@@ -1394,10 +1532,13 @@ class ConfigurableTask(Task):
def get_config(self, key: str) -> Any:
return getattr(self._config, key, None)
@property
def task_name(self) -> Any:
return getattr(self.config, "task", None)
def __repr__(self):
return (
f"ConfigurableTask(task_name={getattr(self.config, 'task', None)},"
f"output_type={self.OUTPUT_TYPE},"
f"num_fewshot={getattr(self.config, 'num_fewshot', None)},"
f"num_samples={len(self.eval_docs)})"
......
@@ -11,18 +11,25 @@ import torch
import lm_eval.api.metrics
import lm_eval.api.registry
import lm_eval.api.task
import lm_eval.models
from lm_eval.caching.cache import delete_cache
from lm_eval.evaluator_utils import (
consolidate_group_results,
consolidate_results,
get_sample_size,
get_subtask_list,
get_task_list,
prepare_print_tasks,
print_writeout,
run_task_tests,
)
from lm_eval.loggers import EvaluationTracker
from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash
from lm_eval.tasks import (
TaskManager,
get_task_dict,
)
from lm_eval.utils import (
eval_logger,
handle_non_serializable,
@@ -34,7 +41,7 @@ from lm_eval.utils import (
if TYPE_CHECKING:
from lm_eval.api.model import LM
from lm_eval.api.task import Task
@positional_deprecated
@@ -43,7 +50,7 @@ def simple_evaluate(
model_args: Optional[Union[str, dict]] = None,
tasks: Optional[List[Union[str, dict, object]]] = None,
num_fewshot: Optional[int] = None,
batch_size: Optional[Union[int, str]] = None,
max_batch_size: Optional[int] = None,
device: Optional[str] = None,
use_cache: Optional[str] = None,
@@ -55,6 +62,10 @@ def simple_evaluate(
check_integrity: bool = False,
write_out: bool = False,
log_samples: bool = True,
evaluation_tracker: Optional[EvaluationTracker] = None,
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
gen_kwargs: Optional[str] = None,
task_manager: Optional[TaskManager] = None,
verbosity: str = "INFO",
@@ -92,13 +103,19 @@ def simple_evaluate(
:param limit: int or float, optional
Limit the number of examples per task (only use this for testing). If <1, limit is a percentage of the total number of examples.
:param bootstrap_iters:
Number of iterations for bootstrap statistics, used when calculating stderrs. Set to 0 for no stderr calculations to be performed.
:param check_integrity: bool
Whether to run the relevant part of the test suite for the tasks
:param write_out: bool
If True, write out an example document and model input for checking task integrity
:param log_samples: bool
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
:param system_instruction: str
System instruction to be applied to the prompt
:param apply_chat_template: bool
If True, apply chat template to the prompt
:param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param gen_kwargs: str
String arguments for model generation
Ignored for all tasks with loglikelihood output_type
@@ -208,51 +225,74 @@ def simple_evaluate(
task_manager = TaskManager(verbosity)

task_dict = get_task_dict(tasks, task_manager)
# helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups.
# (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed)
def _adjust_config(task_dict):
adjusted_task_dict = {}
for task_name, task_obj in task_dict.items():
if isinstance(task_obj, dict):
adjusted_task_dict = {
**adjusted_task_dict,
**{task_name: _adjust_config(task_obj)},
}
else:
if task_obj.get_config("output_type") == "generate_until":
if gen_kwargs is not None:
task_obj.set_config(
key="generation_kwargs", value=gen_kwargs, update=True
)
if predict_only:
eval_logger.info(
f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
)
# we have to change the class properties post-hoc. This is pretty hacky.
task_obj.override_metric(metric_name="bypass")

# override tasks' fewshot values to the provided num_fewshot arg value
# except if tasks have it set to 0 manually in their configs--then we should never overwrite that
if num_fewshot is not None:
if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
eval_logger.info(
f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
)
else:
eval_logger.warning(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
)
task_obj.set_config(key="num_fewshot", value=num_fewshot)
else:
# if num_fewshot not provided, and the task does not define a default one, default to 0
if (
default_num_fewshot := task_obj.get_config("num_fewshot")
) is None:
task_obj.set_config(key="num_fewshot", value=0)
# fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file)
task_obj.set_fewshot_seed(seed=fewshot_random_seed)
eval_logger.info(
f"Setting fewshot random generator seed to {fewshot_random_seed}"
)

adjusted_task_dict[task_name] = task_obj

return adjusted_task_dict

task_dict = _adjust_config(task_dict)
if check_integrity:
run_task_tests(task_list=tasks)
if evaluation_tracker is not None:
evaluation_tracker.general_config_tracker.log_experiment_args(
model_source=model,
model_args=model_args,
system_instruction=system_instruction,
chat_template=lm.chat_template if apply_chat_template else None,
fewshot_as_multiturn=fewshot_as_multiturn,
)
results = evaluate(
lm=lm,
task_dict=task_dict,
@@ -261,7 +301,10 @@ def simple_evaluate(
rewrite_requests_cache=rewrite_requests_cache,
bootstrap_iters=bootstrap_iters,
write_out=write_out,
log_samples=True if predict_only else log_samples,
system_instruction=system_instruction,
apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn,
verbosity=verbosity,
)
@@ -302,6 +345,7 @@ def simple_evaluate(
results["git_hash"] = get_git_commit_hash()
results["date"] = start_date
add_env_info(results)  # additional environment info to results
add_tokenizer_info(results, lm) # additional info about tokenizer
return results
else:
return None
@@ -317,6 +361,9 @@ def evaluate(
bootstrap_iters: Optional[int] = 100000,
write_out: bool = False,
log_samples: bool = True,
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
verbosity: str = "INFO", verbosity: str = "INFO",
): ):
"""Instantiate and evaluate a model on a list of tasks. """Instantiate and evaluate a model on a list of tasks.
...@@ -328,11 +375,17 @@ def evaluate( ...@@ -328,11 +375,17 @@ def evaluate(
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
Number of iterations for bootstrap statistics, used when calculating stderr. Set to 0 for skipping all stderr calculations.
:param write_out: bool
If True, write out an example document and model input for checking task integrity
:param log_samples: bool
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
:param system_instruction: str
System instruction to be applied to the prompt
:param apply_chat_template: bool
If True, apply chat template to the prompt
:param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:return
Dictionary of results
"""
@@ -346,7 +399,7 @@ def evaluate(
padding_requests = defaultdict(int)

# get lists of group hierarchy and each type of request
eval_tasks = get_task_list(task_dict)
if not log_samples:
if not all(
"bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys()
@@ -362,6 +415,15 @@ def evaluate(
world_size=lm.world_size,
cache_requests=cache_requests,
rewrite_requests_cache=rewrite_requests_cache,
system_instruction=system_instruction,
apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn,
chat_template=getattr(lm, "apply_chat_template")
if apply_chat_template
else None,
tokenizer_name=getattr(lm, "tokenizer_name", "")
if apply_chat_template
else "",
)

eval_logger.debug(
f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
@@ -503,94 +565,60 @@ def evaluate(
# aggregate results ; run bootstrap CIs # aggregate results ; run bootstrap CIs
for task_output in eval_tasks: for task_output in eval_tasks:
task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters) task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters)
results, samples, configs, versions, num_fewshot = consolidate_results( (
eval_tasks results,
) samples,
configs,
versions,
num_fewshot,
higher_is_better,
) = consolidate_results(eval_tasks)
### Calculate group metrics ### ### Calculate group metrics ###
if bool(results): if bool(results):
        results, versions, show_group_table, *_ = consolidate_group_results(
            results, versions, task_dict
        )

    results_agg, group_agg = prepare_print_tasks(task_dict, results)
    subtask_list = get_subtask_list(task_dict)

    # collect all higher_is_better values for metrics
    # in the group's subtasks.
    # TODO: clean this up ; unify with the below metric_list loop?
    _higher_is_better = {}
    for group, task_list in subtask_list.items():
        if (
            len(task_list) != 0
        ):  # subtask list will list "task_name": [] for solo tasks
            for task in task_list:
                for m, h in higher_is_better[task].items():
                    if m not in _higher_is_better.keys():
                        _higher_is_better[m] = h
                    if (
                        m in _higher_is_better
                        and _higher_is_better[m] is not None
                        and _higher_is_better[m] != h
                    ):
                        eval_logger.warning(
                            f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
                        )
                        _higher_is_better[m] = None
            higher_is_better[group] = _higher_is_better
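To illustrate the consistency check above with hypothetical names: given higher_is_better = {"task_a": {"score": True}, "task_b": {"score": False}} and a group "g" containing both subtasks, the conflicting directions trigger the warning and the group-level entry is recorded as {"score": None}.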
    results_dict = {
        "results": dict(results_agg.items()),
        **(
            {"groups": dict(group_agg.items())}
            if (bool(group_agg) & show_group_table)
            else {}
        ),
        "group_subtasks": dict(reversed(subtask_list.items())),
        "configs": dict(sorted(configs.items())),
        "versions": dict(sorted(versions.items())),
        "n-shot": dict(sorted(num_fewshot.items())),
        "higher_is_better": dict(sorted(higher_is_better.items())),
        "n-samples": {
            task_output.task_name: {
                "original": len(task_output.task.eval_docs),
...
@@ -2,9 +2,15 @@ import collections
import math
import pathlib
import sys
from typing import List, Optional, Tuple, Union

from lm_eval.api.group import ConfigurableGroup
from lm_eval.api.metrics import (
    aggregate_subtask_metrics,
    pooled_sample_stderr,
    stderr_for_metric,
)
from lm_eval.api.task import Task
from lm_eval.utils import eval_logger, positional_deprecated
@@ -97,8 +103,8 @@ class TaskOutput:
            metric_key = f"{metric},{filter_key}"
            self.agg_metrics[metric_key] = agg_fn(items)
            self.sample_len = len(items)  # TODO: same sample size for each metric?
            if isinstance(bootstrap_iters, int):
                stderr_fn = stderr_for_metric(
                    metric=agg_fn,
                    bootstrap_iters=min(bootstrap_iters, 100)
                    if metric in ["bleu", "chrf", "ter"]
@@ -107,28 +113,80 @@ class TaskOutput:
                self.agg_metrics[f"{metric}_stderr,{filter_key}"] = (
                    stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A"
                )
            else:
                raise ValueError(
                    f"Received bootstrap_iters '{bootstrap_iters}' but expected an integer. Set to 0 to turn off stderr calculations."
                )
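In brief, for a hypothetical TaskOutput instance (and assuming stderr_for_metric returns no function for non-positive iteration counts, so the stderr column falls back to "N/A"):

task_output.calculate_aggregate_metric(bootstrap_iters=100000)  # bootstrapped stderrs
task_output.calculate_aggregate_metric(bootstrap_iters=0)       # stderrs reported as "N/A"
task_output.calculate_aggregate_metric(bootstrap_iters="100")   # raises ValueError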
    def __repr__(self):
        return (
            f"TaskOutput(task_name={self.task_name}, "
            f"group_name={self.group_name}, "
            f"version={self.version}, "
            f"n_shot={self.n_shot}, "
            f"task_alias={self.task_alias}, "
            f"group_alias={self.group_alias})"
        )
def get_task_list(task_dict: dict) -> List[TaskOutput]:
    outputs = []
    for task_name, task_obj in task_dict.items():
        if isinstance(task_obj, dict):
            _outputs = get_task_list(task_obj)
            outputs.extend(_outputs)
        else:
            task_output = TaskOutput.from_taskdict(task_name, task_obj)
            outputs.append(task_output)
    return outputs
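A sketch of the flattening behavior, with hypothetical Task instances as leaves (real task_dict keys may also be ConfigurableGroup objects):

task_dict = {
    "mmlu": {  # group -> nested dict of subtasks
        "mmlu_abstract_algebra": task_a,
        "mmlu_anatomy": task_b,
    },
    "hellaswag": task_c,  # standalone task
}
eval_tasks = get_task_list(task_dict)
# -> one TaskOutput per leaf task: mmlu_abstract_algebra, mmlu_anatomy, hellaswag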
def get_subtask_list(task_dict, task_root=None, depth=0):
    subtask_list = {}
    for group_obj, task_obj in task_dict.items():
        if isinstance(group_obj, ConfigurableGroup):
            group_name = group_obj.group_name
        else:
            group_name = group_obj
        if isinstance(task_obj, dict):
            _subtask_list = get_subtask_list(
                task_obj, task_root=group_name, depth=depth + 1
            )
            if task_root:
                subtask_list.setdefault((task_root, depth), []).extend(
                    [
                        _task
                        for (_task, _depth) in _subtask_list.keys()
                        if (_depth - 1) == depth
                    ]
                )
            subtask_list = {**subtask_list, **_subtask_list}
        else:
            if isinstance(task_obj, ConfigurableGroup):
                group_or_task_name = task_obj.group_name
            elif isinstance(task_obj, Task):
                group_or_task_name = task_obj.task_name
            if task_root is None:
                subtask_list.setdefault((group_or_task_name, depth), [])
            else:
                subtask_list.setdefault((task_root, depth), []).append(
                    group_or_task_name
                )
    if depth == 0:
        _subtask_list = {}
        for group_key, task_list in subtask_list.items():
            group_name, depth = group_key
            _subtask_list[group_name] = task_list
        subtask_list = _subtask_list
    return subtask_list
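Continuing the hypothetical task_dict above: the (name, depth) bookkeeping collapses at depth 0, so the caller sees plain names:

get_subtask_list(task_dict)
# -> {"mmlu": ["mmlu_abstract_algebra", "mmlu_anatomy"], "hellaswag": []}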
def print_writeout(task) -> None:
@@ -151,75 +209,100 @@ def get_sample_size(task, limit: Optional[int]) -> Union[int, None]:
def prepare_print_tasks(
    task_dict: dict,
    results: dict,
    task_depth=0,
    group_depth=0,
) -> Tuple[dict, dict]:
    """
    @param task_dict: Dictionary representing the group hierarchy of tasks. Each key is a group or task
    name (possibly a ConfigurableGroup); group values are nested task dicts and task values are Task objects.
    @param results: Dictionary containing the results of each task. Each key is a
    group name and its value is a dictionary of task results.
    @param task_depth: The indentation level for printing the task
    hierarchy. Default is 0.
    @param group_depth: The indentation level for printing the group
    hierarchy. Default is 0.
    @return: A tuple of two dictionaries: results_agg and groups_agg. results_agg contains
    aggregated results for each task, and groups_agg contains aggregated results for each group.

    Prepares the task hierarchy and aggregates the results for each task and group recursively for printing.
    """
    def _sort_task_dict(task_dict):
        """
        Helper utility. Sorts the task dict at the current level of the hierarchy based on alphabetized task name.
        Required so that we end up sorting within each sub-header correctly.
        """
        return dict(
            sorted(
                task_dict.items(),
                key=lambda item: item[0].group_name
                if isinstance(item[0], ConfigurableGroup)
                else item[0],
            )
        )

    task_agg = collections.defaultdict(dict)
    group_agg = collections.defaultdict(dict)
    task_dict = _sort_task_dict(task_dict)
    for task_or_group_name, task_or_group_obj in task_dict.items():
        tab_string = " " * task_depth + "- " if task_depth > 0 else ""
        if isinstance(task_or_group_name, ConfigurableGroup):
            name = task_or_group_name.group_name
            from_configurable_group = True
            task_or_group_obj = _sort_task_dict(task_or_group_obj)
        elif isinstance(task_or_group_name, str):
            name = task_or_group_name
            if isinstance(task_or_group_obj, Task):
                name = task_or_group_obj.task_name
            from_configurable_group = False

        task_agg[name] = results[name].copy()
        if from_configurable_group:
            if task_or_group_name.group_alias is not None:
                alias = task_or_group_name.group_alias
            else:
                alias = task_or_group_name.group
        else:
            if "alias" in task_agg[name]:
                alias = task_agg[name]["alias"]
            else:
                alias = name

        task_agg[name]["alias"] = tab_string + alias
        if "samples" in task_agg[name]:
            task_agg[name].pop("samples")

        if from_configurable_group and (" " not in results[name]):
            group_tab_string = " " * group_depth + "- " if group_depth > 0 else ""
            group_agg[name] = results[name].copy()
            group_agg[name]["alias"] = group_tab_string + alias
            if "samples" in group_agg[name]:
                group_agg[name].pop("samples")

        if isinstance(task_or_group_obj, dict):
            task_depth += 1
            group_depth += 1
            _task_agg, _group_agg = prepare_print_tasks(
                task_or_group_obj, results, task_depth, group_depth
            )
            task_agg = {
                **task_agg,
                **_task_agg,
            }
            group_agg = {**group_agg, **_group_agg}
            task_depth -= 1
            group_depth -= 1
    return task_agg, group_agg
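A small sketch of the indentation this produces, with hypothetical results and a plain-string-keyed task_dict (so from_configurable_group is False and group_agg stays empty; ConfigurableGroup keys would populate it):

results = {
    "mmlu": {"alias": "mmlu", "acc,none": 0.45},
    "mmlu_anatomy": {"alias": "anatomy", "acc,none": 0.45, "samples": 135},
}
task_agg, group_agg = prepare_print_tasks({"mmlu": {"mmlu_anatomy": task_b}}, results)
# task_agg["mmlu"]["alias"]         == "mmlu"
# task_agg["mmlu_anatomy"]["alias"] == " - anatomy"  (indented one level)
# group_agg == {}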
def consolidate_results(
    eval_tasks: List[TaskOutput],
) -> Tuple[dict, dict, dict, dict, dict, dict]:
    """
    @param eval_tasks: list(TaskOutput).
    @return: A tuple containing the consolidated results, samples, configs, versions, and num_fewshot.
@@ -236,6 +319,8 @@ def consolidate_results(
    - configs: A defaultdict with task names as keys and task configurations as values.
    - versions: A defaultdict with task names as keys and task versions as values.
    - num_fewshot: A defaultdict with task names as keys and number of few-shot samples as values.
    - higher_is_better: A defaultdict with task names as keys and indicators of whether higher values are better
    for each metric as values.

    The method then returns the consolidated results, samples, configs, versions, num_fewshot, and higher_is_better as a tuple.
    """
@@ -249,9 +334,14 @@ def consolidate_results(
    configs = collections.defaultdict(dict)
    # Tracks each task's version.
    versions = collections.defaultdict(dict)
    # Track `higher_is_better` for each metric
    higher_is_better = collections.defaultdict(dict)
    for task_output in eval_tasks:
        if "task_alias" in (task_config := task_output.task_config):
            results[task_output.task_name]["alias"] = task_config["task_alias"]
        else:
            results[task_output.task_name]["alias"] = task_output.task_name
        if group_alias := task_output.group_alias:
            if group_alias not in results and (group_name := task_output.group_name):
                results[group_name]["alias"] = group_alias
@@ -259,16 +349,156 @@ def consolidate_results(
        configs[task_output.task_name] = task_output.task_config
        versions[task_output.task_name] = task_output.version
        samples[task_output.task_name] = task_output.logged_samples
        higher_is_better[task_output.task_name] = task_output.task.higher_is_better()
        for (metric, filter_key), items in task_output.sample_metrics.items():
            metric_key = f"{metric},{filter_key}"
            results[task_output.task_name][metric_key] = task_output.agg_metrics[
                metric_key
            ]
            results[task_output.task_name]["samples"] = task_output.sample_len
            results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = (
                task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
            )
    return results, samples, configs, versions, num_fewshot, higher_is_better
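After consolidation, every returned mapping is keyed by task name; for example (values hypothetical):

# results["hellaswag"] -> {"alias": "hellaswag", "acc,none": 0.51,
#                          "acc_stderr,none": 0.005, "samples": 10042}
# higher_is_better["hellaswag"] -> {"acc": True}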
def consolidate_group_results(
    results,
    versions,
    task_dict,
    task_root=None,
    show_group_table=False,
    task_aggregation_list=None,
) -> Tuple[dict, dict, bool, Union[dict, None]]:
    """
    (Recursively) calculates groups' aggregated metrics and updates the results and versions dictionaries with this info.

    @return: a tuple [results, versions, show_group_table, task_aggregation_list] with formats described below:

    - results: A defaultdict with task names (and, after this function is called, group names of
    groups that perform aggregation) as keys, and dictionaries with "alias" and metric,filter_name pairs as keys.

    - versions: A defaultdict with task names (and, after this function is called, group names of
    groups that perform aggregation) as keys, and float values representing the task or group's version if a version is specified. (defaulting to None).

    - show_group_table: a boolean which is true if there exists a group that requires printing of its aggregated scores in a group table.

    - task_aggregation_list: a defaultdict listing the subtasks to average over to produce a given group's end metric.

    The method then returns the updated results, versions, show_group_table, and task_aggregation_list as a tuple.
    In the top-level invocation of this function, task_aggregation_list is ignored.
    """
    if task_root is None:
        task_root = {}

    if task_aggregation_list is None:
        task_aggregation_list = {}

    for group_or_task, group_or_task_info in task_dict.items():
        # Convert to string
        if isinstance(group_or_task, ConfigurableGroup):
            group_config = group_or_task.config
            group_or_task = group_or_task.group_name
        else:
            group_config = None

        if isinstance(group_or_task_info, Task):
            if task_root:
                task_aggregation_list.setdefault(task_root, []).append(
                    group_or_task_info.task_name
                )
        else:
            (
                results,
                versions,
                show_group_table,
                _task_aggregation_list,
            ) = consolidate_group_results(
                results,
                versions,
                group_or_task_info,
                group_or_task,
                show_group_table,
                task_aggregation_list,
            )
            if task_root:
                task_aggregation_list.setdefault(task_root, []).extend(
                    task_aggregation_list.get(group_or_task, [])
                )

            if (group_config is None) or (
                group_config["aggregate_metric_list"] is None
            ):
                results[group_or_task][" "] = " "
                continue

            if "aggregate_metric_list" in group_config:
                agg_metric_list = group_config["aggregate_metric_list"]

            show_group_table = show_group_table | bool(
                group_config["aggregate_metric_list"]
            )

            task_list = _task_aggregation_list[group_or_task]

            metric_list = list(
                {
                    key
                    for task in task_list
                    for key in results[task].keys()
                    if "_stderr" not in key and key not in ["task", "alias", "samples"]
                }
            )
            for metric in metric_list:
                stderr = "_stderr,".join(metric.split(","))

                # gather metrics, sizes, and stderrs from subtasks
                metrics = [
                    results[task][metric]
                    for task in task_list
                    if metric in results[task]
                ]  # TODO: copy?
                stderrs = [
                    results[task][stderr]
                    for task in task_list
                    if stderr in results[task]
                ]
                sizes = [
                    results[task]["samples"]
                    for task in task_list
                    if metric in results[task]
                ]

                for metric_config in agg_metric_list:
                    for filter_name in metric_config["filter_list"]:
                        if metric != ",".join([metric_config["metric"], filter_name]):
                            continue

                        # compute group's pooled metric and stderr
                        if metric_config["aggregation"] == "mean":
                            aggregate_fn = aggregate_subtask_metrics
                        else:
                            raise ValueError(
                                f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{metric_config['aggregation']}' for group '{group_or_task}'"
                            )

                        results[group_or_task][metric] = aggregate_fn(
                            metrics,
                            sizes,
                            metric_config["weight_by_size"],
                        )
                        # TODO: calculate groups' metrics using arbitrary agg fns
                        if "N/A" in stderrs:
                            results[group_or_task][stderr] = "N/A"
                        else:
                            # NOTE: this assumes we are using the mean to aggregate. There are warnings about this elsewhere
                            results[group_or_task][stderr] = pooled_sample_stderr(
                                stderrs, sizes
                            )

                results[group_or_task]["samples"] = sum(sizes)

            group_metadata = group_config.get("metadata", None)
            if group_metadata is not None:
                versions[group_or_task] = group_metadata.get("version", None)

    return results, versions, show_group_table, task_aggregation_list
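For a group whose config carries an aggregate_metric_list entry like the sketch below (field names as read by the loop above), the group score is a size-weighted mean of its subtasks:

group_config = {
    "aggregate_metric_list": [
        {
            "metric": "acc",
            "filter_list": ["none"],
            "aggregation": "mean",
            "weight_by_size": True,
        }
    ]
}
# With subtask accuracies 0.50 over 100 docs and 0.80 over 300 docs:
# (0.50 * 100 + 0.80 * 300) / 400 = 0.725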
@positional_deprecated
...
@@ -4,7 +4,6 @@ from lm_eval.api.registry import register_filter
@register_filter("decontaminate")
class DecontaminationFilter(Filter):
    """
    A filter which evaluates
    """
...