Commit 60c9c170 authored by haileyschoelkopf

Merge branch 'main' into inverse-scaling-tasks

parents 4b2d565b b4cd85d4
@@ -56,7 +56,7 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[dev,anthropic,sentencepiece,optimum,deepsparse,sparseml]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
...
@@ -84,7 +84,7 @@ lm_eval --model hf \
    --batch_size auto:4
```

The full list of supported arguments is provided [here](./docs/interface.md), and on the terminal by calling `lm_eval -h`. Alternatively, you can use `lm-eval` instead of `lm_eval`. A list of supported tasks can be viewed with `lm-eval --tasks list`.

> [!Note]
> Just like you can provide a local path to `transformers.AutoModel`, you can also provide a local path to `lm_eval` via `--model_args pretrained=/path/to/model`

@@ -129,6 +129,53 @@ These two options (`accelerate launch` and `parallelize=True`) are mutually excl...

**Note: we do not currently support multi-node evaluations natively, and advise using either an externally hosted server to run inference requests against, or creating a custom integration with your distributed framework [as is done for the GPT-NeoX library](https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py).**
### NVIDIA `nemo` models
[NVIDIA NeMo Framework](https://github.com/NVIDIA/NeMo) is a generative AI framework built for researchers and PyTorch developers working on language models.

To evaluate a `nemo` model, start by installing NeMo following [the documentation](https://github.com/NVIDIA/NeMo?tab=readme-ov-file#installation). We highly recommend using the NVIDIA PyTorch or NeMo container, especially if you run into issues installing Apex or any other dependencies (see the [latest released containers](https://github.com/NVIDIA/NeMo/releases)). Please also install the lm-evaluation-harness library following the instructions in [the Install section](https://github.com/EleutherAI/lm-evaluation-harness/tree/main?tab=readme-ov-file#install).

NeMo models can be obtained through the [NVIDIA NGC Catalog](https://catalog.ngc.nvidia.com/models) or from [NVIDIA's Hugging Face page](https://huggingface.co/nvidia). The [NVIDIA NeMo Framework](https://github.com/NVIDIA/NeMo/tree/main/scripts/nlp_language_modeling) provides conversion scripts to convert the `hf` checkpoints of popular models like Llama, Falcon, Mixtral, or MPT to the `nemo` format.
Run a `nemo` model on one GPU:
```bash
lm_eval --model nemo_lm \
--model_args path=<path_to_nemo_model> \
--tasks hellaswag \
--batch_size 32
```
It is recommended to unpack the `nemo` model before the run, to avoid unpacking it inside the Docker container, which may overflow disk space. To do so, you can run:
```
mkdir MY_MODEL
tar -xvf MY_MODEL.nemo -C MY_MODEL
```
#### Multi-GPU evaluation with NVIDIA `nemo` models
By default, only one GPU is used. We do, however, support either data replication or tensor/pipeline parallelism during evaluation, on a single node.

1) To enable data replication, set the `devices` option in `model_args` to the number of data replicas to run. For example, the command to run 8 data replicas over 8 GPUs is:
```bash
torchrun --nproc-per-node=8 --no-python lm_eval \
--model nemo_lm \
--model_args path=<path_to_nemo_model>,devices=8 \
--tasks hellaswag \
--batch_size 32
```
2) To enable tensor and/or pipeline parallelism, set `tensor_model_parallel_size` and/or `pipeline_model_parallel_size` in `model_args`. In addition, you also have to set `devices` equal to the product of `tensor_model_parallel_size` and `pipeline_model_parallel_size`. For example, the command to use one node of 4 GPUs with tensor parallelism of 2 and pipeline parallelism of 2 is:
```bash
torchrun --nproc-per-node=4 --no-python lm_eval \
--model nemo_lm \
--model_args path=<path_to_nemo_model>,devices=4,tensor_model_parallel_size=2,pipeline_model_parallel_size=2 \
--tasks hellaswag \
--batch_size 32
```
Note that it is recommended to replace the `python` command with `torchrun --nproc-per-node=<number of devices> --no-python` to facilitate loading the model onto the GPUs. This is especially important for large checkpoints loaded into multiple GPUs.
Not supported yet: multi-node evaluation and combinations of data replication with tensor or pipeline parallelism.
### Tensor + Data Parallel and Optimized Inference with `vLLM`

@@ -144,6 +191,12 @@ To use vllm, do `pip install lm_eval[vllm]`. For a full list of supported vLLM c...

vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation, and provide a [script](./scripts/model_comparator.py) for checking the validity of vllm results against HF.
> [!Tip]
> For fastest performance, we recommend using `--batch_size auto` for vLLM whenever possible, to leverage its continuous batching functionality!
> [!Tip]
> Passing `max_model_len=4096` or some other reasonable default to vLLM through model args may cause speedups or prevent out-of-memory errors when trying to use auto batch size, such as for Mistral-7B-v0.1 which defaults to a maximum length of 32k.
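For example, a sketch that combines both tips (the model choice and context length here are only illustrative; adjust them to your model and hardware):

```bash
lm_eval --model vllm \
    --model_args pretrained=mistralai/Mistral-7B-v0.1,max_model_len=4096 \
    --tasks hellaswag \
    --batch_size auto
```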
### Model APIs and Inference Servers

Our library also supports the evaluation of models served via several commercial APIs, and we hope to implement support for the most commonly used performant local/self-hosted inference servers.

@@ -169,6 +222,7 @@ Note that for externally hosted models, configs such as `--device` and `--batch_...

| OpenAI Completions | :heavy_check_mark: | `openai-completions`, `local-completions` | All OpenAI Completions API models | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| OpenAI ChatCompletions | :heavy_check_mark: | `openai-chat-completions`, `local-chat-completions` | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt) | `generate_until` (no logprobs) |
| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) |
| Anthropic Chat | :heavy_check_mark: | `anthropic-chat`, `anthropic-chat-completions` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/docs/models-overview) | `generate_until` (no logprobs) |
| Textsynth | :heavy_check_mark: | `textsynth` | [All supported engines](https://textsynth.com/documentation.html#engines) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Cohere | [:hourglass: - blocked on Cohere API bug](https://github.com/EleutherAI/lm-evaluation-harness/pull/395) | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark: | `gguf`, `ggml` | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp) | `generate_until`, `loglikelihood`, (perplexity evaluation not yet implemented) |
@@ -176,12 +230,18 @@ Note that for externally hosted models, configs such as `--device` and `--batch_...
| Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Huggingface Optimum (Causal LMs) | ✔️ | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Neuron via AWS Inf2 (Causal LMs) | ✔️ | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| [Neural Magic DeepSparse](https://github.com/neuralmagic/deepsparse) | ✔️ | `deepsparse` | Any LM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub with the "deepsparse" tag](https://huggingface.co/models?other=deepsparse) | `generate_until`, `loglikelihood` |
| [Neural Magic SparseML](https://github.com/neuralmagic/sparseml) | ✔️ | `sparseml` | Any decoder-only AutoModelForCausalLM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub](https://huggingface.co/neuralmagic). Especially useful for models with quantization like [`zoo:llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized`](https://sparsezoo.neuralmagic.com/models/llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Your local inference server! | :heavy_check_mark: | `local-completions` or `local-chat-completions` (using `openai-chat-completions` model type) | Any server address that accepts GET requests using HF models and mirrors OpenAI's Completions or ChatCompletions interface | `generate_until` |

Models which do not supply logits or logprobs can be used with tasks of type `generate_until` only, while local models, or APIs that supply logprobs/logits of their prompts, can be run on all task types: `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.

For more information on the different task `output_types` and model request types, see [our documentation](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md#interface).
> [!Note]
> For best performance with closed chat model APIs such as Anthropic Claude 3 and GPT-4, we recommend carefully looking at a few sample outputs using `--limit 10` first to confirm answer extraction and scoring on generative tasks is performing as expected. Providing `system="<some system prompt here>"` within `--model_args` for anthropic-chat-completions, to instruct the model what format to respond in, may be useful.
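A quick sanity check along those lines might look like the sketch below (the `model=` keyword and the Claude model identifier are assumptions here; check Anthropic's model list and the model class in `lm_eval/models/` for the exact constructor arguments):

```bash
# Inspect 10 samples first; scores computed on such a small subset are for debugging only.
lm_eval --model anthropic-chat-completions \
    --model_args model=claude-3-opus-20240229,system="Respond only with the final answer." \
    --tasks gsm8k \
    --limit 10 \
    --log_samples \
    --output_path results
```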
### Other Frameworks

A number of other libraries contain scripts for calling the eval harness through their library. These include [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py), [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md), and [mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/blob/master/eval_harness.py).

@@ -192,7 +252,7 @@ To create your own custom integration you can follow instructions from [this tut...

> [!Note]
> For tasks unsuitable for direct evaluation — either due to risks associated with executing untrusted code or complexities in the evaluation process — the `--predict_only` flag is available to obtain decoded generations for post-hoc evaluation.
If you have a Metal compatible Mac, you can run the eval harness using the MPS back-end by replacing `--device cuda:0` with `--device mps` (requires PyTorch version 2.1 or higher). **Note that the PyTorch MPS backend is still in early stages of development, so correctness issues or unsupported operations may exist. If you observe oddities in model performance on the MPS back-end, we recommend first checking that a forward pass of your model on `--device cpu` and `--device mps` match.**
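For example, a minimal sketch of an MPS run (Pythia-160M is used only as an illustration; any supported HF causal LM works the same way):

```bash
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks lambada_openai \
    --device mps \
    --batch_size 8
```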
> [!Note]
> You can inspect what the LM inputs look like by running the following command:

@@ -224,6 +284,13 @@ lm_eval --model hf \
    --device cuda:0
```
Models provided as delta weights can be easily loaded using the Hugging Face transformers library. Within `--model_args`, set the `delta` argument to specify the delta weights, and use the `pretrained` argument to designate the relative base model to which they will be applied:
```bash
lm_eval --model hf \
--model_args pretrained=Ejafa/llama_7B,delta=lmsys/vicuna-7b-delta-v1.1 \
--tasks hellaswag
```
[GPTQ](https://github.com/PanQiWei/AutoGPTQ) quantized models can be loaded by specifying their file names in `,autogptq=NAME` (or `,autogptq=True` for default names) in the `model_args` argument:

```bash
@@ -234,14 +301,24 @@ lm_eval --model hf \

We support wildcards in task names, for example, you can run all of the machine-translated lambada tasks via `--tasks lambada_openai_mt_*`.
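As a sketch, a wildcard run might look like the following (the pattern is quoted so the shell does not try to expand it; the model is an arbitrary example):

```bash
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks "lambada_openai_mt_*"
```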
## Saving Results
To save evaluation results, provide an `--output_path`. We also support logging model responses with the `--log_samples` flag for post-hoc analysis.

Additionally, one can provide a directory with `--use_cache` to cache the results of prior runs. This allows you to avoid repeated execution of the same (model, task) pairs for re-scoring.
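As an illustrative sketch (the model, task, and paths are placeholders), a run that saves results, logs per-sample model outputs, and caches responses for later re-scoring might look like:

```bash
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks hellaswag \
    --output_path results \
    --log_samples \
    --use_cache /path/to/cache_dir
```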
To push results and samples to the Hugging Face Hub, first ensure an access token with write access is set in the `HF_TOKEN` environment variable. Then, use the `--hf_hub_log_args` flag to specify the organization, repository name, repository visibility, and whether to push results and samples to the Hub - [example output](https://huggingface.co/datasets/KonradSzafer/lm-eval-results-demo/tree/main/microsoft__phi-2). For instance:

```bash
lm_eval --model hf \
--model_args pretrained=model-name-or-path,autogptq=model.safetensors,gptq_use_triton=True \
--tasks hellaswag \
--log_samples \
--output_path results \
--hf_hub_log_args hub_results_org=EleutherAI,hub_repo_name=lm-eval-results,push_results_to_hub=True,push_samples_to_hub=True,public_repo=False
```
For a full list of supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md) guide in our documentation!
## Visualizing Results

@@ -351,6 +428,7 @@ Extras dependencies can be installed via `pip install -e ".[NAME]"`

| Name          | Use                                   |
|---------------|---------------------------------------|
| anthropic     | For using Anthropic's models          |
| deepsparse    | For running NM's DeepSparse models    |
| dev           | For linting PRs and contributions     |
| gptq          | For loading models with GPTQ          |
| hf_transfer   | For speeding up HF Hub file downloads |
@@ -363,7 +441,9 @@ Extras dependencies can be installed via `pip install -e ".[NAME]"`
| optimum       | For running Intel OpenVINO models     |
| promptsource  | For using PromptSource prompts        |
| sentencepiece | For using the sentencepiece tokenizer |
| sparseml      | For using NM's SparseML models        |
| testing       | For running library test suite        |
| unitxt        | For IBM's unitxt dataset tasks        |
| vllm          | For loading models with vLLM          |
| zeno          | For visualizing results with Zeno     |
|---------------|---------------------------------------|
...
@@ -4,7 +4,7 @@ Welcome to the docs for the LM Evaluation Harness!

## Table of Contents

* To learn about the public interface of the library, as well as how to evaluate via the commandline or as integrated into an external library, see the [Interface](./interface.md)
* To learn how to add a new library, API, or model type to the library, as well as a quick explainer on the types of ways to evaluate an LM, see the [Model Guide](./model_guide.md).
* For a crash course on adding new tasks to the library, see our [New Task Guide](./new_task_guide.md).
* To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Task Configuration Guide](./task_guide.md).
@@ -10,11 +10,11 @@ Equivalently, running the library can be done via the `lm-eval` entrypoint at th...

This mode supports a number of command-line arguments, the details of which can also be seen via running with `-h` or `--help`:

- `--model` : Selects which model type or provider is evaluated. Must be a string corresponding to the name of the model type/provider being used. See [the main README](https://github.com/EleutherAI/lm-evaluation-harness/tree/main#model-apis-and-inference-servers) for a full list of enabled model names and supported libraries or APIs.

- `--model_args` : Controls parameters passed to the model constructor. Accepts a string containing comma-separated keyword arguments to the model class of the format `"arg1=val1,arg2=val2,..."`, such as, for example `--model_args pretrained=EleutherAI/pythia-160m,dtype=float32`. For a full list of what keyword arguments are supported, see the initialization of the `lm_eval.api.model.LM` subclass, e.g. [`HFLM`](https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/models/huggingface.py#L66)

- `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names. Must be solely comprised of valid tasks/groups. A list of supported tasks can be viewed with `--tasks list`.

- `--num_fewshot` : Sets the number of few-shot examples to place in context. Must be an integer.

@@ -42,13 +42,20 @@ This mode supports a number of command-line arguments, the details of which can...

- `--show_config` : If used, prints the full `lm_eval.api.task.TaskConfig` contents (non-default settings in the task YAML file) for each task which was run, at the completion of an evaluation. Useful for when one is modifying a task's configuration YAML locally to transmit the exact configurations used for debugging or for reproducibility purposes.

- `--include_path` : Accepts a path to a folder. If passed, then all YAML files containing `lm-eval` compatible task configurations will be added to the task registry as available tasks. Used for when one is writing config files for their own task in a folder other than `lm_eval/tasks/` (see the example invocation after this list).

- `--predict_only`: Generates the model outputs without computing metrics. Use with `--log_samples` to retrieve decoded results.

* `--seed`: Set seed for python's random, numpy and torch. Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, or a single integer to set the same seed for all three. The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234` (for backward compatibility). E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`. Here numpy's seed is not set since the second value is `None`. E.g, `--seed 42` sets all three seeds to 42.

* `--wandb_args`: Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., ```--wandb_args project=test-project,name=test-run```
* `--hf_hub_log_args` : Logs evaluation results to Hugging Face Hub. Accepts a string with the arguments separated by commas. Available arguments:
* `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`,
* `hub_repo_name` - repository name on Hugging Face Hub, e.g., `lm-eval-results`,
* `push_results_to_hub` - whether to push results to Hugging Face Hub, can be `True` or `False`,
* `push_samples_to_hub` - whether to push samples results to Hugging Face Hub, can be `True` or `False`. Requires `--log_samples` to be set,
  * `public_repo` - whether the repository is public, can be `True` or `False`.
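As referenced under `--include_path` above, here is a minimal sketch of evaluating a task defined by a custom YAML file outside `lm_eval/tasks/` (the directory and task name are hypothetical placeholders):

```bash
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m \
    --include_path /path/to/my/custom/tasks \
    --tasks my_custom_task
```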
## External Library Usage

@@ -77,7 +84,7 @@ task_manager = lm_eval.tasks.TaskManager()

# Setting `task_manager` to the one above is optional and should generally be done
# if you want to include tasks from paths other than ones in `lm_eval/tasks`.
# `simple_evaluate` will instantiate its own task_manager if it is set to None here.
results = lm_eval.simple_evaluate( # call simple_evaluate
    model=lm_obj,
    tasks=["taskname1", "taskname2"],
@@ -112,8 +119,8 @@ my_model = initialize_my_model()
# - `Your_LM.generate_until()`
lm_obj = Your_LM(model=my_model, batch_size=16)

# optional: the task_manager indexes tasks including ones
# specified by the user through `include_path`.
task_manager = lm_eval.tasks.TaskManager(
    include_path="/path/to/custom/yaml"
)
@@ -138,9 +145,9 @@ task_dict = lm_eval.tasks.get_task_dict(
    # custom paths is required.
)

results = evaluate(
    lm=lm_obj,
    task_dict=task_dict,
    ...
)
```
@@ -6,7 +6,7 @@ In order to properly evaluate a given LM, we require implementation of a wrapper...

## Setup

To get started contributing, go ahead and fork the main repo, clone it, create a branch with the name of your model, and install the project requirements in your environment:

```sh
# After forking...
...
@@ -35,7 +35,7 @@ and rename the folders and YAML file(s) as desired.

### Selecting and configuring a dataset

All data downloading and management is handled through the HuggingFace (**HF**) [`datasets`](https://github.com/huggingface/datasets) API. So, the first thing you should do is check to see if your task's dataset is already provided in their catalog [here](https://huggingface.co/datasets). If it's not in there, please consider adding it to their Hub to make it accessible to a wider user base by following their [new dataset guide](https://github.com/huggingface/datasets/blob/main/ADD_NEW_DATASET.md).

Once you have a HuggingFace dataset prepared for your task, we want to assign our new YAML to use this dataset:

@@ -172,7 +172,7 @@ doc_to_target: "{{answer}}"
```
**Important**: we now add `target_delimiter` between input and target which defaults to " ", such that the full input-output string is `doc_to_text(doc) + target_delimiter + doc_to_target(doc)`. `doc_to_text` and `doc_to_target` should not contain trailing right or left whitespace, respectively.
#### Multiple choice format

@@ -213,7 +213,7 @@ def wikitext_detokenizer(doc):
    return string
```

We can load this function in `doc_to_target` by using a `!function` operator after `doc_to_target` and followed by `<file name>.<function name>`. In the file [wikitext.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/wikitext/wikitext.yaml) we write:
```
doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
```

@@ -366,9 +366,7 @@ task:

## Beautifying Table Display
To avoid conflict, each task needs to be registered with a unique name. Because of this, slight variations of a task are still counted as unique tasks and need to be named uniquely. This can be done by appending an additional name that refers to the variation, as in MMLU, where the tasks evaluated with the Flan template are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation, especially when you have a long list of tasks or are using a benchmark that comprises many tasks. To make it more legible, you can use `task_alias` and `group_alias` to provide an alternative task name and group name that will be printed. For example, in `mmlu_abstract_algebra.yaml` we set `group_alias` to `stem` and `task_alias` to `abstract_algebra`.
```
"dataset_name": "abstract_algebra"
...
@@ -31,8 +31,8 @@ Dataset configuration options:

Prompting / in-context formatting options:

- **use_prompt** (`str`, *optional*) — Name of prompt in promptsource to use. If defined, will overwrite doc_to_text, doc_to_target, and doc_to_choice.
- **description** (`str`, *optional*) — An optional Jinja2 template or string which will be prepended to the few-shot examples passed into the model, often describing the task or providing instructions to a model, such as `"The following are questions (with answers) about {{subject}}.\n\n"`. No delimiters or spacing are inserted between the description and the first few-shot example.
- **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate input for the model.
- **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into the answer choice list of the correct answer.
- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
- **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples.
- **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.

@@ -155,6 +155,21 @@ Our final filter pipeline, "maj@8", does majority voting across the first 8 of t...

Thus, given the 64 responses from our LM on each document, we can report metrics on these responses in these 3 different ways, as defined by our filter pipelines.
### Adding a custom filter
Just as you can add a custom model with the `register_model` decorator, you can register a custom filter with `register_filter`. For example:

```python
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter


@register_filter("new_filter")
class NewFilter(Filter):
    ...
```
## Embedded Python Code

You can use python functions for certain arguments by using the `!function` operator after the argument name followed by `<filename>.<pythonfunctionname>`. This feature can be used for the following arguments:

@@ -175,7 +190,7 @@ You can base a YAML on another YAML file as a template. This can be handy when y...

include: <YAML filename or with full path>
...
```

You can find an example of how to use this feature at [gsm8k-cot-self-consistency.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml), where it is based off [gsm8k-cot.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k-cot.yaml).

## Passing Arguments to Metrics
...
@@ -2,34 +2,20 @@ import argparse
import json
import logging
import os
import sys
from functools import partial
from typing import Union

from lm_eval import evaluator, utils
from lm_eval.evaluator import request_caching_arg_to_dict
from lm_eval.loggers import EvaluationTracker, WandbLogger
from lm_eval.tasks import TaskManager
from lm_eval.utils import handle_non_serializable, make_table, simple_parse_args_string


def _int_or_none_list_arg_type(
    min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","
):
    def parse_value(item):
        item = item.strip().lower()
        if item == "none":
@@ -45,21 +31,47 @@ def _int_or_none_list_arg_type(max_len: int, value: str, split_char: str = ","):
    if num_items == 1:
        # Makes downstream handling the same for single and multiple values
        items = items * max_len
    elif num_items < min_len or num_items > max_len:
        raise argparse.ArgumentTypeError(
            f"Argument requires {max_len} integers or None, separated by '{split_char}'"
        )
    elif num_items != max_len:
        logging.warning(
            f"Argument requires {max_len} integers or None, separated by '{split_char}'. "
            "Missing values will be filled with defaults."
        )
        default_items = [parse_value(v) for v in defaults.split(split_char)]
        items.extend(
            default_items[num_items:]
        )  # extend items list with missing defaults

    return items
def check_argument_types(parser: argparse.ArgumentParser):
    """
    Check to make sure all CLI args are typed, raises error if not
    """
    for action in parser._actions:
        if action.dest != "help" and not action.const:
            if action.type is None:
                raise ValueError(
                    f"Argument '{action.dest}' doesn't have a type specified."
                )
            else:
                continue


def setup_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`"
    )
    parser.add_argument(
        "--tasks",
        "-t",
        default=None,
        type=str,
        metavar="task1,task2",
        help="To get full list of tasks, use the command lm-eval --tasks list",
    )
@@ -67,6 +79,7 @@ def parse_eval_args() -> argparse.Namespace:
        "--model_args",
        "-a",
        default="",
        type=str,
        help="Comma separated string arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
    )
    parser.add_argument(
@@ -164,6 +177,7 @@ def parse_eval_args() -> argparse.Namespace:
    )
    parser.add_argument(
        "--gen_kwargs",
        type=str,
        default=None,
        help=(
            "String arguments for model generation on greedy_until tasks,"
@@ -180,9 +194,16 @@ def parse_eval_args() -> argparse.Namespace:
    )
    parser.add_argument(
        "--wandb_args",
        type=str,
        default="",
        help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval`",
    )
    parser.add_argument(
        "--hf_hub_log_args",
        type=str,
        default="",
        help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`",
    )
    parser.add_argument(
        "--predict_only",
        "-x",
@@ -190,17 +211,20 @@ def parse_eval_args() -> argparse.Namespace:
        default=False,
        help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
    )
    default_seed_string = "0,1234,1234,1234"
    parser.add_argument(
        "--seed",
        type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string),
        default=default_seed_string,  # for backward compatibility
        help=(
            "Set seed for python's random, numpy, torch, and fewshot sampling.\n"
            "Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, "
            "respectively, or a single integer to set the same seed for all four.\n"
            f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` "
            "(for backward compatibility).\n"
            "E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. "
            "Here numpy's seed is not set since the second value is `None`.\n"
            "E.g, `--seed 42` sets all four seeds to 42."
        ),
    )
    parser.add_argument(
@@ -208,14 +232,19 @@ def parse_eval_args() -> argparse.Namespace:
        action="store_true",
        help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
    )

    return parser


def parse_eval_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
    check_argument_types(parser)
    return parser.parse_args()
def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
    if not args:
        # we allow for args to be passed externally, else we parse them ourselves
        parser = setup_parser()
        args = parse_eval_args(parser)

    if args.wandb_args:
        wandb_logger = WandbLogger(**simple_parse_args_string(args.wandb_args))

@@ -225,6 +254,18 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
    eval_logger.info(f"Verbosity set to {args.verbosity}")
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # update the evaluation tracker args with the output path and the HF token
    if args.output_path:
        args.hf_hub_log_args += f",output_path={args.output_path}"
    if os.environ.get("HF_TOKEN", None):
        args.hf_hub_log_args += f",token={os.environ.get('HF_TOKEN')}"
    evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args)
    evaluation_tracker = EvaluationTracker(**evaluation_tracker_args)
    evaluation_tracker.general_config_tracker.log_experiment_args(
        model_source=args.model,
        model_args=args.model_args,
    )

    if args.predict_only:
        args.log_samples = True
    if (args.log_samples or args.predict_only) and not args.output_path:
@@ -232,17 +273,27 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
            "Specify --output_path if providing --log_samples or --predict_only"
        )

    if args.include_path is not None:
        eval_logger.info(f"Including path: {args.include_path}")
    task_manager = TaskManager(args.verbosity, include_path=args.include_path)

    if (
        "push_results_to_hub" in evaluation_tracker_args
        or "push_samples_to_hub" in evaluation_tracker_args
    ) and "hub_results_org" not in evaluation_tracker_args:
        raise ValueError(
            "If push_results_to_hub or push_samples_to_hub is set, results_org must be specified."
        )
    if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
        eval_logger.warning(
            "Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub."
        )

    if args.limit:
        eval_logger.warning(
            " --limit SHOULD ONLY BE USED FOR TESTING."
            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if args.tasks is None:
        eval_logger.error("Need to specify task to evaluate.")
@@ -282,24 +333,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
            f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues."
        )
    # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
    if args.trust_remote_code:
        os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = str(args.trust_remote_code)
@@ -309,7 +342,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
    )

    eval_logger.info(f"Selected Tasks: {task_names}")

    request_caching_args = request_caching_arg_to_dict(
        cache_requests=args.cache_requests
@@ -335,6 +367,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
        random_seed=args.seed[0],
        numpy_random_seed=args.seed[1],
        torch_random_seed=args.seed[2],
        fewshot_random_seed=args.seed[3],
        **request_caching_args,
    )
...@@ -342,7 +375,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: ...@@ -342,7 +375,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
if args.log_samples: if args.log_samples:
samples = results.pop("samples") samples = results.pop("samples")
dumped = json.dumps( dumped = json.dumps(
results, indent=2, default=_handle_non_serializable, ensure_ascii=False results, indent=2, default=handle_non_serializable, ensure_ascii=False
) )
if args.show_config: if args.show_config:
print(dumped) print(dumped)
...@@ -359,22 +392,15 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: ...@@ -359,22 +392,15 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
except Exception as e: except Exception as e:
eval_logger.info(f"Logging to Weights and Biases failed due to {e}") eval_logger.info(f"Logging to Weights and Biases failed due to {e}")
if args.output_path: evaluation_tracker.save_results_aggregated(
output_path_file.open("w", encoding="utf-8").write(dumped) results=results, samples=samples if args.log_samples else None
)
if args.log_samples:
for task_name, config in results["configs"].items(): if args.log_samples:
output_name = "{}_{}".format( for task_name, config in results["configs"].items():
re.sub("/|=", "__", args.model_args), task_name evaluation_tracker.save_results_samples(
) task_name=task_name, samples=samples[task_name]
filename = path.joinpath(f"{output_name}.jsonl") )
samples_dumped = json.dumps(
samples[task_name],
indent=2,
default=_handle_non_serializable,
ensure_ascii=False,
)
filename.write_text(samples_dumped, encoding="utf-8")
print( print(
f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, " f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
......
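
The block above wires the CLI's seeds (the new fourth value, `args.seed[3]`, seeds fewshot sampling) and the `EvaluationTracker` saving path into `simple_evaluate`. As a hedged illustration of the same plumbing driven from Python, the sketch below uses a placeholder checkpoint and task; only the keyword names visible in this diff (`bootstrap_iters` and the four seed arguments) are relied on.

```python
# Hedged sketch: driving the same evaluation path from Python rather than the CLI.
# The checkpoint and task below are placeholders; the keyword arguments mirror the
# parameters visible in this diff (simple_evaluate's seeds and bootstrap_iters).
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",  # placeholder model
    tasks=["lambada_openai"],                        # placeholder task
    limit=10,              # testing only -- real metrics should not use limit
    bootstrap_iters=0,     # 0 now skips all stderr computation
    random_seed=0,
    numpy_random_seed=1234,
    torch_random_seed=1234,
    fewshot_random_seed=1234,  # the new fourth seed (args.seed[3] on the CLI)
)
print(results["config"]["fewshot_seed"], results["n-samples"]["lambada_openai"])
```

Aggregated results and per-task samples are then persisted through the tracker's `save_results_aggregated` / `save_results_samples` calls shown above, rather than the ad-hoc file writes that were removed.
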
...@@ -119,9 +119,10 @@ def ter(items): ...@@ -119,9 +119,10 @@ def ter(items):
@register_aggregation("brier_score") @register_aggregation("brier_score")
def brier_score(items): # This is a passthrough function def brier_score(items): # This is a passthrough function
gold, predictions = list(zip(*items)) gold, predictions = list(zip(*items))
bs, num_class = np.array(predictions).shape
gold = list(gold) gold = list(gold)
gold_one_hot = np.eye(np.max(gold) + 1)[gold] gold_one_hot = np.eye(num_class)[gold]
predictions = list(zip(*items))[1]
return np.mean(np.sum((predictions - gold_one_hot) ** 2, axis=1)) return np.mean(np.sum((predictions - gold_one_hot) ** 2, axis=1))
...@@ -428,7 +429,11 @@ def bootstrap_stderr(f, xs, iters): ...@@ -428,7 +429,11 @@ def bootstrap_stderr(f, xs, iters):
return sample_stddev(res) return sample_stddev(res)
def stderr_for_metric(metric, bootstrap_iters): def stderr_for_metric(metric, bootstrap_iters: int):
if bootstrap_iters <= 0:
# return no function (don't compute stderr) if bootstrap iters = 0
return None
bootstrappable = [ bootstrappable = [
median, median,
matthews_corrcoef, matthews_corrcoef,
......
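
The `brier_score` fix above sizes the one-hot gold matrix by the number of predicted classes instead of `max(gold) + 1`, which previously broke whenever the highest class index never appeared as a gold label. A minimal NumPy check of that behaviour (standalone, not harness code):

```python
# Illustrative check of the patched brier_score aggregation: the one-hot matrix is
# sized by the number of predicted classes, so a gold set that never uses the last
# class still lines up with the prediction vectors.
import numpy as np

items = [
    (0, [0.7, 0.2, 0.1]),   # (gold_label, predicted class probabilities)
    (1, [0.1, 0.8, 0.1]),
    (0, [0.5, 0.3, 0.2]),   # class 2 never appears as gold
]
gold, predictions = zip(*items)
_, num_class = np.array(predictions).shape
gold_one_hot = np.eye(num_class)[list(gold)]      # shape (3, 3); old code gave (3, 2)
brier = np.mean(np.sum((np.array(predictions) - gold_one_hot) ** 2, axis=1))
print(round(brier, 4))
```

Relatedly, `stderr_for_metric` now returns `None` when `bootstrap_iters <= 0`, which is what lets callers pass `bootstrap_iters=0` to skip stderr computation entirely.
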
...@@ -5,6 +5,7 @@ import logging ...@@ -5,6 +5,7 @@ import logging
import os import os
from typing import List, Optional, Tuple, Type, TypeVar from typing import List, Optional, Tuple, Type, TypeVar
import transformers
from sqlitedict import SqliteDict from sqlitedict import SqliteDict
from tqdm import tqdm from tqdm import tqdm
...@@ -65,11 +66,11 @@ class LM(abc.ABC): ...@@ -65,11 +66,11 @@ class LM(abc.ABC):
multiple chunks, the last input will still be a full-sized context. multiple chunks, the last input will still be a full-sized context.
Example: Example:
Input tokens: [ 0 1 2 3 4 5 6 7 8 9 ] Input tokens: [ 0 1 2 3 4 5 6 7 8 9 ]
Prefix: EOT Prefix: BOS/EOS
Max context length: 4 Max context length: 4
Resulting input/prediction pairs: Resulting input/prediction pairs:
INPUT: EOT 0 1 2 INPUT: BOS 0 1 2
PRED: 0 1 2 3 PRED: 0 1 2 3
INPUT: 3 4 5 6 INPUT: 3 4 5 6
...@@ -89,7 +90,8 @@ class LM(abc.ABC): ...@@ -89,7 +90,8 @@ class LM(abc.ABC):
:return: list[tuple[float]] :return: list[tuple[float]]
A list of tuples (logprob,) A list of tuples (logprob,)
logprob: float logprob: float
The log probability of `context` conditioned on the EOT token. The log probability of `context` conditioned on the BOS/EOS token.
Can also be overridden for custom cases by `prefix_token_id`.
""" """
pass pass
...@@ -282,6 +284,11 @@ class TemplateLM(LM): ...@@ -282,6 +284,11 @@ class TemplateLM(LM):
def eot_token_id(self): def eot_token_id(self):
pass pass
@property
def prefix_token_id(self):
# token used as a prefix for loglikelihood requests (e.g. when the context is empty)
return self.eot_token_id
@abc.abstractmethod @abc.abstractmethod
def tok_encode(self, string: str, **kwargs): def tok_encode(self, string: str, **kwargs):
pass pass
...@@ -296,11 +303,17 @@ class TemplateLM(LM): ...@@ -296,11 +303,17 @@ class TemplateLM(LM):
continuation = context[-n_spaces:] + continuation continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces] context = context[:-n_spaces]
whole_enc = self.tok_encode(context + continuation) model_class = getattr(self, "AUTO_MODEL_CLASS", None)
context_enc = self.tok_encode(context)
if model_class == transformers.AutoModelForSeq2SeqLM:
context_enc = self.tok_encode(context)
continuation_enc = self.tok_encode(continuation, add_special_tokens=False)
else:
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
context_enc_len = len(context_enc) context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:] continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc return context_enc, continuation_enc
...@@ -310,9 +323,9 @@ class TemplateLM(LM): ...@@ -310,9 +323,9 @@ class TemplateLM(LM):
new_reqs = [] new_reqs = []
for context, continuation in [req.args for req in requests]: for context, continuation in [req.args for req in requests]:
if context == "": if context == "":
# end of text as context # BOS or EOS as context
context_enc, continuation_enc = ( context_enc, continuation_enc = (
[self.eot_token_id], [self.prefix_token_id],
self.tok_encode(continuation), self.tok_encode(continuation),
) )
else: else:
......
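
The `_encode_pair` change above distinguishes seq2seq models, whose context and continuation are tokenized independently (with no special tokens on the continuation), from causal models, where the concatenation is encoded once and split at the context length so merge-sensitive tokenization stays consistent. A standalone sketch with a Hugging Face tokenizer (the harness wraps this in `tok_encode`; the checkpoint name is a placeholder):

```python
# Standalone sketch of the two context/continuation encoding strategies shown above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer
context, continuation = "The capital of France is", " Paris"

# Causal-LM style (default branch): encode the concatenation once, then split at the
# length of the encoded context so merged tokens stay with the continuation.
whole_enc = tok.encode(context + continuation, add_special_tokens=False)
context_enc = tok.encode(context, add_special_tokens=False)
continuation_enc = whole_enc[len(context_enc):]

# Seq2seq style (AutoModelForSeq2SeqLM branch): encoder input and decoder target are
# encoded independently, with no special tokens added to the continuation.
ctx_enc_s2s = tok.encode(context, add_special_tokens=False)
cont_enc_s2s = tok.encode(continuation, add_special_tokens=False)

print(continuation_enc, cont_enc_s2s)
```

Separately, an empty context is now conditioned on `prefix_token_id`, which defaults to the EOT id but can be overridden, e.g. with a BOS token.
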
...@@ -78,6 +78,7 @@ METRIC_REGISTRY = {} ...@@ -78,6 +78,7 @@ METRIC_REGISTRY = {}
METRIC_AGGREGATION_REGISTRY = {} METRIC_AGGREGATION_REGISTRY = {}
AGGREGATION_REGISTRY: Dict[str, Callable[[], Dict[str, Callable]]] = {} AGGREGATION_REGISTRY: Dict[str, Callable[[], Dict[str, Callable]]] = {}
HIGHER_IS_BETTER_REGISTRY = {} HIGHER_IS_BETTER_REGISTRY = {}
FILTER_REGISTRY = {}
DEFAULT_METRIC_REGISTRY = { DEFAULT_METRIC_REGISTRY = {
"loglikelihood": [ "loglikelihood": [
...@@ -170,3 +171,22 @@ def is_higher_better(metric_name) -> bool: ...@@ -170,3 +171,22 @@ def is_higher_better(metric_name) -> bool:
eval_logger.warning( eval_logger.warning(
f"higher_is_better not specified for metric '{metric_name}'!" f"higher_is_better not specified for metric '{metric_name}'!"
) )
def register_filter(name):
def decorate(cls):
if name in FILTER_REGISTRY:
eval_logger.info(
f"Registering filter `{name}` that is already in Registry {FILTER_REGISTRY}"
)
FILTER_REGISTRY[name] = cls
return cls
return decorate
def get_filter(filter_name: str) -> type:
try:
return FILTER_REGISTRY[filter_name]
except KeyError:
eval_logger.warning(f"filter `{filter_name}` is not registered!")
class ContextSampler: class ContextSampler:
def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None: def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
self.rnd = rnd self.rnd = rnd
assert self.rnd, "must pass rnd to FewShotSampler!" if not self.rnd:
raise ValueError(
"A `random.Random` generator argument must be provided to `rnd` of FewShotSampler!"
)
self.task = task self.task = task
self.config = task._config self.config = task._config
......
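
The new `FILTER_REGISTRY`, `register_filter`, and `get_filter` above replace the hard-coded dictionary in `lm_eval/filters/__init__.py` (removed further down). A hedged sketch of registering a project-local filter; the name and class are illustrative, and the `apply(resps, docs)` shape follows the built-in filters shown later in this diff:

```python
# Hedged sketch of the decorator-based filter registry; apply() is assumed to receive
# the per-instance response lists (and the docs, which we ignore here).
from lm_eval.api.filter import Filter
from lm_eval.api.registry import get_filter, register_filter


@register_filter("strip_outer_quotes")
class StripOuterQuotesFilter(Filter):
    """Remove surrounding quote characters from every candidate response."""

    def apply(self, resps, docs):
        return [[r.strip("\"'") for r in inst] for inst in resps]


# get_filter() resolves the name back to the registered class, so task configs and
# build_filter_ensemble can refer to it by its string name.
filter_cls = get_filter("strip_outer_quotes")
print(filter_cls is StripOuterQuotesFilter)  # True
```
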
...@@ -99,7 +99,7 @@ class TaskConfig(dict): ...@@ -99,7 +99,7 @@ class TaskConfig(dict):
def __post_init__(self) -> None: def __post_init__(self) -> None:
if self.generation_kwargs is not None: if self.generation_kwargs is not None:
if self.output_type != "generate_until": if self.output_type != "generate_until":
raise ValueError( eval_logger.warning(
f"[{self.task}] passed `generation_kwargs`, but not using `output_type: generate_until`!" f"[{self.task}] passed `generation_kwargs`, but not using `output_type: generate_until`!"
) )
...@@ -229,6 +229,9 @@ class Task(abc.ABC): ...@@ -229,6 +229,9 @@ class Task(abc.ABC):
self._config: TaskConfig = TaskConfig({**config}) if config else TaskConfig() self._config: TaskConfig = TaskConfig({**config}) if config else TaskConfig()
self._filters = [build_filter_ensemble("none", [["take_first", None]])] self._filters = [build_filter_ensemble("none", [["take_first", None]])]
self.fewshot_rnd: Optional[
random.Random
] = None # purposely induce errors in case of improper usage
def download( def download(
self, self,
...@@ -376,7 +379,7 @@ class Task(abc.ABC): ...@@ -376,7 +379,7 @@ class Task(abc.ABC):
# used with caching # used with caching
og_limit = limit og_limit = limit
cache_key = f"requests-{self._config.task}" cache_key = f"requests-{self._config.task}-{self.config.num_fewshot}shot-rank{rank}-world_size{world_size}"
cached_instances = load_from_cache(file_name=cache_key) cached_instances = load_from_cache(file_name=cache_key)
...@@ -520,7 +523,7 @@ class Task(abc.ABC): ...@@ -520,7 +523,7 @@ class Task(abc.ABC):
self, self,
doc, doc,
num_fewshot, num_fewshot,
rnd=random.Random(1234), rnd=None,
description=None, description=None,
): ):
"""Returns a fewshot context string that is made up of a prepended description """Returns a fewshot context string that is made up of a prepended description
...@@ -539,9 +542,12 @@ class Task(abc.ABC): ...@@ -539,9 +542,12 @@ class Task(abc.ABC):
The fewshot context. The fewshot context.
""" """
if rnd is None: if rnd is None:
raise ValueError( if self.fewshot_rnd is not None:
"A `random.Random` generator argument must be provided to `rnd`" rnd = self.fewshot_rnd
) else:
raise ValueError(
"A `random.Random` generator argument must be provided to `rnd`"
)
description = description if description else "" description = description if description else ""
...@@ -632,6 +638,11 @@ class Task(abc.ABC): ...@@ -632,6 +638,11 @@ class Task(abc.ABC):
setattr(self._config, "metric_list", [{"metric": metric_name}]) setattr(self._config, "metric_list", [{"metric": metric_name}])
setattr(self._config, "process_results", None) setattr(self._config, "process_results", None)
def set_fewshot_seed(self, seed: Optional[int] = None) -> None:
self.fewshot_rnd = random.Random(seed)
if hasattr(self, "sampler"):
self.sampler.rnd = self.fewshot_rnd
@property @property
def eval_docs(self) -> Union[datasets.Dataset, List[dict]]: def eval_docs(self) -> Union[datasets.Dataset, List[dict]]:
if self.has_test_docs(): if self.has_test_docs():
...@@ -808,11 +819,29 @@ class ConfigurableTask(Task): ...@@ -808,11 +819,29 @@ class ConfigurableTask(Task):
self.prompt = None self.prompt = None
if self.fewshot_docs() is not None: if self.fewshot_docs() is not None:
self.sampler = samplers.get_sampler( self.fewshot_rnd = (
random.Random()
) # setting with no seed, to be overridden at a later time
config_sampler: Union[str, Callable] = (
self.config.fewshot_config.get("sampler", "default") self.config.fewshot_config.get("sampler", "default")
if self.config.fewshot_config if self.config.fewshot_config
else "default" else "default"
)(list(self.fewshot_docs()), self, rnd=random.Random(1234)) )
if isinstance(config_sampler, str):
self.sampler = samplers.get_sampler(config_sampler)(
list(self.fewshot_docs()), self, rnd=self.fewshot_rnd
)
elif callable(config_sampler) and issubclass(
config_sampler, samplers.ContextSampler
):
self.sampler = config_sampler(
docs=list(self.fewshot_docs()), task=self, rnd=self.fewshot_rnd
)
else:
raise TypeError(
f"fewshot_config.sampler should be a string or callable of ContextSampler type, "
f"not {type(config_sampler)}"
)
self.task_docs = self.eval_docs self.task_docs = self.eval_docs
......
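
`ConfigurableTask` no longer freezes its fewshot RNG to `random.Random(1234)`: it starts unseeded and is seeded later through `set_fewshot_seed`, and `fewshot_config["sampler"]` may now be either a registered sampler name or a `ContextSampler` subclass. A hedged sketch (the task name is a placeholder and the custom sampler is illustrative only):

```python
# Hedged sketch: seeding a task's fewshot sampler explicitly and defining a custom
# ContextSampler. The sample() override assumes the base class stores its docs on
# self.docs, as the built-in sampler does.
from lm_eval.api.samplers import ContextSampler
from lm_eval.tasks import TaskManager, get_task_dict


class FirstNSampler(ContextSampler):
    """Deterministically take the first n fewshot docs instead of sampling."""

    def sample(self, n):
        return self.docs[:n]


task = get_task_dict(["hellaswag"], TaskManager())["hellaswag"]  # placeholder task
task.set_fewshot_seed(seed=1234)  # what simple_evaluate now does via fewshot_random_seed
```

A subclass like `FirstNSampler` can be supplied through `fewshot_config`'s `sampler` entry when a task config is built programmatically; string names still go through `samplers.get_sampler` as before.
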
import itertools import itertools
import json
import logging import logging
import random import random
import time
from collections import defaultdict from collections import defaultdict
from typing import TYPE_CHECKING, List, Optional, Union from typing import TYPE_CHECKING, List, Optional, Union
...@@ -19,9 +21,15 @@ from lm_eval.evaluator_utils import ( ...@@ -19,9 +21,15 @@ from lm_eval.evaluator_utils import (
print_writeout, print_writeout,
run_task_tests, run_task_tests,
) )
from lm_eval.logging_utils import add_env_info, get_git_commit_hash from lm_eval.loggers.utils import add_env_info, get_git_commit_hash
from lm_eval.tasks import TaskManager, get_task_dict from lm_eval.tasks import TaskManager, get_task_dict
from lm_eval.utils import eval_logger, positional_deprecated, simple_parse_args_string from lm_eval.utils import (
eval_logger,
handle_non_serializable,
hash_string,
positional_deprecated,
simple_parse_args_string,
)
if TYPE_CHECKING: if TYPE_CHECKING:
...@@ -54,6 +62,7 @@ def simple_evaluate( ...@@ -54,6 +62,7 @@ def simple_evaluate(
random_seed: int = 0, random_seed: int = 0,
numpy_random_seed: int = 1234, numpy_random_seed: int = 1234,
torch_random_seed: int = 1234, torch_random_seed: int = 1234,
fewshot_random_seed: int = 1234,
): ):
"""Instantiate and evaluate a model on a list of tasks. """Instantiate and evaluate a model on a list of tasks.
...@@ -83,7 +92,7 @@ def simple_evaluate( ...@@ -83,7 +92,7 @@ def simple_evaluate(
:param limit: int or float, optional :param limit: int or float, optional
Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples. Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples.
:param bootstrap_iters: :param bootstrap_iters:
Number of iterations for bootstrap statistics Number of iterations for bootstrap statistics, used when calculating stderrs. Set to 0 to skip all stderr calculations.
:param check_integrity: bool :param check_integrity: bool
Whether to run the relevant part of the test suite for the tasks Whether to run the relevant part of the test suite for the tasks
:param write_out: bool :param write_out: bool
...@@ -101,11 +110,14 @@ def simple_evaluate( ...@@ -101,11 +110,14 @@ def simple_evaluate(
Random seed for numpy. If set to None, the seed will not be set. Random seed for numpy. If set to None, the seed will not be set.
:param torch_random_seed: int :param torch_random_seed: int
Random seed for torch. If set to None, the seed will not be set. Random seed for torch. If set to None, the seed will not be set.
:param fewshot_random_seed: int
Random seed for the fewshot sampler's random generator. If set to None, no fixed seed is used.
:return :return
Dictionary of results Dictionary of results
""" """
eval_logger.setLevel(getattr(logging, f"{verbosity}")) eval_logger.setLevel(getattr(logging, f"{verbosity}"))
start_date = time.time()
if delete_requests_cache: if delete_requests_cache:
eval_logger.info("Deleting requests cache...") eval_logger.info("Deleting requests cache...")
...@@ -146,9 +158,13 @@ def simple_evaluate( ...@@ -146,9 +158,13 @@ def simple_evaluate(
if isinstance(model, str): if isinstance(model, str):
if model_args is None: if model_args is None:
eval_logger.warning("model_args not specified. Using defaults.")
model_args = "" model_args = ""
if isinstance(model_args, dict): if isinstance(model_args, dict):
eval_logger.info(
f"Initializing {model} model, with arguments: {model_args}"
)
lm = lm_eval.api.registry.get_model(model).create_from_arg_obj( lm = lm_eval.api.registry.get_model(model).create_from_arg_obj(
model_args, model_args,
{ {
...@@ -159,6 +175,9 @@ def simple_evaluate( ...@@ -159,6 +175,9 @@ def simple_evaluate(
) )
else: else:
eval_logger.info(
f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}"
)
lm = lm_eval.api.registry.get_model(model).create_from_arg_string( lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
model_args, model_args,
{ {
...@@ -170,6 +189,7 @@ def simple_evaluate( ...@@ -170,6 +189,7 @@ def simple_evaluate(
else: else:
if not isinstance(model, lm_eval.api.model.LM): if not isinstance(model, lm_eval.api.model.LM):
raise TypeError raise TypeError
eval_logger.info("Using pre-initialized model")
lm = model lm = model
if use_cache is not None: if use_cache is not None:
...@@ -187,10 +207,6 @@ def simple_evaluate( ...@@ -187,10 +207,6 @@ def simple_evaluate(
if task_manager is None: if task_manager is None:
task_manager = TaskManager(verbosity) task_manager = TaskManager(verbosity)
eval_logger.info(
"get_task_dict has been updated to accept an optional argument, `task_manager`"
"Read more here:https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
)
task_dict = get_task_dict(tasks, task_manager) task_dict = get_task_dict(tasks, task_manager)
for task_name in task_dict.keys(): for task_name in task_dict.keys():
task_obj = task_dict[task_name] task_obj = task_dict[task_name]
...@@ -213,6 +229,8 @@ def simple_evaluate( ...@@ -213,6 +229,8 @@ def simple_evaluate(
# we have to change the class properties post-hoc. This is pretty hacky. # we have to change the class properties post-hoc. This is pretty hacky.
task_obj.override_metric(metric_name="bypass") task_obj.override_metric(metric_name="bypass")
# override tasks' fewshot values to the provided num_fewshot arg value
# except if tasks have it set to 0 manually in their configs--then we should never overwrite that
if num_fewshot is not None: if num_fewshot is not None:
if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0: if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
eval_logger.info( eval_logger.info(
...@@ -223,6 +241,14 @@ def simple_evaluate( ...@@ -223,6 +241,14 @@ def simple_evaluate(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}" f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
) )
task_obj.set_config(key="num_fewshot", value=num_fewshot) task_obj.set_config(key="num_fewshot", value=num_fewshot)
task_obj.set_fewshot_seed(seed=fewshot_random_seed)
eval_logger.info(
f"Setting fewshot random generator seed to {fewshot_random_seed}"
)
else:
# if num_fewshot not provided, and the task does not define a default one, default to 0
if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None:
task_obj.set_config(key="num_fewshot", value=0)
if check_integrity: if check_integrity:
run_task_tests(task_list=tasks) run_task_tests(task_list=tasks)
...@@ -251,17 +277,30 @@ def simple_evaluate( ...@@ -251,17 +277,30 @@ def simple_evaluate(
results["config"] = { results["config"] = {
"model": model_name, "model": model_name,
"model_args": model_args, "model_args": model_args,
"batch_size": batch_size,
"batch_sizes": (
list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []
),
"device": device,
"use_cache": use_cache,
"limit": limit,
"bootstrap_iters": bootstrap_iters,
"gen_kwargs": gen_kwargs,
} }
# add more detailed model info if available
if isinstance(lm, lm_eval.models.huggingface.HFLM):
results["config"].update(lm.get_model_info())
# add info about execution
results["config"].update(
{
"batch_size": batch_size,
"batch_sizes": (
list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []
),
"device": device,
"use_cache": use_cache,
"limit": limit,
"bootstrap_iters": bootstrap_iters,
"gen_kwargs": gen_kwargs,
"random_seed": random_seed,
"numpy_seed": numpy_random_seed,
"torch_seed": torch_random_seed,
"fewshot_seed": fewshot_random_seed,
}
)
results["git_hash"] = get_git_commit_hash() results["git_hash"] = get_git_commit_hash()
results["date"] = start_date
add_env_info(results) # additional environment info to results add_env_info(results) # additional environment info to results
return results return results
else: else:
...@@ -289,7 +328,7 @@ def evaluate( ...@@ -289,7 +328,7 @@ def evaluate(
:param limit: int, optional :param limit: int, optional
Limit the number of examples per task (only use this for testing) Limit the number of examples per task (only use this for testing)
:param bootstrap_iters: :param bootstrap_iters:
Number of iterations for bootstrap statistics Number of iterations for bootstrap statistics, used when calculating stderr. Set to 0 to skip all stderr calculations.
:param write_out: bool :param write_out: bool
If True, write out an example document and model input for checking task integrity If True, write out an example document and model input for checking task integrity
:param log_samples: bool :param log_samples: bool
...@@ -327,7 +366,6 @@ def evaluate( ...@@ -327,7 +366,6 @@ def evaluate(
eval_logger.debug( eval_logger.debug(
f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}" f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
) )
if write_out: if write_out:
print_writeout(task) print_writeout(task)
# aggregate Instances by LM method requested to get output. # aggregate Instances by LM method requested to get output.
...@@ -413,6 +451,16 @@ def evaluate( ...@@ -413,6 +451,16 @@ def evaluate(
"filtered_resps": [ "filtered_resps": [
req.filtered_resps[filter_key] for req in requests req.filtered_resps[filter_key] for req in requests
], ],
"doc_hash": hash_string(
json.dumps(
requests[0].doc,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
),
"prompt_hash": hash_string(requests[0].arguments[0]),
"target_hash": hash_string(str(target)),
} }
example.update(metrics) example.update(metrics)
task_output.logged_samples.append(example) task_output.logged_samples.append(example)
...@@ -543,6 +591,16 @@ def evaluate( ...@@ -543,6 +591,16 @@ def evaluate(
"configs": dict(sorted(configs.items())), "configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())), "versions": dict(sorted(versions.items())),
"n-shot": dict(sorted(num_fewshot.items())), "n-shot": dict(sorted(num_fewshot.items())),
"n-samples": {
task_output.task_name: {
"original": len(task_output.task.eval_docs),
"effective": min(
limit if limit else len(task_output.task.eval_docs),
len(task_output.task.eval_docs),
),
}
for task_output in eval_tasks
},
} }
if log_samples: if log_samples:
results_dict["samples"] = dict(samples) results_dict["samples"] = dict(samples)
......
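
Logged samples now carry stable fingerprints of the document, prompt, and target via `hash_string` (imported from `lm_eval.utils`). The sketch below only illustrates the shape of those fields; it substitutes a plain SHA-256 digest and a `str` fallback for the library's `hash_string` and `handle_non_serializable` helpers.

```python
# Hedged sketch of the per-sample fingerprints now stored alongside logged samples.
import hashlib
import json


def hash_string(s: str) -> str:
    # stand-in for lm_eval.utils.hash_string, for illustration only
    return hashlib.sha256(s.encode("utf-8")).hexdigest()


doc = {"question": "2 + 2 = ?", "answer": 4}
prompt = "Q: 2 + 2 = ?\nA:"
target = " 4"

doc_hash = hash_string(json.dumps(doc, indent=2, default=str, ensure_ascii=False))
prompt_hash = hash_string(prompt)
target_hash = hash_string(str(target))
print(doc_hash[:12], prompt_hash[:12], target_hash[:12])
```
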
...@@ -97,7 +97,7 @@ class TaskOutput: ...@@ -97,7 +97,7 @@ class TaskOutput:
metric_key = f"{metric},{filter_key}" metric_key = f"{metric},{filter_key}"
self.agg_metrics[metric_key] = agg_fn(items) self.agg_metrics[metric_key] = agg_fn(items)
self.sample_len = len(items) # TODO: same sample size for each metric? self.sample_len = len(items) # TODO: same sample size for each metric?
if bootstrap_iters: if isinstance(bootstrap_iters, int):
stderr_fn = metrics.stderr_for_metric( stderr_fn = metrics.stderr_for_metric(
metric=agg_fn, metric=agg_fn,
bootstrap_iters=min(bootstrap_iters, 100) bootstrap_iters=min(bootstrap_iters, 100)
...@@ -107,6 +107,10 @@ class TaskOutput: ...@@ -107,6 +107,10 @@ class TaskOutput:
self.agg_metrics[f"{metric}_stderr,{filter_key}"] = ( self.agg_metrics[f"{metric}_stderr,{filter_key}"] = (
stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A" stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A"
) )
else:
raise ValueError(
f"Received bootstrap_iters '{bootstrap_iters}' but expected an integer. Set to 0 to turn off stderr calculations."
)
def __repr__(self): def __repr__(self):
return ( return (
......
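
`TaskOutput` now insists that `bootstrap_iters` is an integer and treats `0` as "skip stderr", which pairs with the early return added to `stderr_for_metric` above. A small check of that contract:

```python
# stderr_for_metric returns None when bootstrap iterations are disabled, so the
# aggregated result simply reports "N/A" for the stderr column.
from lm_eval.api import metrics

stderr_fn = metrics.stderr_for_metric(metric=metrics.median, bootstrap_iters=0)
print(stderr_fn)  # None -> no stderr is computed
```
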
from functools import partial from functools import partial
from typing import List, Union from typing import List
from lm_eval.api.filter import FilterEnsemble from lm_eval.api.filter import FilterEnsemble
from lm_eval.api.registry import get_filter
from . import extraction, selection, transformation from . import extraction, selection, transformation
FILTER_REGISTRY = {
"take_first": selection.TakeFirstFilter,
"regex": extraction.RegexFilter,
"majority_vote": selection.MajorityVoteFilter,
"take_first_k": selection.TakeKFilter,
"remove_whitespace": extraction.WhitespaceFilter,
"lowercase": transformation.LowercaseFilter,
"uppercase": transformation.UppercaseFilter,
"map": transformation.MapFilter,
"multi_choice_regex": extraction.MultiChoiceRegexFilter,
# TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
# that takes an input and returns a scalar and then should select the max reward,
# or should implement different filters for different ways of handling a reward model's inference.
# "arg_max": selection.ArgMaxFilter,
}
def get_filter(filter_name: str) -> Union[type, str]:
if filter_name in FILTER_REGISTRY:
return FILTER_REGISTRY[filter_name]
else:
return filter_name
def build_filter_ensemble( def build_filter_ensemble(
filter_name: str, components: List[List[str]] filter_name: str, components: List[List[str]]
) -> FilterEnsemble: ) -> FilterEnsemble:
......
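
`build_filter_ensemble` now resolves component names through the shared registry instead of a module-local dict. A minimal sketch reproducing the default ensemble a `Task` builds for itself (see `self._filters` in the task constructor above):

```python
from lm_eval.filters import build_filter_ensemble

# "take_first" is resolved via get_filter(); None means no extra constructor kwargs.
ensemble = build_filter_ensemble("none", [["take_first", None]])
print(ensemble.name)  # the ensemble keeps the name it was built with: "none"
```
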
from lm_eval.api.filter import Filter from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("decontaminate")
class DecontaminationFilter(Filter): class DecontaminationFilter(Filter):
""" """
......
...@@ -3,8 +3,10 @@ import sys ...@@ -3,8 +3,10 @@ import sys
import unicodedata import unicodedata
from lm_eval.api.filter import Filter from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("regex")
class RegexFilter(Filter): class RegexFilter(Filter):
""" """ """ """
...@@ -49,6 +51,7 @@ class RegexFilter(Filter): ...@@ -49,6 +51,7 @@ class RegexFilter(Filter):
return filtered_resps return filtered_resps
@register_filter("remove_whitespace")
class WhitespaceFilter(Filter): class WhitespaceFilter(Filter):
""" """ """ """
...@@ -71,6 +74,7 @@ class WhitespaceFilter(Filter): ...@@ -71,6 +74,7 @@ class WhitespaceFilter(Filter):
return filtered_resps return filtered_resps
@register_filter("multi_choice_regex")
class MultiChoiceRegexFilter(RegexFilter): class MultiChoiceRegexFilter(RegexFilter):
""" """
A filter used to extract a model's answer on multiple choice questions with A filter used to extract a model's answer on multiple choice questions with
......
from collections import Counter from collections import Counter
from lm_eval.api.filter import Filter from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
# TODO: implement "arg_max" filter. either it should take in an arbitrary "scoring"/reward function
# that takes an input and returns a scalar and then should select the max reward,
# or should implement different filters for different ways of handling a reward model's inference.
@register_filter("take_first")
class TakeFirstFilter(Filter): class TakeFirstFilter(Filter):
def __init__(self) -> None: def __init__(self) -> None:
""" """
...@@ -16,6 +23,7 @@ class TakeFirstFilter(Filter): ...@@ -16,6 +23,7 @@ class TakeFirstFilter(Filter):
return map(lambda r: r[0], resps) return map(lambda r: r[0], resps)
@register_filter("take_first_k")
class TakeKFilter(Filter): class TakeKFilter(Filter):
def __init__(self, **kwargs) -> None: def __init__(self, **kwargs) -> None:
self.k = kwargs.pop("k") self.k = kwargs.pop("k")
...@@ -32,6 +40,7 @@ class TakeKFilter(Filter): ...@@ -32,6 +40,7 @@ class TakeKFilter(Filter):
return map(lambda r: r[: self.k], resps) return map(lambda r: r[: self.k], resps)
@register_filter("majority_vote")
class MajorityVoteFilter(Filter): class MajorityVoteFilter(Filter):
def __init__(self) -> None: def __init__(self) -> None:
""" """
......
from lm_eval.api.filter import Filter from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("lowercase")
class LowercaseFilter(Filter): class LowercaseFilter(Filter):
def __init__(self) -> None: def __init__(self) -> None:
pass pass
...@@ -12,6 +14,7 @@ class LowercaseFilter(Filter): ...@@ -12,6 +14,7 @@ class LowercaseFilter(Filter):
return [filter_set(resp) for resp in resps] return [filter_set(resp) for resp in resps]
@register_filter("uppercase")
class UppercaseFilter(Filter): class UppercaseFilter(Filter):
def __init__(self) -> None: def __init__(self) -> None:
pass pass
...@@ -23,6 +26,7 @@ class UppercaseFilter(Filter): ...@@ -23,6 +26,7 @@ class UppercaseFilter(Filter):
return [filter_set(resp) for resp in resps] return [filter_set(resp) for resp in resps]
@register_filter("map")
class MapFilter(Filter): class MapFilter(Filter):
def __init__(self, mapping_dict: dict = None, default_value=None) -> None: def __init__(self, mapping_dict: dict = None, default_value=None) -> None:
""" """
......
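
With the decorators above, every built-in filter is now discoverable by name. A hedged sketch using the `map` filter end to end; the `apply(resps, docs)` call shape mirrors the built-in implementations and is assumed here:

```python
# Hedged sketch: resolve the newly registered "map" filter by name and use it to
# post-process decoded answers.
from lm_eval.api.registry import get_filter

MapFilter = get_filter("map")
yn_to_int = MapFilter(mapping_dict={"yes": 1, "no": 0}, default_value=-1)

resps = [["yes"], ["no"], ["maybe"]]      # one candidate answer per instance
print(yn_to_int.apply(resps, docs=None))  # expected: [[1], [0], [-1]]
```
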