diff --git a/.github/workflows/new_tasks.yml b/.github/workflows/new_tasks.yml index 0df7111cfb0025f56b5f78f2dda068e9886d206b..b748aab5c06533fd3f8d41cfd519841a9af93f75 100644 --- a/.github/workflows/new_tasks.yml +++ b/.github/workflows/new_tasks.yml @@ -56,7 +56,7 @@ jobs: if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' run: | python -m pip install --upgrade pip - pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu + pip install -e '.[dev,ifeval]' --extra-index-url https://download.pytorch.org/whl/cpu # Install optional git dependencies # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index a3e25429a4a761b3bd922b1026455f65a52550d2..49b85fb9a4541f6c6dfecd4395a4544dc4ec5aac 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -56,12 +56,37 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -e '.[dev,anthropic,sentencepiece,optimum,deepsparse,sparseml]' --extra-index-url https://download.pytorch.org/whl/cpu + pip install -e '.[dev,sentencepiece,api]' --extra-index-url https://download.pytorch.org/whl/cpu # Install optional git dependencies # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Test with pytest - run: python -m pytest --showlocals -s -vv -n=auto + run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py + - name: Archive artifacts + uses: actions/upload-artifact@v3 + with: + name: output_results + path: | + test_logs/* + 
testmodels: + name: External LM Tests + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Set up Python 3.8 + uses: actions/setup-python@v5 + with: + python-version: 3.8 + cache: pip + cache-dependency-path: pyproject.toml + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu + - name: Test with pytest + run: python -m pytest tests/models --showlocals -s -vv - name: Archive artifacts uses: actions/upload-artifact@v3 with: diff --git a/.gitignore b/.gitignore index 020622dfdc82dd22630c522c0f9841754aefa638..c9278761b0ce9499b0c20afe0151b89121443512 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ temp __pycache__ .ipynb_checkpoints temp +test_logs/ # IPython profile_default/ ipython_config.py diff --git a/CODEOWNERS b/CODEOWNERS index 9e6375466f1419af1e4c3450c5fbf7934f89b82f..5a08ab244564ec729e87ec6a0603c5cfeef721c3 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* @haileyschoelkopf @lintangsutawika +* @haileyschoelkopf @lintangsutawika @baberabb diff --git a/README.md b/README.md index ef00b8a68aa969221e920c1a3c44c7f9efc3ee7c..e973cdb7e895152b889f93cd22ab071c2c37c0e7 100644 --- a/README.md +++ b/README.md @@ -2,11 +2,21 @@ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10256836.svg)](https://doi.org/10.5281/zenodo.10256836) +--- + +*Latest News 📣* + +- [2024/07] [API model](docs/API_guide.md) support has been updated and refactored, introducing support for batched and async requests, and making it significantly easier to customize and use for your own purposes. **To run Llama 405B, we recommend using VLLM's OpenAI-compliant API to host the model, and use the `local-completions` model type to evaluate the model.** +- [2024/07] New Open LLM Leaderboard tasks have been added ! 
You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group. + +--- + ## Announcement **A new v0.4.0 release of lm-evaluation-harness is available** ! New updates and features include: +- **New Open LLM Leaderboard tasks have been added ! You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group.** - Internal refactoring - Config-based task creation and configuration - Easier import and sharing of externally-defined task config YAMLs @@ -20,6 +30,8 @@ Please see our updated documentation pages in `docs/` for more details. Development will be continuing on the `main` branch, and we encourage you to give us feedback on what features are desired and how to improve the library further, or ask questions, either in issues or PRs on GitHub, or in the [EleutherAI discord](https://discord.gg/eleutherai)! +--- + ## Overview This project provides a unified framework to test generative language models on a large number of different evaluation tasks. @@ -94,7 +106,7 @@ lm_eval --model hf \ #### Multi-GPU Evaluation with Hugging Face `accelerate` -We support two main ways of using Hugging Face's [accelerate 🚀](https://github.com/huggingface/accelerate) library for multi-GPU evaluation. +We support three main ways of using Hugging Face's [accelerate 🚀](https://github.com/huggingface/accelerate) library for multi-GPU evaluation. 
To perform *data-parallel evaluation* (where each GPU loads a **separate full copy** of the model), we leverage the `accelerate` launcher as follows: @@ -111,7 +123,7 @@ For cases where your model can fit on a single GPU, this allows you to evaluate The second way of using `accelerate` for multi-GPU evaluation is when your model is *too large to fit on a single GPU.* -In this setting, run the library *outside of the `accelerate` launcher*, but passing `parallelize=True` to `--model_args` as follows: +In this setting, run the library *outside the `accelerate` launcher*, but passing `parallelize=True` to `--model_args` as follows: ``` lm_eval --model hf \ @@ -128,7 +140,19 @@ For more advanced users or even larger models, we allow for the following argume - `max_cpu_memory`: the max amount of CPU memory to use when offloading the model weights to RAM. - `offload_folder`: a folder where model weights will be offloaded to disk if needed. -These two options (`accelerate launch` and `parallelize=True`) are mutually exclusive. +The third option is to use both at the same time. This will allow you to take advantage of both data parallelism and model sharding, and is especially useful for models that are too large to fit on a single GPU. + +``` +accelerate launch --multi_gpu --num_processes {nb_of_copies_of_your_model} \ + -m lm_eval --model hf \ + --tasks lambada_openai,arc_easy \ + --model_args parallelize=True \ + --batch_size 16 +``` + +To learn more about model parallelism and how to use it with the `accelerate` library, see the [accelerate documentation](https://huggingface.co/docs/transformers/v4.15.0/en/parallelism) + +**Warning: We do not natively support multi-node evaluation using the `hf` model type! 
Please reference [our GPT-NeoX library integration](https://github.com/EleutherAI/gpt-neox/blob/main/eval.py) for an example of code in which a custom multi-machine evaluation script is written.** **Note: we do not currently support multi-node evaluations natively, and advise using either an externally hosted server to run inference requests against, or creating a custom integration with your distributed framework [as is done for the GPT-NeoX library](https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py).** @@ -216,26 +240,26 @@ lm_eval --model openai-completions \ We also support using your own local inference server with servers that mirror the OpenAI Completions and ChatCompletions APIs. ```bash -lm_eval --model local-chat-completions --tasks gsm8k --model_args model=facebook/opt-125m,base_url=http://{yourip}:8000/v1 +lm_eval --model local-completions --tasks gsm8k --model_args model=facebook/opt-125m,base_url=http://{yourip}:8000/v1/completions,num_concurrent=1,max_retries=3,tokenized_requests=False,batch_size=16 ``` -Note that for externally hosted models, configs such as `--device` and `--batch_size` should not be used and do not function. Just like you can use `--model_args` to pass arbitrary arguments to the model constructor for local models, you can use it to pass arbitrary arguments to the model API for hosted models. See the documentation of the hosting service for information on what arguments they support. - -| API or Inference Server | Implemented? 
| `--model ` name | Models supported: | Request Types: | -|---------------------------------------------------------------------------------------------------------------------------|---------------------------------|---------------------------------------------------------------------|-----------------------------------------------------------------------------------------------|------------------------------------------------------------| -| OpenAI Completions | :heavy_check_mark: | `openai-completions`, `local-completions` | All OpenAI Completions API models | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| OpenAI ChatCompletions | :heavy_check_mark: | `openai-chat-completions`, `local-chat-completions` | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt) | `generate_until` (no logprobs) | -| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) | -| Anthropic Chat | :heavy_check_mark: | `anthropic-chat`, `anthropic-chat-completions` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/docs/models-overview) | `generate_until` (no logprobs) | -| Textsynth | :heavy_check_mark: | `textsynth` | [All supported engines](https://textsynth.com/documentation.html#engines) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| Cohere | [:hourglass: - blocked on Cohere API bug](https://github.com/EleutherAI/lm-evaluation-harness/pull/395) | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark: | `gguf`, `ggml` | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp) | `generate_until`, `loglikelihood`, (perplexity evaluation not yet 
implemented) | -| vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| Huggingface Optimum (Causal LMs) | ✔️ | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... | -| Neuron via AWS Inf2 (Causal LMs) | ✔️ | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... | -| [Neural Magic DeepSparse](https://github.com/neuralmagic/deepsparse) | ✔️ | `deepsparse` | Any LM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub with the "deepsparse" tag](https://huggingface.co/models?other=deepsparse) | `generate_until`, `loglikelihood` | ... | -| [Neural Magic SparseML](https://github.com/neuralmagic/sparseml) | ✔️ | `sparseml` | Any decoder-only AutoModelForCausalLM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub](https://huggingface.co/neuralmagic). Especially useful for models with quantization like [`zoo:llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized`](https://sparsezoo.neuralmagic.com/models/llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... | -| Your local inference server! 
| :heavy_check_mark: | `local-completions` or `local-chat-completions` (using `openai-chat-completions` model type) | Any server address that accepts GET requests using HF models and mirror's OpenAI's Completions or ChatCompletions interface | `generate_until` | | ... | +Note that for externally hosted models, configs such as `--device` which relate to where to place a local model should not be used and do not function. Just like you can use `--model_args` to pass arbitrary arguments to the model constructor for local models, you can use it to pass arbitrary arguments to the model API for hosted models. See the documentation of the hosting service for information on what arguments they support. + +| API or Inference Server | Implemented? | `--model ` name | Models supported: | Request Types: | +|---------------------------------------------------------------------------------------------------------------------------|---------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------| +| OpenAI Completions | :heavy_check_mark: | `openai-completions`, `local-completions` | All OpenAI Completions API models | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| OpenAI ChatCompletions | :heavy_check_mark: | `openai-chat-completions`, `local-chat-completions` | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt) | `generate_until` (no logprobs) | +| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no 
logprobs) | +| Anthropic Chat | :heavy_check_mark: | `anthropic-chat`, `anthropic-chat-completions` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/docs/models-overview) | `generate_until` (no logprobs) | +| Textsynth | :heavy_check_mark: | `textsynth` | [All supported engines](https://textsynth.com/documentation.html#engines) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| Cohere | [:hourglass: - blocked on Cohere API bug](https://github.com/EleutherAI/lm-evaluation-harness/pull/395) | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark: | `gguf`, `ggml` | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp) | `generate_until`, `loglikelihood`, (perplexity evaluation not yet implemented) | +| vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| Huggingface Optimum (Causal LMs) | ✔️ | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... | +| Neuron via AWS Inf2 (Causal LMs) | ✔️ | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... 
| +| [Neural Magic DeepSparse](https://github.com/neuralmagic/deepsparse) | ✔️ | `deepsparse` | Any LM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub with the "deepsparse" tag](https://huggingface.co/models?other=deepsparse) | `generate_until`, `loglikelihood` | ... | +| [Neural Magic SparseML](https://github.com/neuralmagic/sparseml) | ✔️ | `sparseml` | Any decoder-only AutoModelForCausalLM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub](https://huggingface.co/neuralmagic). Especially useful for models with quantization like [`zoo:llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized`](https://sparsezoo.neuralmagic.com/models/llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... | +| Your local inference server! | :heavy_check_mark: | `local-completions` or `local-chat-completions` | Support for OpenAI API-compatible servers, with easy customization for other APIs. | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | ... | Models which do not supply logits or logprobs can be used with tasks of type `generate_until` only, while local models, or APIs that supply logprobs/logits of their prompts, can be run on all task types: `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`. 
@@ -435,29 +459,27 @@ The best way to get support is to open an issue on this repo or join the [Eleuth ## Optional Extras Extras dependencies can be installed via `pip install -e ".[NAME]"` -| Name | Use | -|---------------|---------------------------------------| -| anthropic | For using Anthropic's models | -| deepsparse | For running NM's DeepSparse models | -| dev | For linting PRs and contributions | -| gptq | For loading models with GPTQ | -| hf_transfer | For speeding up HF Hub file downloads | -| ifeval | For running the IFEval task | -| neuronx | For running on AWS inf2 instances | -| mamba | For loading Mamba SSM models | -| math | For running math task answer checking | -| multilingual | For multilingual tokenizers | -| openai | For using OpenAI's models | -| optimum | For running Intel OpenVINO models | -| promptsource | For using PromptSource prompts | -| sentencepiece | For using the sentencepiece tokenizer | -| sparseml | For using NM's SparseML models | -| testing | For running library test suite | -| unitxt | For IBM's unitxt dataset tasks | -| vllm | For loading models with vLLM | -| zeno | For visualizing results with Zeno | -|---------------|---------------------------------------| -| all | Loads all extras (not recommended) | +| Name | Use | +|-----------------|----------------------------------------------| +| api | For using api models (Anthropic, OpenAI API) | +| deepsparse | For running NM's DeepSparse models | +| dev | For linting PRs and contributions | +| gptq | For loading models with GPTQ | +| hf_transfer | For speeding up HF Hub file downloads | +| ifeval | For running the IFEval task | +| neuronx | For running on AWS inf2 instances | +| mamba | For loading Mamba SSM models | +| math | For running math task answer checking | +| multilingual | For multilingual tokenizers | +| optimum | For running Intel OpenVINO models | +| promptsource | For using PromptSource prompts | +| sentencepiece | For using the sentencepiece tokenizer | +| 
sparseml | For using NM's SparseML models | +| testing | For running library test suite | +| vllm | For loading models with vLLM | +| zeno | For visualizing results with Zeno | +| --------------- | --------------------------------------- | +| all | Loads all extras (not recommended) | ## Cite as @@ -465,11 +487,11 @@ Extras dependencies can be installed via `pip install -e ".[NAME]"` @misc{eval-harness, author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy}, title = {A framework for few-shot language model evaluation}, - month = 12, - year = 2023, + month = 07, + year = 2024, publisher = {Zenodo}, - version = {v0.4.0}, - doi = {10.5281/zenodo.10256836}, - url = {https://zenodo.org/records/10256836} + version = {v0.4.3}, + doi = {10.5281/zenodo.12608602}, + url = {https://zenodo.org/records/12608602} } ``` diff --git a/docs/API_guide.md b/docs/API_guide.md new file mode 100644 index 0000000000000000000000000000000000000000..de2f1420fe4b0901ae45288f47c35c6a663d2f71 --- /dev/null +++ b/docs/API_guide.md @@ -0,0 +1,198 @@ +# TemplateAPI Usage Guide + +The `TemplateAPI` class is a versatile superclass designed to facilitate the integration of various API-based language models into the lm-evaluation-harness framework. This guide will explain how to use and extend the `TemplateAPI` class to implement your own API models. 
If your API implements the OpenAI API you can use the `local-completions` or the `local-chat-completions` (defined [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py)) model types, which can also serve as examples of how to effectively subclass this template. + +## Overview + +The `TemplateAPI` class provides a template for creating API-based model implementations. It handles common functionalities such as: + +- Tokenization (optional) +- Batch processing +- Caching +- Retrying failed requests +- Parsing API responses + +To use this class, you typically need to subclass it and implement specific methods for your API. + +## Key Methods to Implement + +When subclassing `TemplateAPI`, you need to implement the following methods: + +1. `_create_payload`: Creates the JSON payload for API requests. +2. `parse_logprobs`: Parses log probabilities from API responses. +3. `parse_generations`: Parses generated text from API responses. +4. `headers`: Returns the headers for the API request. + +You may also need to override other methods or properties depending on your API's specific requirements. + +> [!NOTE] +> Currently loglikelihood and MCQ based tasks (such as MMLU) are only supported for completion endpoints. Not for chat-completion — those that expect a list of dicts — endpoints! Completion APIs which support instruct tuned models can be evaluated with the `--apply_chat_template` option in order to simultaneously evaluate models using a chat template format while still being able to access the model logits needed for loglikelihood-based tasks. + +# TemplateAPI Usage Guide + +## TemplateAPI Arguments + +When initializing a `TemplateAPI` instance or a subclass, you can provide several arguments to customize its behavior. Here's a detailed explanation of some important arguments: + +- `model` or `pretrained` (str): + - The name or identifier of the model to use. 
+ - `model` takes precedence over `pretrained` when both are provided. + +- `base_url` (str): + - The base URL for the API endpoint. + +- `tokenizer` (str, optional): + - The name or path of the tokenizer to use. + - If not provided, it defaults to using the same tokenizer name as the model. + +- `num_concurrent` (int): + - Number of concurrent requests to make to the API. + - Useful for APIs that support parallel processing. + - Default is 1 (sequential processing). + +- `tokenized_requests` (bool): + - Determines whether the input is pre-tokenized. Defaults to `True`. + - Requests can be sent in either tokenized form (`list[list[int]]`) or as text (`list[str]`, or `str` for batch_size=1). + - For loglikelihood-based tasks, prompts require tokenization to calculate the context length. If `False` prompts are decoded back to text before being sent to the API. + - Not as important for `generate_until` tasks. + - Ignored for chat formatted inputs (list[dict...]) or if tokenizer_backend is None. + +- `tokenizer_backend` (str, optional): + - Required for loglikelihood-based or MCQ tasks. + - Specifies the tokenizer library to use. Options are "tiktoken", "huggingface", or None. + - Default is "huggingface". + +- `max_length` (int, optional): + - Maximum length of input + output. + - Default is 2048. + +- `max_retries` (int, optional): + - Maximum number of retries for failed API requests. + - Default is 3. + +- `max_gen_toks` (int, optional): + - Maximum number of tokens to generate in completion tasks. + - Default is 256 or set in task yaml. + +- `batch_size` (int or str, optional): + - Number of requests to batch together (if the API supports batching). + - Can be an integer or "auto" (which defaults to 1 for API models). + - Default is 1. + +- `seed` (int, optional): + - Random seed for reproducibility. + - Default is 1234. + +- `add_bos_token` (bool, optional): + - Whether to add the beginning-of-sequence token to inputs (when tokenizing). + - Default is False. 
+ +- `custom_prefix_token_id` (int, optional): + - Custom token ID to use as a prefix for inputs. + - If not provided, uses the model's default BOS or EOS token (if `add_bos_token` is True). + + +Example usage: + +```python +class MyAPIModel(TemplateAPI): + def __init__(self, **kwargs): + super().__init__( + model="my-model", + base_url="https://api.mymodel.com/v1/completions", + tokenizer_backend="huggingface", + num_concurrent=5, + max_retries=5, + batch_size=10, + **kwargs + ) + + # Implement other required methods... +``` + +When subclassing `TemplateAPI`, you can override these arguments in your `__init__` method to set default values specific to your API. You can also add additional (potentially user-specified) arguments as needed for your specific implementation. + +## Example Implementation: OpenAI API + +The `OpenAICompletionsAPI` and `OpenAIChatCompletion` ([here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py) classes demonstrate how to implement API models using the `TemplateAPI` class. Here's a breakdown of the key components: + +### 1. Subclassing and Initialization + +```python +@register_model("openai-completions") +class OpenAICompletionsAPI(LocalCompletionsAPI): + def __init__( + self, + base_url="https://api.openai.com/v1/completions", + tokenizer_backend="tiktoken", + **kwargs, + ): + super().__init__( + base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs + ) +``` + +### 2. Implementing API Key Retrieval + +```python +@cached_property +def api_key(self): + key = os.environ.get("OPENAI_API_KEY", None) + if key is None: + raise ValueError( + "API key not found. Please set the OPENAI_API_KEY environment variable." + ) + return key +``` + +### 3. Creating the Payload + +```python +def _create_payload( + self, + messages: Union[List[List[int]], List[dict], List[str], str], + generate=False, + gen_kwargs: Optional[dict] = None, + **kwargs, +) -> dict: + if generate: + # ... 
(implementation for generation) + else: + # ... (implementation for log likelihood) +``` + +### 4. Parsing API Responses + +```python +@staticmethod +def parse_logprobs( + outputs: Union[Dict, List[Dict]], + tokens: List[List[int]] = None, + ctxlens: List[int] = None, + **kwargs, +) -> List[Tuple[float, bool]]: + # ... (implementation) + +@staticmethod +def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]: + # ... (implementation) +``` + +The requests are initiated in the `model_call` or the `amodel_call` methods. + +## Implementing Your Own API Model + +To implement your own API model: + +1. Subclass `TemplateAPI` or one of its subclasses (e.g., `LocalCompletionsAPI`). +2. Override the `__init__` method if you need to set specific parameters. +3. Implement the `_create_payload` and `header` methods to create the appropriate payload for your API. +4. Implement the `parse_logprobs` and `parse_generations` methods to parse your API's responses. +5. Override the `api_key` property if your API requires authentication. +6. Override any other methods as necessary to match your API's behavior. + +## Best Practices + +1. Use the `@register_model` decorator to register your model with the framework (and import it in `lm_eval/models/__init__.py`!). +3. Use environment variables for sensitive information like API keys. +4. Properly handle batching and concurrent requests if supported by your API. diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 8b9ef616dc0a0a9503292248999fb9c32515bde9..48b5c332c22c50306f00f36bd3004db4393a49df 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -2,8 +2,6 @@ Welcome and thank you for your interest in the LM Evaluation Harness! We welcome contributions and feedback and appreciate your time spent with our library, and hope you find it useful! 
-We intend LM Evaluation Harness to be a broadly useful and - ## Important Resources There are several places information about LM Evaluation Harness is located: @@ -11,7 +9,7 @@ There are several places information about LM Evaluation Harness is located: - Our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs) - We occasionally use [GitHub Milestones](https://github.com/EleutherAI/lm-evaluation-harness/milestones) to track progress toward specific near-term version releases. - We maintain a [Project Board](https://github.com/orgs/EleutherAI/projects/25) for tracking current work items and PRs, and for future roadmap items or feature requests. -- Further discussion and support conversations are located in the #lm-thunderdome channel of the [EleutherAI discord](discord.gg/eleutherai). +- Further discussion and support conversations are located in the #lm-thunderdome channel of the [EleutherAI discord](https://discord.gg/eleutherai). ## Code Style @@ -32,7 +30,7 @@ in order to ensure linters and other checks will be run upon committing. We use [pytest](https://docs.pytest.org/en/latest/) for running unit tests. All library unit tests can be run via: ``` -python -m pytest --ignore=tests/tests_master --ignore=tests/extra +python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py ``` ## Contributor License Agreement diff --git a/docs/README.md b/docs/README.md index 974e2e2c1c3f05b802d2a095d60cc36f0cde564b..f040eabaef41b55e9bd8297f3f4653fbc16bf0bc 100644 --- a/docs/README.md +++ b/docs/README.md @@ -4,7 +4,8 @@ Welcome to the docs for the LM Evaluation Harness! 
## Table of Contents -* To learn about the public interface of the library, as well as how to evaluate via the commandline or as integrated into an external library, see the [Interface](./interface.md) +* To learn about the public interface of the library, as well as how to evaluate via the command line or as integrated into an external library, see the [Interface](./interface.md). * To learn how to add a new library, API, or model type to the library, as well as a quick explainer on the types of ways to evaluate an LM, see the [Model Guide](./model_guide.md). + * For an extended description of how to extend the library to new model classes served over an API, see the [API Guide](./API_guide.md). * For a crash course on adding new tasks to the library, see our [New Task Guide](./new_task_guide.md). * To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Task Configuration Guide](./task_guide.md). diff --git a/docs/interface.md b/docs/interface.md index 2add18f8db36a8d5f077720cbc97745f6b66fdc7..47cf00b49694bdbdd86a431c318f0497a2cb4f5a 100644 --- a/docs/interface.md +++ b/docs/interface.md @@ -46,7 +46,11 @@ This mode supports a number of command-line arguments, the details of which can - `--system_instruction`: Specifies a system instruction string to prepend to the prompt. -- `--apply_chat_template` : If this flag is on, a chat template will be applied to the prompt. For Hugging Face models, the chat template is taken from the tokenizer, if the tokenizer does not have a chat template, a default one will be applied. For other models, chat templating is not currently implemented. +- `--apply_chat_template` : This flag specifies whether to apply a chat template to the prompt. It can be used in the following ways: + - `--apply_chat_template` : When used without an argument, applies the only available chat template to the prompt. 
For Hugging Face models, if no dedicated chat template exists, the default chat template will be applied. + - `--apply_chat_template template_name` : If the model has multiple chat templates, apply the specified template to the prompt. + + For Hugging Face models, the default chat template can be found in the [`default_chat_template`](https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1912) property of the Transformers Tokenizer. - `--fewshot_as_multiturn` : If this flag is on, the Fewshot examples are treated as a multi-turn conversation. Questions are provided as user content and answers are provided as assistant responses. Requires `--num_fewshot` to be set to be greater than 0, and `--apply_chat_template` to be on. @@ -58,12 +62,15 @@ This mode supports a number of command-line arguments, the details of which can * `--hf_hub_log_args` : Logs evaluation results to Hugging Face Hub. Accepts a string with the arguments separated by commas. Available arguments: * `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`. If not provided, the results will be pushed to the owner of the Hugging Face token, - * `hub_repo_name` - repository name on Hugging Face Hub, e.g., `lm-eval-results`, + * `hub_repo_name` - repository name on Hugging Face Hub (deprecated, `details_repo_name` and `results_repo_name` should be used instead), e.g., `lm-eval-results`, + * `details_repo_name` - repository name on Hugging Face Hub to store details, e.g., `lm-eval-results`, + * `results_repo_name` - repository name on Hugging Face Hub to store results, e.g., `lm-eval-results`, * `push_results_to_hub` - whether to push results to Hugging Face Hub, can be `True` or `False`, * `push_samples_to_hub` - whether to push samples results to Hugging Face Hub, can be `True` or `False`. 
Requires `--log_samples` to be set, * `public_repo` - whether the repository is public, can be `True` or `False`, * `leaderboard_url` - URL to the leaderboard, e.g., `https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard`. * `point_of_contact` - Point of contact for the results dataset, e.g., `yourname@example.com`. + * `gated` - whether to gate the details dataset, can be `True` or `False`. ## External Library Usage @@ -102,12 +109,10 @@ results = lm_eval.simple_evaluate( # call simple_evaluate ) ``` -See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L35 for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously. +See the `simple_evaluate()` and `evaluate()` functions in [lm_eval/evaluator.py](../lm_eval/evaluator.py#:~:text=simple_evaluate) for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously. Additionally, the `evaluate()` function offers the core evaluation functionality provided by the library, but without some of the special handling and simplification + abstraction provided by `simple_evaluate()`. -See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L173 for more details. - As a brief example usage of `evaluate()`: ```python diff --git a/docs/model_guide.md b/docs/model_guide.md index b14935cb4f5690c769c338788a66cbbbeeb3edd2..810801cbfa47294a5c02ca0d0c38da02975dd713 100644 --- a/docs/model_guide.md +++ b/docs/model_guide.md @@ -118,17 +118,45 @@ class MyCustomLM(LM): #... @property def tokenizer_name(self) -> str: - # should return a string denoting the name of the model's tokenizer and/or the accompanying chat template. 
- - @property - def chat_template(self) -> str: - # should return a chat template formatting string that is used to build prompt from a user/assistant chat history. - # this will be saved in the evaluation results for reproducibility. + """ + Return the name of the model's tokenizer and/or the accompanying chat template. + The returned string is used to cache requests. + + Returns: + str: The name of the model's tokenizer and/or chat template. + """ + + def chat_template(self, chat_template: Union[bool, str] = False) -> str: + """ + Get the appropriate chat template for the model based on the `chat_template` argument. + + This method returns the chat template string to build the prompt from a chat history. + The chat template is saved in the evaluation results for reproducibility. + Boolean arguments should be used with models that have only one chat template, + while string arguments are used with models that have multiple chat templates. + For the reference implementation, see HFLM class in `lm_eval.models.huggingface`. + + Args: + chat_template (Union[bool, str]): Specifies whether to apply a chat template: + - If False: Do not apply any chat template. + - If True: Apply the default chat template. + - If str: Apply the specified chat template by name. + + Returns: + str: The selected chat template in Jinja format. + """ def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: - # responsible for taking as input a chat history that would be fed into the model, and - # rendering it as a string that can be then tokenized and input into the model. - #... + """ + Process a chat history to create a string that can be tokenized and input into the model. + + Args: + chat_history (List[Dict[str, str]]): A list of dictionaries representing the chat history, + where each dictionary has "role" and "content" keys. + + Returns: + str: A string representing the chat history that can be tokenized and fed into the model. 
+ """ ``` - `apply_chat_template` diff --git a/docs/new_task_guide.md b/docs/new_task_guide.md index 2f6d32177036407da973e7a4afa5f21574f193ea..e9bd2becb211b1722174fd406988ad74f2f37caf 100644 --- a/docs/new_task_guide.md +++ b/docs/new_task_guide.md @@ -285,7 +285,7 @@ As a heuristic check: For more detail on the task system and advanced features, see [`docs/task_guide.md`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md) . If none of the above sound like they apply to your task, it's time to continue onto checking your task performance! -### Task name + groups (registering a task) +### Task name + tags (registering a task) To test a task conveniently, it helps to *register* the task--that is, to give it a name and make the `lm-eval` library aware it exists! @@ -296,14 +296,14 @@ task: ``` Including a task name is mandatory. -It is often also convenient to label your task with several `groups`, or tags, though this field is optional: +It is often also convenient to label your task with several `tag` values, though this field is optional: ```yaml -group: - - group1 - - group2 +tag: + - tag1 + - tag2 ``` -This will add your task to the `group1` and `group2` groups, enabling people to know how to categorize your task, and if desired run all tasks in one of these groups at once, your task along with them. +This will add your task to the `tag1` and `tag2` tags, enabling people to know how to categorize your task, and if desired run all tasks in one of these groups at once, your task along with them. If your task is not in the `lm_eval/tasks` folder, you'll need to tell the Eval Harness where to look for YAML files. @@ -319,7 +319,48 @@ Passing `--tasks /path/to/yaml/file` is also accepted. ### Advanced Group Configs -You can make more complete group config while also tailoring parameters for individual tasks. 
+While `tag` values are helpful when you want to be able to quickly and conveniently run a set of related tasks via `--tasks my_tag_name`, often, we wish to implement more complex logic. For example, the MMLU benchmark contains 57 *subtasks* that must all be *averaged* together in order to report a final 'MMLU score'. + +Groupings of tasks might also use particular variants of a task--for example, we might want to default to evaluating a task as 5-shot when called as part of a given grouping, but not have a preference for number of shots when evaluating it as a standalone. + +We implement this via **groups**, which are distinct from tags. Groups can be implemented via *group config* YAML files, which are laid out similarly but slightly differently to tasks' YAML configs. + +The most basic form of group can be defined via a YAML config similar to the following: + +```yaml +group: nli_tasks +task: + - cb + - anli_r1 + - rte +metadata: + version: 1.0 +``` + +This will behave almost identically to a `tag` that includes these 3 tasks, but with one key distinction: we'll print the `nli_tasks` group as a row (with no associated metrics) in our table of outputs, and visually show that these 3 tasks appear under its subheader. + + +Now, let's assume we actually want to report an aggregate score for `nli_tasks`. We would instead use a YAML config like the following: + +```yaml +group: nli_tasks +task: + - cb + - anli_r1 + - rte +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true # defaults to `true`. Set this to `false` to do a "macro" average (taking each subtask's average accuracy, and summing those accuracies and dividing by 3)--by default we do a "micro" average (retain all subtasks' per-document accuracies, and take the mean over all documents' accuracies to get our aggregate mean). 
+metadata: + version: 1.0 +``` + +Similar to our `metric_list` for listing out the metrics we want to calculate for a given task, we use an `aggregate_metric_list` field to specify which metric name to aggregate across subtasks, what aggregation function to use, and whether we should micro- or macro- average these metrics. See [./task_guide.md](./task_guide.md) for a full list of related sub-keys. + +**[!Tip]: currently, we predominantly only support the aggregation of group metrics that use `mean` (either micro- or macro- averaged) over their subtasks. If you require even more complex aggregation rules, you may want to perform aggregation offline.** + +Group configs can be fairly complex! We can do various operations, such as defining new subtask(s) inline in our group YAML, overriding an existing task's specific config value, or nesting existing groups within our group config. For example, let's build a config for evaluating MMLU and a few natural language inference tasks. For MMLU, we can write the name for the benchmark as a subtask written under `task`. You can configure the parameters such as `num_fewshot`. If the task being configured is a group such as `mmlu` or `super_glue`, the parameter set will be applied to all of the subtasks. @@ -331,33 +372,13 @@ task: - cb - anli_r1 - rte + aggregate_metric_list: + - metric: acc + aggregation: mean + higher_is_better: true - task: mmlu num_fewshot: 2 ``` -It's also important to note how you can basically insert a group config as a task. Here, to make a group of natural language inference tasks, you simply write like how you would normally write a group config but this time place that as part of a task list under the main group being built. - -### Duplicate Tasks in Group Configs - -There might be cases where you might want to evaluate prompts and how models perform over prompt variations. You can list an existing task (In the example below, `anli_r1`) which varying `doc_to_text` implementation. 
To differentiate from each variation, we can utilize `task_alias`. LM-Eval will recognize that there are multiple variations of the same tasks and differentiate them. -```yaml -group: flan_held_in -group_alias: Flan (Held-In) -task: - # ANLI R1 - - group: anli_r1_flan - group_alias: ANLI R1 - task: - - task: anli_r1 - task_alias: prompt-0 - include: _held_in_template_yaml - doc_to_text: "{{premise}}\n\nChoose your answer ..." - ... - - task: anli_r1 - task_alias: prompt-1 - include: _held_in_template_yaml - doc_to_text: "{{premise}}\n\nBased on ..." - ... -``` ### Configuring python classes @@ -382,21 +403,29 @@ task: ... ``` +You can also pass a custom argument to your class by accepting `config` in the custom class constructor. +Here's how to do it: + +```yaml +task: 20_newsgroups +class: !function task.Unitxt +recipe: card=cards.20_newsgroups,template=templates.classification.multi_class.title +``` + +In this example, `recipe` is the custom argument for the `Unitxt` class. + ## Beautifying Table Display -To avoid conflict, each task needs to be registered with a unique name. Because of this, slight variations of task are still counted as unique tasks and need to be named uniquely. This could be done by appending an additional naming that may refer to the variation such as in MMLU where the template used to evaluated for flan are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation especially when you have a long list of tasks or are using a benchmark that comprises of many tasks. To make it more legible, you can use `task_alias` and `group_alias` to provide an alternative task name and group name that will be printed. For example in `mmlu_abstract_algebra.yaml` we set `group_alias` to `stem` and `task_alias` to `abstract_algebra`. +To avoid conflict, each task needs to be registered with a unique name. 
Because of this, slight variations of task are still counted as unique tasks and need to be named uniquely. This could be done by appending an additional naming that may refer to the variation such as in MMLU where the template used to evaluated for flan are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation especially when you have a long list of tasks or are using a benchmark that comprises of many tasks. To make it more legible, you can use `task_alias` and `group_alias` to provide an alternative task name and group name that will be printed. For example in `mmlu_abstract_algebra.yaml` we set `task_alias` to `abstract_algebra`. In group configs, a `group_alias` for a group can also be set. ``` "dataset_name": "abstract_algebra" "description": "The following are multiple choice questions (with answers) about abstract\ \ algebra.\n\n" -"group": "mmlu_stem" -"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_abstract_algebra" "task_alias": "abstract_algebra" ``` -Note: Even though `group` can be a list, for now, `group_alias` can only be a single string. ## Checking validity @@ -416,9 +445,9 @@ a simple eye test. ## Versioning -One key feature in LM Evaluation Harness is the ability to version tasks--that is, mark them with a specific version number that can be bumped whenever a breaking change is made. +One key feature in LM Evaluation Harness is the ability to version tasks and groups--that is, mark them with a specific version number that can be bumped whenever a breaking change is made. 
-This version info can be provided by adding the following to your new task config file: +This version info can be provided by adding the following to your new task or group config file: ``` metadata: diff --git a/docs/task_guide.md b/docs/task_guide.md index b10ca7d9a96cfbb5a2204c69531f56c16ac427f4..34e47c413694eeb8da2d3dc5c743eaba2740e0b0 100644 --- a/docs/task_guide.md +++ b/docs/task_guide.md @@ -16,7 +16,8 @@ Tasks are configured via the `TaskConfig` object. Below, we describe all fields Task naming + registration: - **task** (`str`, defaults to None) — name of the task. -- **group** (`str`, *optional*) — name of the task group(s) a task belongs to. Enables one to run all tasks with a specified tag or group name at once. +- **task_alias** (`str`, defaults to None) - Alias of the task name that will be printed in the final table results. +- **tag** (`str`, *optional*) — name of the task tags(s) a task belongs to. Enables one to run all tasks with a specified tag name at once. Dataset configuration options: - **dataset_path** (`str`) — The name of the dataset as listed by HF in the datasets Hub. @@ -55,8 +56,6 @@ Other: ## Filters -Explain: What are filters? What is their place in the pipeline? - A key component of the `lm-evaluation-harness` library is the `Filter` object. In a typical evaluation run of the harness, we take the formatted inputs and run them through our LM, with the appropriate output type (greedy or free-form generation, or loglikelihood-based comparative scoring). After getting scores or output text from our LM on each `Instance` or document in the dataset, we then need to feed these responses into a metric or scoring function to return scores to a user. 
@@ -295,105 +294,24 @@ Generative tasks: Tasks using complex filtering: - GSM8k with CoT (+ with Self-Consistency): (`lm_eval/tasks/gsm8k/gsm8k-cot.yaml` ; `lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml`) - -## Benchmarks +# Group Configuration When evaluating a language model, it's not unusual to test across a number of tasks that may not be related to one another in order to assess a variety of capabilities. To this end, it may be cumbersome to have to list the set of tasks or add a new group name to each yaml of each individual task. -To solve this, we can create a benchmark yaml config. This is a config that contains the names of the tasks that should be included in a particular benchmark. The config consists of two main keys `group` which denotes the name of the benchmark and `task` which is where we can list the tasks. The tasks listed in `task` are the task names that have been registered. A good example would be the list of tasks used to evaluate the Pythia Suite. - -```yaml -group: pythia -task: - - lambada_openai - - wikitext - - piqa - - sciq - - wsc - - winogrande - - arc - - logiqa - - blimp - - hendrycksTest* -``` - -It is also possible to list an existing task in your benchmark configuration with some adjustments. For example, a few tasks from mmlu is included `multimedqa`. There, the `task_alias` and `group_alias` (See [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#beautifying-table-display) for more details) are modified to suit the benchmark. - -```yaml -group: multimedqa -task: - - pubmedqa - - medmcqa - - medqa_4options - - task: mmlu_anatomy - task_alias: "anatomy (mmlu)" - group_alias: null - - task: mmlu_clinical_knowledge - task_alias: "clinical_knowledge (mmlu)" - group_alias: null - ... -``` +To solve this, we can create a **group** yaml config. This is a config that contains the names of the tasks that should be included in a particular group. 
The config consists of two main keys: a `group` key which denotes the name of the group (as it would be called from the command line, e.g. `mmlu`) and a `task` key which is where we can list the tasks. The tasks listed in `task` are the task names that have been registered. A good example of a group yaml config can be found at [../lm_eval/tasks/mmlu/default/_mmlu.yaml]. See also the [New Task Guide](./new_task_guide.md) for a more in-depth and tutorial-esque explanation of how to write complex GroupConfigs. -Alternatively, benchmarks can have tasks that are customizable for each task. They can be defined like how a yaml task is usually set. +## Configurations -```yaml -group: t0_eval -task: - # Coreference Resolution - - dataset_path: super_glue - dataset_name: wsc.fixed - use_prompt: promptsource:* - training_split: train - validation_split: validation - metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - # Coreference Resolution - - dataset_path: winogrande - dataset_name: winogrande_xl - use_prompt: promptsource:* - training_split: train - validation_split: validation - metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - ... -``` +Groups are configured via the `GroupConfig` object. Below, we describe all fields usable within the object, and their role in defining a task. -If the benchmark contains the same dataset but with different configurations, use `task` to differentiate between them. For example, T0-Eval evaluates on 3 versions of ANLI but the huggingface dataset collects them in one dataset. - -```YAML -group: t0_eval -task: - ... 
- - task: anli_r1 - dataset_path: anli - use_prompt: promptsource:* - training_split: train_r1 - validation_split: dev_r1 - metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - task: anli_r2 - dataset_path: anli - use_prompt: promptsource:* - training_split: train_r2 - validation_split: dev_r2 - metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true -``` +### Parameters -Calling the benchmark is done the same way we would call any task with `--tasks`. Benchmarks can be added in `lm_eval/tasks/benchmarks/` +- **group** (`str`, defaults to `None`) — name of the group. Used to invoke it from the command line. +- **group_alias** (`str`, defaults to `None`) - Alternative name for the group that will be printed in the table output. +- **task** (`Union[str, list]`, defaults to `None`) - List of tasks that constitute the group. +- **aggregate_metric_list** (`list`, defaults to `None`) - similar to `metric_list` in TaskConfigs, provide a list of configurations for metrics that should be aggregated across subtasks. Leaving empty will result in no aggregation being performed for this group. Keys for each list entry are: + - `metric: str` - the name of the metric to aggregate over (all subtasks must report a metric holding this name.) + - `aggregation: str` - what aggregation function to apply to aggregate these per-subtask metrics. **currently, only `mean` is supported.** + - `weight_by_size: bool = True` whether to perform micro- averaging (`True`) or macro- (`False`) averaging of subtasks' accuracy scores when reporting the group's metric. MMLU, for example, averages over per-document accuracies (the *micro average*), resulting in the same accuracy as if one simply concatenated all 57 subjects into a single dataset and evaluated accuracy on that dataset. 
+ - `filter_list: Union[str, List[str]] = "none"` - what filter keys one should match on to aggregate results. For example, if trying to aggregate over the `exact_match` metric using `strict-match` filter for `bbh_cot_zeroshot`, then set this to be `filter_list: "strict-match"`. +- **metadata** (`dict`, *optional*) - As with TaskConfigs, a field where extra config metadata can be passed. set the `num_fewshot` key within this to override the printed n_shot value in a results table for your group, for example. diff --git a/examples/lm-eval-overview.ipynb b/examples/lm-eval-overview.ipynb index 898192ce9f8e5963a3c9bb3e00910fa898fdcbdb..7c4564d6a6f7648f0b85edff2ab90a7707f4e0bf 100644 --- a/examples/lm-eval-overview.ipynb +++ b/examples/lm-eval-overview.ipynb @@ -210,7 +210,7 @@ ], "source": [ "# Install LM-Eval\n", - "!pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@big-refactor" + "!pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git" ] }, { @@ -377,7 +377,7 @@ "id": "LOUHK7PtQfq4" }, "source": [ - "Often, tasks are part of a larger group used to measure different capabilities. The dynamism of the field today means new dimensions of evaluation can come about which would mix and match new and older tasks alike. In LM-Eval, We can also group tasks and call that the group name to evaluate on a set of tasks easily. In this instance, let's evaluate the group `yes_or_no_tasks` which comprise of the tasks `demo_boolq` and `demo_cola`; tasks which are multiple choice tasks with options `yes` and `no` as the name suggests.\n", + "Often, tasks are part of a larger group used to measure different capabilities. The dynamism of the field today means new dimensions of evaluation can come about which would mix and match new and older tasks alike. In LM-Eval, We can also group tasks and call that the group name to evaluate on a set of tasks easily. 
In this instance, let's evaluate the tag `yes_or_no_tasks`, which comprises the tasks `demo_boolq` and `demo_cola`; tasks which are multiple choice tasks with options `yes` and `no` as the name suggests.\n", "\n", "