Merge branch 'main' into humaneval

# Conflicts: # lm_eval/api/task.py

Merge branch 'main' into humaneval
# Conflicts: # lm_eval/api/task.py
173b2bc3 · Baber · 74344829 · bb098f13 · 173b2bc3 · 173b2bc3
Commit 173b2bc3 authored Jan 10, 2025 by Baber
20 changed files
--- a/.github/workflows/new_tasks.yml
+++ b/.github/workflows/new_tasks.yml
@@ -16,7 +16,7 @@ jobs:
    name: Scan for changed tasks
    steps:
      - name: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          fetch-depth: 2  # OR "2" -> To retrieve the preceding commit.

@@ -47,7 +47,7 @@ jobs:

      - name: Set up Python 3.9
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          python-version: 3.9
          cache: 'pip'
@@ -56,7 +56,7 @@ jobs:
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
            python -m pip install --upgrade pip
-            pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu
+            pip install -e '.[dev,ifeval]' --extra-index-url https://download.pytorch.org/whl/cpu
    #   Install optional git dependencies
    #       pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
    #       if [ -f requirements.txt ]; then pip install -r requirements.txt; fi

--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -13,7 +13,7 @@ jobs:
    steps:
    - uses: actions/checkout@v4
    - name: Set up Python
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
      with:
        python-version: "3.x"

@@ -26,7 +26,7 @@ jobs:
    - name: Build a binary wheel and a source tarball
      run: python3 -m build
    - name: Store the distribution packages
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
      with:
        name: python-package-distributions
        path: dist/
@@ -46,7 +46,7 @@ jobs:

    steps:
    - name: Download all the dists
-      uses: actions/download-artifact@v3
+      uses: actions/download-artifact@v4
      with:
        name: python-package-distributions
        path: dist/
@@ -68,7 +68,7 @@ jobs:

    steps:
    - name: Download all the dists
-      uses: actions/download-artifact@v3
+      uses: actions/download-artifact@v4
      with:
        name: python-package-distributions
        path: dist/

--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -22,10 +22,10 @@ jobs:
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
-    - name: Set up Python 3.8
+    - name: Set up Python 3.9
      uses: actions/setup-python@v5
      with:
-        python-version: 3.8
+        python-version: 3.9
        cache: pip
        cache-dependency-path: pyproject.toml
    - name: Pre-Commit
@@ -42,7 +42,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        python-version: [ "3.8", "3.9", "3.10", "3.11" ]
+        python-version: ["3.9", "3.10", "3.11", "3.12" ]
    timeout-minutes: 30
    steps:
    - name: Checkout Code
@@ -56,15 +56,35 @@ jobs:
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
-        pip install -e '.[dev,anthropic,sentencepiece,optimum,deepsparse,sparseml]' --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install -e '.[dev,sentencepiece,api]' --extra-index-url https://download.pytorch.org/whl/cpu
 #         Install optional git dependencies
 #                pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
 #        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Test with pytest
-      run: python -m pytest --showlocals -s -vv -n=auto
+      run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py
    - name: Archive artifacts
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
      with:
-        name: output_results
+        name: output_testcpu${{ matrix.python-version }}
        path: |
          test_logs/*
+  testmodels:
+    name: External LM Tests
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+    - name: Checkout Code
+      uses: actions/checkout@v4
+    - name: Set up Python 3.9
+      uses: actions/setup-python@v5
+      with:
+        python-version: 3.9
+        cache: pip
+        cache-dependency-path: pyproject.toml
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install -U transformers peft
+    - name: Test with pytest
+      run: python -m pytest tests/models --showlocals -s -vv
--- a/.gitignore
+++ b/.gitignore
@@ -8,11 +8,13 @@ build
 dist
 *.egg-info
 venv
+.venv/
 .vscode/
 temp
 __pycache__
 .ipynb_checkpoints
 temp
+test_logs/
 # IPython
 profile_default/
 ipython_config.py

--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@
 exclude: ^tests/testdata/
 repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v5.0.0
    hooks:
      - id: check-added-large-files
      - id: check-ast
@@ -29,7 +29,7 @@ repos:
      - id: mixed-line-ending
        args: [--fix=lf]
  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.8
+    rev: v0.7.4
    hooks:
      # Run the linter.
      - id: ruff

--- a/CODEOWNERS
+++ b/CODEOWNERS
-* @haileyschoelkopf @lintangsutawika
+* @baberabb @lintangsutawika
--- a/README.md
+++ b/README.md
--- a/docs/API_guide.md
+++ b/docs/API_guide.md
+# TemplateAPI Usage Guide
+
+The `TemplateAPI` class is a versatile superclass designed to facilitate the integration of various API-based language models into the lm-evaluation-harness framework. This guide will explain how to use and extend the `TemplateAPI` class to implement your own API models. If your API implements the OpenAI API you can use the `local-completions` or the `local-chat-completions` (defined [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py)) model types, which can also serve as examples of how to effectively subclass this template.
+
+## Overview
+
+The `TemplateAPI` class provides a template for creating API-based model implementations. It handles common functionalities such as:
+
+- Tokenization (optional)
+- Batch processing
+- Caching
+- Retrying failed requests
+- Parsing API responses
+
+To use this class, you typically need to subclass it and implement specific methods for your API.
+
+## Key Methods to Implement
+
+When subclassing `TemplateAPI`, you need to implement the following methods:
+
+1. `_create_payload`: Creates the JSON payload for API requests.
+2. `parse_logprobs`: Parses log probabilities from API responses.
+3. `parse_generations`: Parses generated text from API responses.
+4. `headers`: Returns the headers for the API request.
+
+You may also need to override other methods or properties depending on your API's specific requirements.
+
+> [!NOTE]
+> Currently, loglikelihood and MCQ based tasks (such as MMLU) are only supported for completion endpoints, not for chat-completion endpoints (those that expect a list of dicts)! Completion APIs which support instruct tuned models can be evaluated with the `--apply_chat_template` option in order to simultaneously evaluate models using a chat template format while still being able to access the model logits needed for loglikelihood-based tasks.
+
+# TemplateAPI Usage Guide
+
+## TemplateAPI Arguments
+
+When initializing a `TemplateAPI` instance or a subclass, you can provide several arguments to customize its behavior. Here's a detailed explanation of some important arguments:
+
+- `model` or `pretrained` (str):
+   - The name or identifier of the model to use.
+   - `model` takes precedence over `pretrained` when both are provided.
+
+- `base_url` (str):
+   - The base URL for the API endpoint.
+
+- `tokenizer` (str, optional):
+  - The name or path of the tokenizer to use.
+  - If not provided, it defaults to using the same tokenizer name as the model.
+
+- `num_concurrent` (int):
+   - Number of concurrent requests to make to the API.
+   - Useful for APIs that support parallel processing.
+   - Default is 1 (sequential processing).
+
+- `timeout` (int, optional):
+   - Timeout for API requests in seconds.
+   - Default is 30.
+
+- `tokenized_requests` (bool):
+  - Determines whether the input is pre-tokenized. Defaults to `True`.
+  - Requests can be sent in either tokenized form (`list[list[int]]`) or as text (`list[str]`, or `str` for batch_size=1).
+  - For loglikelihood-based tasks, prompts require tokenization to calculate the context length. If `False`, prompts are decoded back to text before being sent to the API.
+  - Not as important for `generate_until` tasks.
+  - Ignored for chat formatted inputs (list[dict...]) or if tokenizer_backend is None.
+
+- `tokenizer_backend` (str, optional):
+  - Required for loglikelihood-based or MCQ tasks.
+  - Specifies the tokenizer library to use. Options are "tiktoken", "huggingface", or None.
+  - Default is "huggingface".
+
+- `max_length` (int, optional):
+  - Maximum length of input + output.
+  - Default is 2048.
+
+- `max_retries` (int, optional):
+   - Maximum number of retries for failed API requests.
+   - Default is 3.
+
+- `max_gen_toks` (int, optional):
+  - Maximum number of tokens to generate in completion tasks.
+  - Default is 256 or set in task yaml.
+
+- `batch_size` (int or str, optional):
+  - Number of requests to batch together (if the API supports batching).
+  - Can be an integer or "auto" (which defaults to 1 for API models).
+  - Default is 1.
+
+- `seed` (int, optional):
+  - Random seed for reproducibility.
+  - Default is 1234.
+
+- `add_bos_token` (bool, optional):
+  - Whether to add the beginning-of-sequence token to inputs (when tokenizing).
+  - Default is False.
+
+- `custom_prefix_token_id` (int, optional):
+  - Custom token ID to use as a prefix for inputs.
+  - If not provided, uses the model's default BOS or EOS token (if `add_bos_token` is True).
+
+- `verify_certificate` (bool, optional):
+  - Whether to validate the certificate of the API endpoint (if HTTPS).
+  - Default is True.
+
+
+Example usage:
+
+```python
+class MyAPIModel(TemplateAPI):
+    def __init__(self, **kwargs):
+        super().__init__(
+            model="my-model",
+            base_url="https://api.mymodel.com/v1/completions",
+            tokenizer_backend="huggingface",
+            num_concurrent=5,
+            max_retries=5,
+            batch_size=10,
+            **kwargs
+        )
+
+    # Implement other required methods...
+```
+
+When subclassing `TemplateAPI`, you can override these arguments in your `__init__` method to set default values specific to your API. You can also add additional (potentially user-specified) arguments as needed for your specific implementation.
+
+## Example Implementation: OpenAI API
+
+The `OpenAICompletionsAPI` and `OpenAIChatCompletion` ([here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py)) classes demonstrate how to implement API models using the `TemplateAPI` class. Here's a breakdown of the key components:
+
+### 1. Subclassing and Initialization
+
+```python
+@register_model("openai-completions")
+class OpenAICompletionsAPI(LocalCompletionsAPI):
+    def __init__(
+        self,
+        base_url="https://api.openai.com/v1/completions",
+        tokenizer_backend="tiktoken",
+        **kwargs,
+    ):
+        super().__init__(
+            base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
+        )
+```
+
+### 2. Implementing API Key Retrieval
+
+```python
+@cached_property
+def api_key(self):
+    key = os.environ.get("OPENAI_API_KEY", None)
+    if key is None:
+        raise ValueError(
+            "API key not found. Please set the OPENAI_API_KEY environment variable."
+        )
+    return key
+```
+
+### 3. Creating the Payload
+
+```python
+def _create_payload(
+    self,
+    messages: Union[List[List[int]], List[dict], List[str], str],
+    generate=False,
+    gen_kwargs: Optional[dict] = None,
+    **kwargs,
+) -> dict:
+    if generate:
+        # ... (implementation for generation)
+    else:
+        # ... (implementation for log likelihood)
+```
+
+### 4. Parsing API Responses
+
+```python
+@staticmethod
+def parse_logprobs(
+    outputs: Union[Dict, List[Dict]],
+    tokens: List[List[int]] = None,
+    ctxlens: List[int] = None,
+    **kwargs,
+) -> List[Tuple[float, bool]]:
+    # ... (implementation)
+
+@staticmethod
+def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
+    # ... (implementation)
+```
+
+The requests are initiated in the `model_call` or the `amodel_call` methods.
+
+## Implementing Your Own API Model
+
+To implement your own API model:
+
+1. Subclass `TemplateAPI` or one of its subclasses (e.g., `LocalCompletionsAPI`).
+2. Override the `__init__` method if you need to set specific parameters.
+3. Implement the `_create_payload` and `headers` methods to create the appropriate payload and request headers for your API.
+4. Implement the `parse_logprobs` and `parse_generations` methods to parse your API's responses.
+5. Override the `api_key` property if your API requires authentication.
+6. Override any other methods as necessary to match your API's behavior.
+
+## Best Practices
+
+1. Use the `@register_model` decorator to register your model with the framework (and import it in `lm_eval/models/__init__.py`!).
+2. Use environment variables for sensitive information like API keys.
+3. Properly handle batching and concurrent requests if supported by your API.
--- a/docs/CONTRIBUTING.md
+++ b/docs/CONTRIBUTING.md
@@ -2,8 +2,6 @@

 Welcome and thank you for your interest in the LM Evaluation Harness! We welcome contributions and feedback and appreciate your time spent with our library, and hope you find it useful!

-We intend LM Evaluation Harness to be a broadly useful and
-
 ## Important Resources

 There are several places information about LM Evaluation Harness is located:
@@ -11,7 +9,7 @@ There are several places information about LM Evaluation Harness is located:
 - Our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs)
 - We occasionally use [GitHub Milestones](https://github.com/EleutherAI/lm-evaluation-harness/milestones) to track progress toward specific near-term version releases.
 - We maintain a [Project Board](https://github.com/orgs/EleutherAI/projects/25) for tracking current work items and PRs, and for future roadmap items or feature requests.
- Further discussion and support conversations are located in the #lm-thunderdome channel of the [EleutherAI discord](discord.gg/eleutherai).
+- Further discussion and support conversations are located in the #lm-thunderdome channel of the [EleutherAI discord](https://discord.gg/eleutherai).

 ## Code Style

@@ -32,7 +30,7 @@ in order to ensure linters and other checks will be run upon committing.
 We use [pytest](https://docs.pytest.org/en/latest/) for running unit tests. All library unit tests can be run via:

 ```
-python -m pytest --ignore=tests/tests_master --ignore=tests/extra
+python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py
 ```

 ## Contributor License Agreement

--- a/docs/README.md
+++ b/docs/README.md
@@ -4,7 +4,8 @@ Welcome to the docs for the LM Evaluation Harness!

 ## Table of Contents

-* To learn about the public interface of the library, as well as how to evaluate via the commandline or as integrated into an external library, see the [Interface](./interface.md)
+* To learn about the public interface of the library, as well as how to evaluate via the command line or as integrated into an external library, see the [Interface](./interface.md).
 * To learn how to add a new library, API, or model type to the library, as well as a quick explainer on the types of ways to evaluate an LM, see the [Model Guide](./model_guide.md).
+  * For an extended description of how to extend the library to new model classes served over an API, see the [API Guide](./API_guide.md).
 * For a crash course on adding new tasks to the library, see our [New Task Guide](./new_task_guide.md).
 * To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Task Configuration Guide](./task_guide.md).
--- a/docs/chat-template-readme.md
+++ b/docs/chat-template-readme.md
+# Chat Template Delimiter Handling Update
+
+## Overview
+This change modifies how delimiters are handled when applying chat templates in the request construction process for likelihood and multiple-choice based tasks. When `apply_chat_template` is set to `True`, the target delimiter is now set to an empty string instead of using the configured delimiter.
+
+## Background
+By default, the system uses a target delimiter (typically a whitespace " ") between the context and target text when constructing prompts. The full string is constructed as:
+```
+doc_to_text(doc) + target_delimiter + doc_to_target(doc)
+```
+
+While this worked well for base models where we wanted the model to predict a single whitespace followed by the answer, chat models have their own formatting conventions that handle spacing differently.
+
+## The Change
+- When `apply_chat_template=True`, the target delimiter is now empty ("") instead of the default whitespace
+- This prevents interference between chat template formatting and the default delimiter system
+- Particularly important for multiple choice tasks where the template itself handles spacing
+
+## Example
+```
+# Before (with default delimiter " ")
+<user>Question: What color is the sky?\nAnswer:<assistant> blue
+
+# After
+<user>Question: What color is the sky?\nAnswer:<assistant>blue
+```
--- a/docs/interface.md
+++ b/docs/interface.md
@@ -46,7 +46,11 @@ This mode supports a number of command-line arguments, the details of which can

 - `--system_instruction`: Specifies a system instruction string to prepend to the prompt.

- `--apply_chat_template` : If this flag is on, a chat template will be applied to the prompt. For Hugging Face models, the chat template is taken from the tokenizer, if the tokenizer does not have a chat template, a default one will be applied. For other models, chat templating is not currently implemented.
+- `--apply_chat_template` : This flag specifies whether to apply a chat template to the prompt. It can be used in the following ways:
+	- `--apply_chat_template` : When used without an argument, applies the only available chat template to the prompt. For Hugging Face models, if no dedicated chat template exists, the default chat template will be applied.
+	- `--apply_chat_template template_name` : If the model has multiple chat templates, apply the specified template to the prompt.
+
+    For Hugging Face models, the default chat template can be found in the [`default_chat_template`](https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1912) property of the Transformers Tokenizer.

 - `--fewshot_as_multiturn` : If this flag is on, the Fewshot examples are treated as a multi-turn conversation. Questions are provided as user content and answers are provided as assistant responses. Requires `--num_fewshot` to be set to be greater than 0, and `--apply_chat_template` to be on.

@@ -54,16 +58,19 @@ This mode supports a number of command-line arguments, the details of which can

 * `--seed`: Set seed for python's random, numpy and torch.  Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, or a single integer to set the same seed for all three.  The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234` (for backward compatibility).  E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`. Here numpy's seed is not set since the second value is `None`.  E.g, `--seed 42` sets all three seeds to 42.

-* `--wandb_args`:  Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., ```--wandb_args project=test-project,name=test-run```
+* `--wandb_args`:  Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., ```--wandb_args project=test-project,name=test-run```. Also allows for the passing of the step to log things at (passed to `wandb.run.log`), e.g., `--wandb_args step=123`.

 * `--hf_hub_log_args` : Logs evaluation results to Hugging Face Hub. Accepts a string with the arguments separated by commas. Available arguments:
    * `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`. If not provided, the results will be pushed to the owner of the Hugging Face token,
-    * `hub_repo_name` - repository name on Hugging Face Hub, e.g., `lm-eval-results`,
+    * `hub_repo_name` - repository name on Hugging Face Hub (deprecated, `details_repo_name` and `results_repo_name` should be used instead), e.g., `lm-eval-results`,
+    * `details_repo_name` - repository name on Hugging Face Hub to store details, e.g., `lm-eval-results`,
+    * `results_repo_name` - repository name on Hugging Face Hub to store results, e.g., `lm-eval-results`,
    * `push_results_to_hub` - whether to push results to Hugging Face Hub, can be `True` or `False`,
    * `push_samples_to_hub` - whether to push samples results to Hugging Face Hub, can be `True` or `False`. Requires `--log_samples` to be set,
    * `public_repo` - whether the repository is public, can be `True` or `False`,
    * `leaderboard_url` - URL to the leaderboard, e.g., `https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard`.
    * `point_of_contact` - Point of contact for the results dataset, e.g., `yourname@example.com`.
+    * `gated` - whether to gate the details dataset, can be `True` or `False`.

 ## External Library Usage

@@ -102,12 +109,10 @@ results = lm_eval.simple_evaluate( # call simple_evaluate
 )
 ```

-See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L35 for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously.
+See the `simple_evaluate()` and `evaluate()` functions in [lm_eval/evaluator.py](../lm_eval/evaluator.py#:~:text=simple_evaluate) for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously.

 Additionally, the `evaluate()` function offers the core evaluation functionality provided by the library, but without some of the special handling and simplification + abstraction provided by `simple_evaluate()`.

-See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L173 for more details.
-
 As a brief example usage of `evaluate()`:

 ```python

--- a/docs/model_guide.md
+++ b/docs/model_guide.md
@@ -118,17 +118,45 @@ class MyCustomLM(LM):
    #...
    @property
    def tokenizer_name(self) -> str:
-        # should return a string denoting the name of the model's tokenizer and/or the accompanying chat template.
-
-    @property
-    def chat_template(self) -> str:
-        # should return a chat template formatting string that is used to build prompt from a user/assistant chat history.
-        # this will be saved in the evaluation results for reproducibility.
+        """
+        Return the name of the model's tokenizer and/or the accompanying chat template.
+        The returned string is used to cache requests.
+
+        Returns:
+            str: The name of the model's tokenizer and/or chat template.
+        """
+
+    def chat_template(self, chat_template: Union[bool, str] = False) -> str:
+        """
+        Get the appropriate chat template for the model based on the `chat_template` argument.
+
+        This method returns the chat template string to build the prompt from a chat history.
+        The chat template is saved in the evaluation results for reproducibility.
+        Boolean arguments should be used with models that have only one chat template,
+        while string arguments are used with models that have multiple chat templates.
+        For the reference implementation, see HFLM class in `lm_eval.models.huggingface`.
+
+        Args:
+            chat_template (Union[bool, str]): Specifies whether to apply a chat template:
+                - If False: Do not apply any chat template.
+                - If True: Apply the default chat template.
+                - If str: Apply the specified chat template by name.
+
+        Returns:
+            str: The selected chat template in Jinja format.
+        """

    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
-        # responsible for taking as input a chat history that would be fed into the model, and
-        # rendering it as a string that can be then tokenized and input into the model.
-    #...
+        """
+        Process a chat history to create a string that can be tokenized and input into the model.
+
+        Args:
+            chat_history (List[Dict[str, str]]): A list of dictionaries representing the chat history,
+                where each dictionary has "role" and "content" keys.
+
+        Returns:
+            str: A string representing the chat history that can be tokenized and fed into the model.
+        """
 ```

 - `apply_chat_template`

--- a/docs/new_task_guide.md
+++ b/docs/new_task_guide.md
@@ -86,20 +86,20 @@ Let's create a python file in the directory where we're writing our YAML file:
 ```bash
 touch lm_eval/tasks/<dataset_name>/utils.py
 ```
-Now, in `utils.py` we'll write a function to process each split of our dataset:
-
-TODO: Change the example to one that's in the tasks/
+Now, in `utils.py` we'll write a function to process each split of our dataset (the following example is drawn from [the `hellaswag` task](../lm_eval/tasks/hellaswag/utils.py)):

 ```python
-def process_docs(dataset: datasets.Dataset):
-    def _helper(doc):
-      # modifies the contents of a single
-      # document in our dataset.
-      doc["choices"] = [doc["choice1"], doc["choice2"], doc["wrong_answer"]]
-      doc["gold"] = doc["label"]
-      return doc
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    def _process_doc(doc):
+        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
+        out_doc = {
+            "query": preprocess(doc["activity_label"] + ": " + ctx),
+            "choices": [preprocess(ending) for ending in doc["endings"]],
+            "gold": int(doc["label"]),
+        }
+        return out_doc

-    return dataset.map(_helper) # returns back a datasets.Dataset object
+    return dataset.map(_process_doc)
 ```

 Now, in our YAML config file we'll use the `!function` constructor, and tell the config where our imported Python function will come from. At runtime, before doing anything else we will preprocess our dataset according to this function!
@@ -190,7 +190,8 @@ doc_to_target: "{{answer}}"
 ```


-**Important**: we now add `target_delimiter` between input and target which defaults to " ", such that the full input-output string is `doc_to_target(doc) + target_delimiter + doc_to_text(doc)`. `doc_to_text` and `doc_to_target` should not contain trailing right or left whitespace, respectively.
+> [!WARNING]
+> We add `target_delimiter` between input and target which defaults to " ", such that the full input-output string is `doc_to_text(doc) + target_delimiter + doc_to_target(doc)`. `doc_to_text` and `doc_to_target` should not contain trailing right or left whitespace, respectively. For multiple choice the target will be each choice index concatenated with the delimiter.


 #### Multiple choice format
@@ -206,7 +207,7 @@ doc_to_choice: "{{[distractor1, distractor2, distractor3, correct_answer]}}"
 ```
 Task implementers are thus able to decide what the answer choices should be for a document, and what prompt format to use.

-The label index can also be sourced from a feature directly. For example in `superglue/boolq`, the label index if defined in the feature `label`. We can set `doc_to_target` as simply `label`. The options or verbalizers can be written in a the form of a list `["no", "yes"]` that will correspond to the label index.
+The label index can also be sourced from a feature directly. For example in `superglue/boolq`, the label index is defined in the feature `label`. We can set `doc_to_target` as simply `label`. The options or verbalizers can be written in the form of a list `["no", "yes"]` that will correspond to the label index.

 ```yaml
 doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
@@ -285,7 +286,7 @@ As a heuristic check:

 For more detail on the task system and advanced features, see [`docs/task_guide.md`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md) . If none of the above sound like they apply to your task, it's time to continue onto checking your task performance!

-### Task name + groups (registering a task)
+### Task name + tags (registering a task)

 To test a task conveniently, it helps to *register* the task--that is, to give it a name and make the `lm-eval` library aware it exists!

@@ -296,14 +297,14 @@ task: <name of the task>
 ```
 Including a task name is mandatory.

-It is often also convenient to label your task with several `groups`, or tags, though this field is optional:
+It is often also convenient to label your task with several `tag` values, though this field is optional:

 ```yaml
-group:
-  - group1
-  - group2
+tag:
+  - tag1
+  - tag2
 ```
-This will add your task to the `group1` and `group2` groups, enabling people to know how to categorize your task, and if desired run all tasks in one of these groups at once, your task along with them.
+This will add your task to the `tag1` and `tag2` tags, enabling people to know how to categorize your task, and if desired run all tasks in one of these tags at once, your task along with them.


 If your task is not in the `lm_eval/tasks` folder, you'll need to tell the Eval Harness where to look for YAML files.
@@ -319,7 +320,48 @@ Passing `--tasks /path/to/yaml/file` is also accepted.

 ### Advanced Group Configs

-You can make more complete group config while also tailoring parameters for individual tasks.
+While `tag` values are helpful when you want to be able to quickly and conveniently run a set of related tasks via `--tasks my_tag_name`, often, we wish to implement more complex logic. For example, the MMLU benchmark contains 57 *subtasks* that must all be *averaged* together in order to report a final 'MMLU score'.
+
+Groupings of tasks might also use particular variants of a task--for example, we might want to default to evaluating a task as 5-shot when called as part of a given grouping, but not have a preference for number of shots when evaluating it as a standalone.
+
+We implement this via **groups**, which are distinct from tags. Groups can be implemented via *group config* YAML files, which are laid out similarly but slightly differently to tasks' YAML configs.
+
+The most basic form of group can be defined via a YAML config similar to the following:
+
+```yaml
+group: nli_tasks
+task:
+  - cb
+  - anli_r1
+  - rte
+metadata:
+  version: 1.0
+```
+
+This will behave almost identically to a `tag` that includes these 3 tasks, but with one key distinction: we'll print the `nli_tasks` group as a row (with no associated metrics) in our table of outputs, and visually show that these 3 tasks appear under its subheader.
+
+
+Now, let's assume we actually want to report an aggregate score for `nli_tasks`. We would instead use a YAML config like the following:
+
+```yaml
+group: nli_tasks
+task:
+  - cb
+  - anli_r1
+  - rte
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: true # defaults to `true`. Set this to `false` to do a "macro" average (taking each subtask's average accuracy, and summing those accuracies and dividing by 3)--by default we do a "micro" average (retain all subtasks' per-document accuracies, and take the mean over all documents' accuracies to get our aggregate mean).
+metadata:
+  version: 1.0
+```
+
+Similar to our `metric_list` for listing out the metrics we want to calculate for a given task, we use an `aggregate_metric_list` field to specify which metric name to aggregate across subtasks, what aggregation function to use, and whether we should micro- or macro- average these metrics. See [./task_guide.md](./task_guide.md) for a full list of related sub-keys.
+
+**[!Tip]: currently, we predominantly only support the aggregation of group metrics that use `mean` (either micro- or macro- averaged) over their subtasks. If you require even more complex aggregation rules, you may want to perform aggregation offline.**
+
+Group configs can be fairly complex! We can do various operations, such as defining new subtask(s) inline in our group YAML, overriding an existing task's specific config value, or nesting existing groups within our new group.

 For example, let's build a config for evaluating MMLU and a few natural language inference tasks. For MMLU, we can write the name for the benchmark as a subtask written under `task`. You can configure the parameters such as `num_fewshot`. If the task being configured is a group such as `mmlu` or `super_glue`, the parameter set will be applied to all of the subtasks.

@@ -331,33 +373,13 @@ task:
      - cb
      - anli_r1
      - rte
+    aggregate_metric_list:
+      - metric: acc
+        aggregation: mean
+        higher_is_better: true
  - task: mmlu
    num_fewshot: 2
 ```
-It's also important to note how you can basically insert a group config as a task. Here, to make a group of natural language inference tasks, you simply write like how you would normally write a group config but this time place that as part of a task list under the main group being built.
-
-### Duplicate Tasks in Group Configs
-
-There might be cases where you might want to evaluate prompts and how models perform over prompt variations. You can list an existing task (In the example below, `anli_r1`) which varying `doc_to_text` implementation. To differentiate from each variation, we can utilize `task_alias`. LM-Eval will recognize that there are multiple variations of the same tasks and differentiate them.
-```yaml
-group: flan_held_in
-group_alias: Flan (Held-In)
-task:
-  # ANLI R1
-  - group: anli_r1_flan
-    group_alias: ANLI R1
-    task:
-      - task: anli_r1
-        task_alias: prompt-0
-        include: _held_in_template_yaml
-        doc_to_text: "{{premise}}\n\nChoose your answer ..."
-        ...
-      - task: anli_r1
-        task_alias: prompt-1
-        include: _held_in_template_yaml
-        doc_to_text: "{{premise}}\n\nBased on ..."
-      ...
-```

 ### Configuring python classes

@@ -382,21 +404,29 @@ task:
  ...
 ```

+You can also pass a custom argument to your class by accepting `config` in the custom class constructor.
+Here's how to do it:
+
+```yaml
+task: 20_newsgroups
+class: !function task.Unitxt
+recipe: card=cards.20_newsgroups,template=templates.classification.multi_class.title
+```
+
+In this example, `recipe` is the custom argument for the `Unitxt` class.
+
 ## Beautifying Table Display

-To avoid conflict, each task needs to be registered with a unique name. Because of this, slight variations of task are still counted as unique tasks and need to be named uniquely. This could be done by appending an additional naming that may refer to the variation such as in MMLU where the template used to evaluated for flan are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation especially when you have a long list of tasks or are using a benchmark that comprises of many tasks. To make it more legible, you can use `task_alias` and `group_alias` to provide an alternative task name and group name that will be printed. For example in `mmlu_abstract_algebra.yaml` we set `group_alias` to `stem` and `task_alias` to `abstract_algebra`.
+To avoid conflict, each task needs to be registered with a unique name. Because of this, slight variations of a task are still counted as unique tasks and need to be named uniquely. This could be done by appending an additional name that refers to the variation, such as in MMLU where the templates used to evaluate for flan are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation, especially when you have a long list of tasks or are using a benchmark that comprises many tasks. To make it more legible, you can use `task_alias` and `group_alias` to provide an alternative task name and group name that will be printed. For example in `mmlu_abstract_algebra.yaml` we set `task_alias` to `abstract_algebra`. In group configs, a `group_alias` for a group can also be set.

 ```
 "dataset_name": "abstract_algebra"
 "description": "The following are multiple choice questions (with answers) about abstract\
  \ algebra.\n\n"
-"group": "mmlu_stem"
-"group_alias": "stem"
 "include": "_default_template_yaml"
 "task": "mmlu_abstract_algebra"
 "task_alias": "abstract_algebra"
 ```
-Note: Even though `group` can be a list, for now, `group_alias` can only be a single string.

 ## Checking validity

@@ -416,9 +446,9 @@ a simple eye test.

 ## Versioning

-One key feature in LM Evaluation Harness is the ability to version tasks--that is, mark them with a specific version number that can be bumped whenever a breaking change is made.
+One key feature in LM Evaluation Harness is the ability to version tasks and groups--that is, mark them with a specific version number that can be bumped whenever a breaking change is made.

-This version info can be provided by adding the following to your new task config file:
+This version info can be provided by adding the following to your new task or group config file:

 ```
 metadata:

--- a/docs/task_guide.md
+++ b/docs/task_guide.md
@@ -16,7 +16,8 @@ Tasks are configured via the `TaskConfig` object. Below, we describe all fields

 Task naming + registration:
 - **task** (`str`, defaults to None) — name of the task.
- **group** (`str`, *optional*) — name of the task group(s) a task belongs to. Enables one to run all tasks with a specified tag or group name at once.
+- **task_alias** (`str`, defaults to None) - Alias of the task name that will be printed in the final table results.
+- **tag** (`str`, *optional*) — name of the task tag(s) a task belongs to. Enables one to run all tasks with a specified tag name at once.

 Dataset configuration options:
 - **dataset_path** (`str`) — The name of the dataset as listed by HF in the datasets Hub.
@@ -55,8 +56,6 @@ Other:

 ## Filters

-Explain: What are filters? What is their place in the pipeline?
-
 A key component of the `lm-evaluation-harness` library is the `Filter` object. In a typical evaluation run of the harness, we take the formatted inputs and run them through our LM, with the appropriate output type (greedy or free-form generation, or loglikelihood-based comparative scoring).

 After getting scores or output text from our LM on each `Instance` or document in the dataset, we then need to feed these responses into a metric or scoring function to return scores to a user.
@@ -295,105 +294,24 @@ Generative tasks:
 Tasks using complex filtering:
 - GSM8k with CoT (+ with Self-Consistency): (`lm_eval/tasks/gsm8k/gsm8k-cot.yaml` ; `lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml`)

-
-## Benchmarks
+# Group Configuration

 When evaluating a language model, it is not unusual to test across a number of tasks that may not be related to one another in order to assess a variety of capabilities. To this end, it may be cumbersome to have to list the set of tasks or add a new group name to each yaml of each individual task.

-To solve this, we can create a benchmark yaml config. This is a config that contains the names of the tasks that should be included in a particular benchmark. The config consists of two main keys `group` which denotes the name of the benchmark and `task` which is where we can list the tasks. The tasks listed in `task` are the task names that have been registered. A good example would be the list of tasks used to evaluate the Pythia Suite.
-
-```yaml
-group: pythia
-task:
-  - lambada_openai
-  - wikitext
-  - piqa
-  - sciq
-  - wsc
-  - winogrande
-  - arc
-  - logiqa
-  - blimp
-  - hendrycksTest*
-```
-
-It is also possible to list an existing task in your benchmark configuration with some adjustments. For example, a few tasks from mmlu is included `multimedqa`. There, the `task_alias` and `group_alias` (See [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#beautifying-table-display) for more details) are modified to suit the benchmark.
-
-```yaml
-group: multimedqa
-task:
-  - pubmedqa
-  - medmcqa
-  - medqa_4options
-  - task: mmlu_anatomy
-    task_alias: "anatomy (mmlu)"
-    group_alias: null
-  - task: mmlu_clinical_knowledge
-    task_alias: "clinical_knowledge (mmlu)"
-    group_alias: null
-  ...
-```
+To solve this, we can create a **group** yaml config. This is a config that contains the names of the tasks that should be included in a particular group. The config consists of two main keys: a `group` key which denotes the name of the group (as it would be called from the command line, e.g. `mmlu`) and a `task` key which is where we can list the tasks. The tasks listed in `task` are the task names that have been registered. A good example of a group yaml config can be found at [`lm_eval/tasks/mmlu/default/_mmlu.yaml`](../lm_eval/tasks/mmlu/default/_mmlu.yaml). See also the [New Task Guide](./new_task_guide.md) for a more in-depth and tutorial-esque explanation of how to write complex GroupConfigs.

-Alternatively, benchmarks can have tasks that are customizable for each task. They can be defined like how a yaml task is usually set.
+## Configurations

-```yaml
-group: t0_eval
-task:
-  # Coreference Resolution
-  - dataset_path: super_glue
-    dataset_name: wsc.fixed
-    use_prompt: promptsource:*
-    training_split: train
-    validation_split: validation
-    metric_list:
-      - metric: exact_match
-        aggregation: mean
-        higher_is_better: true
-        ignore_case: true
-        ignore_punctuation: true
-  # Coreference Resolution
-  - dataset_path: winogrande
-    dataset_name: winogrande_xl
-    use_prompt: promptsource:*
-    training_split: train
-    validation_split: validation
-    metric_list:
-      - metric: exact_match
-        aggregation: mean
-        higher_is_better: true
-        ignore_case: true
-        ignore_punctuation: true
-  ...
-```
+Groups are configured via the `GroupConfig` object. Below, we describe all fields usable within the object, and their role in defining a group.

-If the benchmark contains the same dataset but with different configurations, use `task` to differentiate between them. For example, T0-Eval evaluates on 3 versions of ANLI but the huggingface dataset collects them in one dataset.
-
-```YAML
-group: t0_eval
-task:
-  ...
-  - task: anli_r1
-    dataset_path: anli
-    use_prompt: promptsource:*
-    training_split: train_r1
-    validation_split: dev_r1
-    metric_list:
-      - metric: exact_match
-        aggregation: mean
-        higher_is_better: true
-        ignore_case: true
-        ignore_punctuation: true
-  - task: anli_r2
-    dataset_path: anli
-    use_prompt: promptsource:*
-    training_split: train_r2
-    validation_split: dev_r2
-    metric_list:
-      - metric: exact_match
-        aggregation: mean
-        higher_is_better: true
-        ignore_case: true
-        ignore_punctuation: true
-```
+### Parameters

-Calling the benchmark is done the same way we would call any task with `--tasks`. Benchmarks can be added in `lm_eval/tasks/benchmarks/`
+- **group** (`str`, defaults to `None`) — name of the group. Used to invoke it from the command line.
+- **group_alias** (`str`, defaults to `None`) - Alternative name for the group that will be printed in the table output.
+- **task** (`Union[str, list]`, defaults to `None`) - List of tasks that constitute the group.
+- **aggregate_metric_list** (`list`, defaults to `None`) - similar to `metric_list` in TaskConfigs, provide a list of configurations for metrics that should be aggregated across subtasks. Leaving empty will result in no aggregation being performed for this group. Keys for each list entry are:
+  - `metric: str` - the name of the metric to aggregate over (all subtasks must report a metric holding this name.)
+  - `aggregation: str` - what aggregation function to apply to aggregate these per-subtask metrics.  **currently, only `mean` is supported.**
+  - `weight_by_size: bool = True` - whether to perform micro- averaging (`True`) or macro- (`False`) averaging of subtasks' accuracy scores when reporting the group's metric. MMLU, for example, averages over per-document accuracies (the *micro average*), resulting in the same accuracy as if one simply concatenated all 57 subjects into a single dataset and evaluated accuracy on that dataset.
+  - `filter_list: Union[str, List[str]] = "none"` - what filter keys one should match on to aggregate results. For example, if trying to aggregate over the `exact_match` metric using `strict-match` filter for `bbh_cot_zeroshot`, then set this to be `filter_list: "strict-match"`.  
+- **metadata** (`dict`, *optional*) - As with TaskConfigs, a field where extra config metadata can be passed. set the `num_fewshot` key within this to override the printed n_shot value in a results table for your group, for example.
--- a/examples/lm-eval-overview.ipynb
+++ b/examples/lm-eval-overview.ipynb
--- a/examples/visualize-wandb.ipynb
+++ b/examples/visualize-wandb.ipynb
@@ -68,6 +68,7 @@
   "source": [
    "import wandb\n",
    "\n",
+    "\n",
    "wandb.login()"
   ]
  },
@@ -110,13 +111,15 @@
   "cell_type": "markdown",
   "id": "e974cabdbe70b667",
   "metadata": {},
-   "source": ""
+   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "5178ca9445b844e4",
   "metadata": {},
-   "source": "W&B can also be initialized programmatically for use outside the CLI to parse and log the results."
+   "source": [
+    "W&B can also be initialized programmatically for use outside the CLI to parse and log the results."
+   ]
  },
  {
   "cell_type": "code",
@@ -126,7 +129,8 @@
   "outputs": [],
   "source": [
    "import lm_eval\n",
-    "from lm_eval.logging_utils import WandbLogger\n",
+    "from lm_eval.loggers import WandbLogger\n",
+    "\n",
    "\n",
    "results = lm_eval.simple_evaluate(\n",
    "    model=\"hf\",\n",

--- a/lm_eval/__main__.py
+++ b/lm_eval/__main__.py
@@ -73,7 +73,7 @@ def setup_parser() -> argparse.ArgumentParser:
        default=None,
        type=str,
        metavar="task1,task2",
-        help="To get full list of tasks, use the command lm-eval --tasks list",
+        help="Comma-separated list of task names or task groupings to evaluate on.\nTo get full list of tasks, use one of the commands `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above",
    )
    parser.add_argument(
        "--model_args",
@@ -170,9 +170,16 @@ def setup_parser() -> argparse.ArgumentParser:
    )
    parser.add_argument(
        "--apply_chat_template",
-        action="store_true",
+        type=str,
+        nargs="?",
+        const=True,
        default=False,
-        help="If True, applies the chat template to the prompt",
+        help=(
+            "If True, apply chat template to the prompt. "
+            "Providing `--apply_chat_template` without an argument will apply the default chat template to the prompt. "
+            "To apply a specific template from the available list of templates, provide the template name as an argument. "
+            "E.g. `--apply_chat_template template_name`"
+        ),
    )
    parser.add_argument(
        "--fewshot_as_multiturn",
@@ -289,14 +296,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:

    if args.fewshot_as_multiturn and args.apply_chat_template is False:
        raise ValueError(
-            "If fewshot_as_multiturn is set, apply_chat_template must be set to True."
-        )
-
-    if (
-        args.num_fewshot is None or args.num_fewshot == 0
-    ) and args.fewshot_as_multiturn:
-        raise ValueError(
-            "If fewshot_as_multiturn is set, num_fewshot must be greater than 0."
+            "When `fewshot_as_multiturn` is selected, `apply_chat_template` must be set (either to `True` or to the chosen template name)."
        )

    if args.include_path is not None:
@@ -318,9 +318,16 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
        eval_logger.error("Need to specify task to evaluate.")
        sys.exit()
    elif args.tasks == "list":
-        eval_logger.info(
-            "Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks))
-        )
+        print(task_manager.list_all_tasks())
+        sys.exit()
+    elif args.tasks == "list_groups":
+        print(task_manager.list_all_tasks(list_subtasks=False, list_tags=False))
+        sys.exit()
+    elif args.tasks == "list_tags":
+        print(task_manager.list_all_tasks(list_groups=False, list_subtasks=False))
+        sys.exit()
+    elif args.tasks == "list_subtasks":
+        print(task_manager.list_all_tasks(list_groups=False, list_tags=False))
        sys.exit()
    else:
        if os.path.isdir(args.tasks):
@@ -349,16 +356,22 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
                    f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
                )
                raise ValueError(
-                    f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues."
+                    f"Tasks not found: {missing}. Try `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above, or pass '--verbosity DEBUG' to troubleshoot task registration issues."
                )

    # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
    if args.trust_remote_code:
-        os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = str(args.trust_remote_code)
-        args.model_args = (
-            args.model_args
-            + f",trust_remote_code={os.environ['HF_DATASETS_TRUST_REMOTE_CODE']}"
+        eval_logger.info(
+            "Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true`"
        )
+        # HACK: import datasets and override its HF_DATASETS_TRUST_REMOTE_CODE value internally,
+        # because it's already been determined based on the prior env var before launching our
+        # script--`datasets` gets imported by lm_eval internally before these lines can update the env.
+        import datasets
+
+        datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
+
+        args.model_args = args.model_args + ",trust_remote_code=True"

    eval_logger.info(f"Selected Tasks: {task_names}")


--- a/lm_eval/api/group.py
+++ b/lm_eval/api/group.py
+import abc
+from dataclasses import asdict, dataclass
+from inspect import getsource
+from typing import Any, Callable, List, Optional, Union
+
+
+@dataclass
+class AggMetricConfig(dict):
+    metric: Optional[str] = None
+    aggregation: Optional[str] = "mean"
+    weight_by_size: Optional[str] = False
+    # list of filter names which should be incorporated into the aggregated metric.
+    filter_list: Optional[Union[str, list]] = "none"
+
+    def __post_init__(self):
+        if self.aggregation != "mean" and not callable(self.aggregation):
+            raise ValueError(
+                f"Currently, 'mean' is the only pre-defined aggregation across groups' subtasks. Got '{self.aggregation}'."
+            )
+
+        if isinstance(self.filter_list, str):
+            self.filter_list = [self.filter_list]
+
+
+@dataclass
+class GroupConfig(dict):
+    group: Optional[str] = None
+    group_alias: Optional[str] = None
+    task: Optional[Union[str, list]] = None
+    aggregate_metric_list: Optional[
+        Union[List[AggMetricConfig], AggMetricConfig, dict]
+    ] = None
+    metadata: Optional[dict] = (
+        None  # by default, not used in the code. allows for users to pass arbitrary info to tasks
+    )
+
+    def __getitem__(self, item):
+        return getattr(self, item)
+
+    def __setitem__(self, item, value):
+        return setattr(self, item, value)
+
+    def __post_init__(self):
+        if self.aggregate_metric_list is not None:
+            if isinstance(self.aggregate_metric_list, dict):
+                self.aggregate_metric_list = [self.aggregate_metric_list]
+
+            self.aggregate_metric_list = [
+                AggMetricConfig(**item) if isinstance(item, dict) else item
+                for item in self.aggregate_metric_list
+            ]
+
+    def to_dict(self, keep_callable: bool = False) -> dict:
+        """dumps the current config as a dictionary object, as a printable format.
+        null fields will not be printed.
+        Used for dumping results alongside full task configuration
+
+        :return: dict
+            A printable dictionary version of the TaskConfig object.
+
+        # TODO: should any default value in the TaskConfig not be printed?
+        """
+        cfg_dict = asdict(self)
+        # remove values that are `None`
+        for k, v in list(cfg_dict.items()):
+            if callable(v):
+                cfg_dict[k] = self.serialize_function(v, keep_callable=keep_callable)
+        return cfg_dict
+
+    def serialize_function(
+        self, value: Union[Callable, str], keep_callable=False
+    ) -> Union[Callable, str]:
+        """Serializes a given function or string.
+
+        If 'keep_callable' is True, the original callable is returned.
+        Otherwise, attempts to return the source code of the callable using 'getsource'.
+        """
+        if keep_callable:
+            return value
+        else:
+            try:
+                return getsource(value)
+            except (TypeError, OSError):
+                return str(value)
+
+
+class ConfigurableGroup(abc.ABC):
+    def __init__(
+        self,
+        config: Optional[dict] = None,
+    ) -> None:
+        self._config = GroupConfig(**config)
+
+    @property
+    def group(self):
+        return self._config.group
+
+    @property
+    def group_alias(self):
+        return self._config.group_alias
+
+    @property
+    def version(self):
+        return self._config.version
+
+    @property
+    def config(self):
+        return self._config.to_dict()
+
+    @property
+    def group_name(self) -> Any:
+        return self._config.group
+
+    def __repr__(self):
+        return (
+            f"ConfigurableGroup(group={self.group}," f"group_alias={self.group_alias})"
+        )
--- a/lm_eval/api/metrics.py
+++ b/lm_eval/api/metrics.py
 import logging
 import math
 import random
+import re
+import string
 from collections.abc import Iterable
 from typing import List

-import evaluate as hf_evaluate
 import numpy as np
 import sacrebleu
-import sklearn.metrics

 from lm_eval.api.registry import register_aggregation, register_metric

@@ -50,21 +50,24 @@ def bits_per_byte(items):

 @register_aggregation("f1")
 def f1_score(items):
+    from sklearn.metrics import f1_score
+
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
-    fscore = sklearn.metrics.f1_score(golds, preds)
+    fscore = f1_score(golds, preds)

    return np.max(fscore)


 @register_aggregation("matthews_corrcoef")
 def matthews_corrcoef(items):
+    from sklearn.metrics import matthews_corrcoef
+
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
-    # print(preds)
-    return sklearn.metrics.matthews_corrcoef(golds, preds)
+    return matthews_corrcoef(golds, preds)


 @register_aggregation("bleu")
@@ -166,7 +169,60 @@ def acc_mutual_info_fn(items):  # This is a passthrough function
    return items


-exact_match = hf_evaluate.load("exact_match")
+### the code used in the `exact_match_hf_evaluate` function is ported from
+### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py
+### which is under the apache license.
+
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+def exact_match_hf_evaluate(
+    predictions,
+    references,
+    regexes_to_ignore=None,
+    ignore_case=False,
+    ignore_punctuation=False,
+    ignore_numbers=False,
+):
+    if regexes_to_ignore is not None:
+        for s in regexes_to_ignore:
+            predictions = np.array([re.sub(s, "", x) for x in predictions])
+            references = np.array([re.sub(s, "", x) for x in references])
+    else:
+        predictions = np.asarray(predictions)
+        references = np.asarray(references)
+
+    if ignore_case:
+        predictions = np.char.lower(predictions)
+        references = np.char.lower(references)
+
+    if ignore_punctuation:
+        repl_table = string.punctuation.maketrans("", "", string.punctuation)
+        predictions = np.char.translate(predictions, table=repl_table)
+        references = np.char.translate(references, table=repl_table)
+
+    if ignore_numbers:
+        repl_table = string.digits.maketrans("", "", string.digits)
+        predictions = np.char.translate(predictions, table=repl_table)
+        references = np.char.translate(references, table=repl_table)
+
+    score_list = predictions == references
+
+    return {"exact_match": np.mean(score_list)}
+
+
+###


 @register_metric(
@@ -176,7 +232,7 @@ exact_match = hf_evaluate.load("exact_match")
    aggregation="mean",
 )
 def exact_match_fn(**kwargs):
-    return exact_match.compute(**kwargs)
+    return exact_match_hf_evaluate(**kwargs)


 @register_metric(