Merge branch 'EleutherAI:main' into main

da211969 · Jess · GitHub · 1b97e487 · 801322e0 · da211969
Unverified Commit da211969 authored Jun 28, 2024 by Jess Committed by GitHub Jun 28, 2024
20 changed files
--- a/.github/workflows/new_tasks.yml
+++ b/.github/workflows/new_tasks.yml
@@ -20,13 +20,13 @@ jobs:
        with:
          fetch-depth: 2  # OR "2" -> To retrieve the preceding commit.

-      # Uses the tj-actions/changed-files@v37 action to check for changes.
+      # Uses the tj-actions/changed-files action to check for changes.
      # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
      # The `files_yaml` input optionally takes a yaml string to specify filters,
      # and prepends the filter name to the standard output names.
      - name: Check task folders
        id: changed-tasks
-        uses: tj-actions/changed-files@v37.1.2
+        uses: tj-actions/changed-files@v44.5.2
        with:
          # tasks checks the tasks folder and api checks the api folder for changes
          files_yaml: |

--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -32,7 +32,7 @@ jobs:
      env:
        SKIP: "no-commit-to-branch,mypy"

-      uses: pre-commit/action@v3.0.0
+      uses: pre-commit/action@v3.0.1
 #       # mypy turned off for now
 #    - name: Lint with mypy
 #      run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
@@ -56,12 +56,37 @@ jobs:
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
-        pip install -e '.[dev,anthropic,sentencepiece,optimum,deepsparse,sparseml]' --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install -e '.[dev,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
 #         Install optional git dependencies
 #                pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
 #        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Test with pytest
-      run: python -m pytest --showlocals -s -vv -n=auto
+      run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py
+    - name: Archive artifacts
+      uses: actions/upload-artifact@v3
+      with:
+        name: output_results
+        path: |
+          test_logs/*
+  testmodels:
+    name: External LM Tests
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+    - name: Checkout Code
+      uses: actions/checkout@v4
+    - name: Set up Python 3.8
+      uses: actions/setup-python@v5
+      with:
+        python-version: 3.8
+        cache: pip
+        cache-dependency-path: pyproject.toml
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -e '.[dev,optimum,deepsparse,sparseml]' --extra-index-url https://download.pytorch.org/whl/cpu
+    - name: Test with pytest
+      run: python -m pytest tests/models --showlocals -s -vv
    - name: Archive artifacts
      uses: actions/upload-artifact@v3
      with:

--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -10,6 +10,7 @@ repos:
      - id: check-case-conflict
      - id: check-json
      - id: check-merge-conflict
+        args: [--assume-in-merge]
      - id: check-symlinks
      - id: check-yaml
        args: ["--unsafe"]
@@ -28,8 +29,7 @@ repos:
      - id: mixed-line-ending
        args: [--fix=lf]
  - repo: https://github.com/astral-sh/ruff-pre-commit
-    # Ruff version.
-    rev: v0.2.2
+    rev: v0.4.8
    hooks:
      # Run the linter.
      - id: ruff
@@ -38,7 +38,7 @@ repos:
        # Run the formatter.
      - id: ruff-format
  - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.6
+    rev: v2.3.0
    hooks:
      - id: codespell
        exclude: >
@@ -46,9 +46,9 @@ repos:
              .*\.json|ignore.txt|lm_eval/tasks/.*|.*yaml|.*\.ipynb
          )$
        args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
-  - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.5.1
-    hooks:
-    - id: mypy
-      additional_dependencies: [".[sentencepiece,multilingual,promptsource,gptq]", "types-PyYAML", "types-requests"]
-      exclude: ^tests/.*$
+#  - repo: https://github.com/pre-commit/mirrors-mypy
+#    rev: v1.5.1
+#    hooks:
+#    - id: mypy
+#      additional_dependencies: [".[sentencepiece,multilingual,promptsource,gptq]", "types-PyYAML", "types-requests"]
+#      exclude: ^tests/.*$
--- a/README.md
+++ b/README.md
@@ -49,6 +49,11 @@ pip install -e .
 We also provide a number of optional dependencies for extended functionality. A detailed table is available at the end of this document.

 ## Basic Usage
+### User Guide
+
+A user guide detailing the full list of supported arguments is provided [here](./docs/interface.md), and on the terminal by calling `lm_eval -h`. Alternatively, you can use `lm-eval` instead of `lm_eval`.
+
+A list of supported tasks (or groupings of tasks) can be viewed with `lm-eval --tasks list`. Task descriptions and links to corresponding subfolders are provided [here](./lm_eval/tasks/README.md).

 ### Hugging Face `transformers`

@@ -84,8 +89,6 @@ lm_eval --model hf \
    --batch_size auto:4
 ```

-The full list of supported arguments are provided [here](./docs/interface.md), and on the terminal by calling `lm_eval -h`. Alternatively, you can use `lm-eval` instead of `lm_eval`. A list of supported tasks can be viewed with `lm-eval --tasks list`.
-
 > [!Note]
 > Just like you can provide a local path to `transformers.AutoModel`, you can also provide a local path to `lm_eval` via `--model_args pretrained=/path/to/model`

@@ -307,7 +310,7 @@ To save evaluation results provide an `--output_path`. We also support logging m

 Additionally, one can provide a directory with `--use_cache` to cache the results of prior runs. This allows you to avoid repeated execution of the same (model, task) pairs for re-scoring.

-To push results and samples to the Hugging Face Hub, first ensure an access token with write access is set in the `HF_TOKEN` environment variable. Then, use the `--hf_hub_log_args` flag to specify the organization, repository name, repository visibility, and whether to push results and samples to the Hub - [example output](https://huggingface.co/datasets/KonradSzafer/lm-eval-results-demo/tree/main/microsoft__phi-2). For instance:
+To push results and samples to the Hugging Face Hub, first ensure an access token with write access is set in the `HF_TOKEN` environment variable. Then, use the `--hf_hub_log_args` flag to specify the organization, repository name, repository visibility, and whether to push results and samples to the Hub - [example dataset on the HF Hub](https://huggingface.co/datasets/KonradSzafer/lm-eval-results-demo). For instance:

 ```bash
 lm_eval --model hf \
@@ -318,6 +321,13 @@ lm_eval --model hf \
    --hf_hub_log_args hub_results_org=EleutherAI,hub_repo_name=lm-eval-results,push_results_to_hub=True,push_samples_to_hub=True,public_repo=False \
 ```

+This allows you to easily download the results and samples from the Hub, using:
+```python
+from datasets import load_dataset
+
+load_dataset("EleutherAI/lm-eval-results-private", "hellaswag", split="latest")
+```
+
 For a full list of supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md) guide in our documentation!

 ## Visualizing Results

--- a/docs/interface.md
+++ b/docs/interface.md
@@ -10,7 +10,7 @@ Equivalently, running the library can be done via the `lm-eval` entrypoint at th

 This mode supports a number of command-line arguments, the details of which can be also be seen via running with `-h` or `--help`:

- `--model` : Selects which model type or provider is evaluated. Must be a string corresponding to the name of the model type/provider being used. See [the main README](https://github.com/EleutherAI/lm-evaluation-harness/tree/main#commercial-apis) for a full list of enabled model names and supported libraries or APIs.
+- `--model` : Selects which model type or provider is evaluated. Must be a string corresponding to the name of the model type/provider being used. See [the main README](https://github.com/EleutherAI/lm-evaluation-harness/tree/main#model-apis-and-inference-servers) for a full list of enabled model names and supported libraries or APIs.

 - `--model_args` : Controls parameters passed to the model constructor. Accepts a string containing comma-separated keyword arguments to the model class of the format `"arg1=val1,arg2=val2,..."`, such as, for example `--model_args pretrained=EleutherAI/pythia-160m,dtype=float32`. For a full list of what keyword arguments, see the initialization of the `lm_eval.api.model.LM` subclass, e.g. [`HFLM`](https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/models/huggingface.py#L66)

@@ -42,20 +42,28 @@ This mode supports a number of command-line arguments, the details of which can

 - `--show_config` : If used, prints the full `lm_eval.api.task.TaskConfig` contents (non-default settings the task YAML file) for each task which was run, at the completion of an evaluation. Useful for when one is modifying a task's configuration YAML locally to transmit the exact configurations used for debugging or for reproducibility purposes.

- `--include_path` : Accepts a path to a folder. If passed, then all YAML files containing ` lm-eval`` compatible task configurations will be added to the task registry as available tasks. Used for when one is writing config files for their own task in a folder other than  `lm_eval/tasks/`
+- `--include_path` : Accepts a path to a folder. If passed, then all YAML files containing `lm-eval` compatible task configurations will be added to the task registry as available tasks. Used for when one is writing config files for their own task in a folder other than `lm_eval/tasks/`.
+
+- `--system_instruction`: Specifies a system instruction string to prepend to the prompt.
+
+- `--apply_chat_template` : If this flag is on, a chat template will be applied to the prompt. For Hugging Face models, the chat template is taken from the tokenizer; if the tokenizer does not have a chat template, a default one will be applied. For other models, chat templating is not currently implemented.
+
+- `--fewshot_as_multiturn` : If this flag is on, the few-shot examples are treated as a multi-turn conversation. Questions are provided as user content and answers are provided as assistant responses. Requires `--num_fewshot` to be set to a value greater than 0, and `--apply_chat_template` to be on.

 - `--predict_only`: Generates the model outputs without computing metrics. Use with `--log_samples` to retrieve decoded results.

 * `--seed`: Set seed for python's random, numpy and torch.  Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, or a single integer to set the same seed for all three.  The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234` (for backward compatibility).  E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`. Here numpy's seed is not set since the second value is `None`.  E.g, `--seed 42` sets all three seeds to 42.

-* `--wandb_args`:  Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list (here.)[https://docs.wandb.ai/ref/python/init]. e.g., ```--wandb_args project=test-project,name=test-run```
+* `--wandb_args`:  Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., ```--wandb_args project=test-project,name=test-run```

 * `--hf_hub_log_args` : Logs evaluation results to Hugging Face Hub. Accepts a string with the arguments separated by commas. Available arguments:
-    * `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`,
+    * `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`. If not provided, the results will be pushed to the owner of the Hugging Face token,
    * `hub_repo_name` - repository name on Hugging Face Hub, e.g., `lm-eval-results`,
    * `push_results_to_hub` - whether to push results to Hugging Face Hub, can be `True` or `False`,
    * `push_samples_to_hub` - whether to push samples results to Hugging Face Hub, can be `True` or `False`. Requires `--log_samples` to be set,
    * `public_repo` - whether the repository is public, can be `True` or `False`,
+    * `leaderboard_url` - URL to the leaderboard, e.g., `https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard`.
+    * `point_of_contact` - Point of contact for the results dataset, e.g., `yourname@example.com`.

 ## External Library Usage

@@ -84,7 +92,7 @@ task_manager = lm_eval.tasks.TaskManager()

 # Setting `task_manager` to the one above is optional and should generally be done
 # if you want to include tasks from paths other than ones in `lm_eval/tasks`.
-# `simple_evaluate` will instantiate its own task_manager is the it is set to None here.
+# `simple_evaluate` will instantiate its own task_manager if it is set to None here.
 results = lm_eval.simple_evaluate( # call simple_evaluate
    model=lm_obj,
    tasks=["taskname1", "taskname2"],
@@ -94,12 +102,10 @@ results = lm_eval.simple_evaluate( # call simple_evaluate
 )
 ```

-See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L35 for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously.
+See the `simple_evaluate()` and `evaluate()` functions in [lm_eval/evaluator.py](../lm_eval/evaluator.py#:~:text=simple_evaluate) for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously.

 Additionally, the `evaluate()` function offers the core evaluation functionality provided by the library, but without some of the special handling and simplification + abstraction provided by `simple_evaluate()`.

-See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L173 for more details.
-
 As a brief example usage of `evaluate()`:

 ```python
@@ -139,7 +145,7 @@ task_dict = lm_eval.tasks.get_task_dict(
    task_manager # A task manager that allows lm_eval to
                 # load the task during evaluation.
                 # If none is provided, `get_task_dict`
-                 # will instantiated one itself, but this
+                 # will instantiate one itself, but this
                 # only includes the stock tasks so users
                 # will need to set this if including
                 # custom paths is required.

--- a/docs/model_guide.md
+++ b/docs/model_guide.md
@@ -6,7 +6,7 @@ In order to properly evaluate a given LM, we require implementation of a wrapper

 ## Setup

-To get started contributing, go ahead and fork the main repo, clone it, create a branch with the name of your task, and install the project requirements in your environment:
+To get started contributing, go ahead and fork the main repo, clone it, create a branch with the name of your model, and install the project requirements in your environment:

 ```sh
 # After forking...
@@ -107,6 +107,53 @@ Using this decorator results in the class being added to an accounting of the us

 We also recommend that new model contributions be accompanied by short tests of their 3 core functionalities, at minimum. To see an example of such tests, look at https://github.com/EleutherAI/lm-evaluation-harness/blob/35bdecd379c0cefad6897e67db892f4a6026a128/tests/test_ggml.py .

+## Chat Templating
+
+Many models are fine-tuned with a [Chat Template](https://huggingface.co/docs/transformers/main/en/chat_templating) in order to enable back-and-forth interaction between a "User"'s queries and the responses of the model (often called the "Assistant"). It can be desirable to evaluate fine-tuned models on evaluation tasks while wrapped in the conversational format they expect.
+
+In order to make your model optionally compatible with a chat format, three additional methods must be implemented:
+
+```python
+class MyCustomLM(LM):
+    #...
+    @property
+    def tokenizer_name(self) -> str:
+        # should return a string denoting the name of the model's tokenizer and/or the accompanying chat template.
+
+    @property
+    def chat_template(self) -> str:
+        # should return a chat template formatting string that is used to build prompt from a user/assistant chat history.
+        # this will be saved in the evaluation results for reproducibility.
+
+    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
+        # responsible for taking as input a chat history that would be fed into the model, and
+        # rendering it as a string that can be then tokenized and input into the model.
+    #...
+```
+
+- `apply_chat_template`
+  - This method performs the bulk of the work required for chat-formatting.
+  - As input, a `chat_history: List[Dict[str, str]]` is passed in. This is a transcript of a conversation of a form similar to
+      ```
+      [
+        {"system": <user-provided system message such as "You are a helpful math-focused chatbot">},
+        {"user": <task example - a few-shot example 'input'>},
+        {"assistant": <correct response to the above example>},
+        # ... more few-shot examples, potentially
+        {"user": <test set query--response on which we will evaluate>},
+      ]
+      ```
+      which can then be converted into a string input.
+  - The output is a string representing this conversation that can be fed into the model.
+  - For example, this consists of simply calling `tokenizer.apply_chat_template` for HFLM--see the implementation there for reference.
+- `tokenizer_name`
+  - LM Eval Harness supports [caching requests](https://github.com/EleutherAI/lm-evaluation-harness/blob/4902aaaf1f374682f95ac25fe2e13b23faddc91a/lm_eval/__main__.py#L140) that are sent to a model, for faster setup when repeating an already-performed evaluation.
+  - However, we don't want to use the cache of chat transcripts rendered using one chat template or system prompt to send to a model with a different template! So, we use this `lm.tokenizer_name` string to distinguish caches for a given model (and chat template) from one another.
+- `chat_template`
+  - Chat templates are typically provided as a Jinja template string or a string formatted with str.format to include user and assistant messages in a single prompt. This template string is saved in the evaluation results to ensure reproducibility.
+
+If not implemented for a given model type, the flags `--apply_chat_template`, `--fewshot_as_multiturn`, and `--system_instruction` cannot be used.
+
 ## Other

 **Pro tip**: In order to make the Evaluation Harness overestimate total runtimes rather than underestimate it, HuggingFace models come in-built with the ability to provide responses on data points in *descending order by total input length* via `lm_eval.utils.Reorderer`. Take a look at `lm_eval.models.hf_causal.HFLM` to see how this is done, and see if you can implement it in your own model!

--- a/docs/new_task_guide.md
+++ b/docs/new_task_guide.md
@@ -35,7 +35,7 @@ and rename the folders and YAML file(s) as desired.

 ### Selecting and configuring a dataset

-All data downloading and management is handled through the HuggingFace (**HF**) [`datasets`](https://github.com/huggingface/datasets) API. So, the first thing you should do is check to see if your task's dataset is already provided in their catalog [here](https://huggingface.co/datasets). If it's not in there, please consider adding it to their Hub to make it accessible to a wider user base by following their [new dataset guide](https://github.com/huggingface/datasets/blob/master/ADD_NEW_DATASET.md)
+All data downloading and management is handled through the HuggingFace (**HF**) [`datasets`](https://github.com/huggingface/datasets) API. So, the first thing you should do is check to see if your task's dataset is already provided in their catalog [here](https://huggingface.co/datasets). If it's not in there, please consider adding it to their Hub to make it accessible to a wider user base by following their [new dataset guide](https://github.com/huggingface/datasets/blob/main/ADD_NEW_DATASET.md)
 .

 Once you have a HuggingFace dataset prepared for your task, we want to assign our new YAML to use this dataset:
@@ -59,7 +59,25 @@ We can also specify from which split the task should retrieve few-shot examples
 ```yaml
 fewshot_split: <split name to draw fewshot examples from, or `null`>
 ```
-though if this is not set, we will default to train/validation/test sets, in that order.
+or by hardcoding them, either using the following in the yaml file:
+```yaml
+fewshot_config:
+  sampler: first_n
+  samples: [
+    {<sample 1>},
+    {<sample 2>},
+  ]
+```
+or by adding the function `list_fewshot_samples` in the associated utils.py file:
+```python
+def list_fewshot_samples() -> list[dict]:
+  return [{<sample 1>}, {<sample 2>}]
+```
+See `lm_eval/tasks/minerva_math/minerva_math_algebra.yaml` for an example of the latter, and `lm_eval/tasks/gsm8k/gsm8k-cot.yaml` for an example of the former.
+
+In this case, each sample must contain the same fields as the samples in the above sets--for example, if `doc_to_text` expects an `input` field when rendering input prompts, these provided samples must include an `input` key.
+
+If neither of the above options is set, we will default to train/validation/test sets, in that order.


 Finally, our dataset may not be already in the exact format we want. Maybe we have to strip whitespace and special characters via a regex from our dataset's "question" field! Or maybe we just want to rename its columns to match a convention we'll be using for our prompts.
@@ -172,7 +190,7 @@ doc_to_target: "{{answer}}"
 ```


-**Important**: we now add `target_delimiter` between input and target which defaults to " ", such that the full input-output string is `doc_to_target(doc) + target_delimiter + doc_to_text(doc)`. doc_to_text and doc_to_target should not contain trailing right or left whitespace, respectively.
+**Important**: we now add `target_delimiter` between input and target which defaults to " ", such that the full input-output string is `doc_to_text(doc) + target_delimiter + doc_to_target(doc)`. `doc_to_text` and `doc_to_target` should not contain trailing or leading whitespace, respectively.


 #### Multiple choice format
@@ -213,7 +231,7 @@ def wikitext_detokenizer(doc):
    return string
 ```

-We can load this function in `doc_to_target` by using a `!function` operator after `doc_to_target` and followed by `<file name>.<function name>`. In the file [wikitext.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/6ae376e3a43caa58b95bb8aa73054a94827bf560/lm_eval/tasks/wikitext/wikitext.yaml) we write:
+We can load this function in `doc_to_target` by using a `!function` operator after `doc_to_target` and followed by `<file name>.<function name>`. In the file [wikitext.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/wikitext/wikitext.yaml) we write:
 ```
 doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
 ```
@@ -366,9 +384,7 @@ task:

 ## Beautifying Table Display

-To avoid conflict, each task needs to be registered with a unique name. Because of this, slight variations of task are still counted as unique tasks and need to be named uniquely. This could be done by appending an additional naming that may refer to the variation such as in MMLU where the template used to evaluated for flan are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation especially when you have a long list of tasks or are using a benchmark that comprises of many tasks. To make it more legible, you can use `task_alias` and `group_alias` to provide an alternative task name and group name that will be printed.
-``
-for example in `mmlu_abstract_algebra.yaml` we set `group_alias` to `stem` and `task_alias` to `abstract_algebra`.
+To avoid conflict, each task needs to be registered with a unique name. Because of this, slight variations of a task are still counted as unique tasks and need to be named uniquely. This can be done by adding a name component that refers to the variation, such as in MMLU, where the templates used to evaluate for flan are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation, especially when you have a long list of tasks or are using a benchmark that comprises many tasks. To make it more legible, you can use `task_alias` and `group_alias` to provide an alternative task name and group name that will be printed. For example, in `mmlu_abstract_algebra.yaml` we set `group_alias` to `stem` and `task_alias` to `abstract_algebra`.

 ```
 "dataset_name": "abstract_algebra"
@@ -440,6 +456,8 @@ If other tasks on this dataset are already supported:

 It is recommended to include a filled-out copy of this checklist in the README.md for the subfolder you are creating, if you have created a new subfolder in `lm_eval/tasks`.

+**Finally, please add a short description of your task(s), along with a link to its subfolder in `lm_eval/tasks`, to [`lm_eval/tasks/README.md`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/README.md) so that users can discover your task in the library, and follow the link to your README for more information about the variants supported, their task names, and the original source of the dataset and/or evaluation setup.**
+
 ## Submitting your task

 You're all set! Now push your work and make a pull request to the `main` branch! Thanks for the contribution :). If there are any questions, please leave a message in the `#lm-thunderdome` channel on the EAI discord!
--- a/docs/task_guide.md
+++ b/docs/task_guide.md
@@ -31,8 +31,8 @@ Dataset configuration options:
 Prompting / in-context formatting options:
 - **use_prompt** (`str`, *optional*) — Name of prompt in promptsource to use. if defined, will overwrite doc_to_text, doc_to_target, and doc_to_choice.
 - **description** (`str`, *optional*) — An optional prepended Jinja2 template or string which will be prepended to the few-shot examples passed into the model, often describing the task or providing instructions to a model, such as `"The following are questions (with answers) about {{subject}}.\n\n"`. No delimiters or spacing are inserted between the description and the first few-shot example.
- **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate input for the model
- **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into
+- **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate input for the model.
+- **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into the answer choice list of the correct answer.
 - **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
 - **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples.
 - **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.
@@ -190,7 +190,7 @@ You can base a YAML on another YAML file as a template. This can be handy when y
 include: <YAML filename or with full path>
 ...
 ```
-You can find an example of how to use this feature at [gsm8k-cot-self-consistency.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/3c07cc04a92fc467d7c9a94894aeddd58c93a5da/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml) where it is based off [gsm8k-cot.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/3c07cc04a92fc467d7c9a94894aeddd58c93a5da/lm_eval/tasks/gsm8k/gsm8k-cot.yaml)
+You can find an example of how to use this feature at [gsm8k-cot-self-consistency.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml) where it is based off [gsm8k-cot.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k-cot.yaml)


 ## Passing Arguments to Metrics

--- a/lm_eval/__main__.py
+++ b/lm_eval/__main__.py
@@ -8,7 +8,7 @@ from typing import Union

 from lm_eval import evaluator, utils
 from lm_eval.evaluator import request_caching_arg_to_dict
-from lm_eval.logging import EvaluationTracker, WandbLogger
+from lm_eval.loggers import EvaluationTracker, WandbLogger
 from lm_eval.tasks import TaskManager
 from lm_eval.utils import handle_non_serializable, make_table, simple_parse_args_string

@@ -162,6 +162,24 @@ def setup_parser() -> argparse.ArgumentParser:
        default=False,
        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
    )
+    parser.add_argument(
+        "--system_instruction",
+        type=str,
+        default=None,
+        help="System instruction to be used in the prompt",
+    )
+    parser.add_argument(
+        "--apply_chat_template",
+        action="store_true",
+        default=False,
+        help="If True, applies the chat template to the prompt",
+    )
+    parser.add_argument(
+        "--fewshot_as_multiturn",
+        action="store_true",
+        default=False,
+        help="If True, uses the fewshot as a multi-turn conversation",
+    )
    parser.add_argument(
        "--show_config",
        action="store_true",
@@ -219,7 +237,7 @@ def setup_parser() -> argparse.ArgumentParser:
        help=(
            "Set seed for python's random, numpy, torch, and fewshot sampling.\n"
            "Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, "
-            "respectively, or a single integer to set the same seed for all three.\n"
+            "respectively, or a single integer to set the same seed for all four.\n"
            f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` "
            "(for backward compatibility).\n"
            "E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. "
@@ -255,13 +273,12 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # update the evaluation tracker args with the output path and the HF token
-    args.hf_hub_log_args = f"output_path={args.output_path},token={os.environ.get('HF_TOKEN')},{args.hf_hub_log_args}"
+    if args.output_path:
+        args.hf_hub_log_args += f",output_path={args.output_path}"
+    if os.environ.get("HF_TOKEN", None):
+        args.hf_hub_log_args += f",token={os.environ.get('HF_TOKEN')}"
    evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args)
    evaluation_tracker = EvaluationTracker(**evaluation_tracker_args)
-    evaluation_tracker.general_config_tracker.log_experiment_args(
-        model_source=args.model,
-        model_args=args.model_args,
-    )

    if args.predict_only:
        args.log_samples = True
@@ -270,17 +287,22 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
            "Specify --output_path if providing --log_samples or --predict_only"
        )

-    if args.include_path is not None:
-        eval_logger.info(f"Including path: {args.include_path}")
-    task_manager = TaskManager(args.verbosity, include_path=args.include_path)
+    if args.fewshot_as_multiturn and args.apply_chat_template is False:
+        raise ValueError(
+            "If fewshot_as_multiturn is set, apply_chat_template must be set to True."
+        )

    if (
-        "push_results_to_hub" in evaluation_tracker_args
-        or "push_samples_to_hub" in evaluation_tracker_args
-    ) and "hub_results_org" not in evaluation_tracker_args:
+        args.num_fewshot is None or args.num_fewshot == 0
+    ) and args.fewshot_as_multiturn:
        raise ValueError(
-            "If push_results_to_hub or push_samples_to_hub is set, results_org must be specified."
+            "If fewshot_as_multiturn is set, num_fewshot must be greater than 0."
        )
+
+    if args.include_path is not None:
+        eval_logger.info(f"Including path: {args.include_path}")
+    task_manager = TaskManager(args.verbosity, include_path=args.include_path)
+
    if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
        eval_logger.warning(
            "Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub."
@@ -332,11 +354,17 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:

    # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
    if args.trust_remote_code:
-        os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = str(args.trust_remote_code)
-        args.model_args = (
-            args.model_args
-            + f",trust_remote_code={os.environ['HF_DATASETS_TRUST_REMOTE_CODE']}"
+        eval_logger.info(
+            "Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true`"
        )
+        # HACK: import datasets and override its HF_DATASETS_TRUST_REMOTE_CODE value internally,
+        # because it's already been determined based on the prior env var before launching our
+        # script--`datasets` gets imported by lm_eval internally before these lines can update the env.
+        import datasets
+
+        datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
+
+        args.model_args = args.model_args + ",trust_remote_code=True"

    eval_logger.info(f"Selected Tasks: {task_names}")

@@ -357,6 +385,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
        check_integrity=args.check_integrity,
        write_out=args.write_out,
        log_samples=args.log_samples,
+        evaluation_tracker=evaluation_tracker,
+        system_instruction=args.system_instruction,
+        apply_chat_template=args.apply_chat_template,
+        fewshot_as_multiturn=args.fewshot_as_multiturn,
        gen_kwargs=args.gen_kwargs,
        task_manager=task_manager,
        verbosity=args.verbosity,
@@ -399,6 +431,12 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
                    task_name=task_name, samples=samples[task_name]
                )

+        if (
+            evaluation_tracker.push_results_to_hub
+            or evaluation_tracker.push_samples_to_hub
+        ):
+            evaluation_tracker.recreate_metadata_card()
+
        print(
            f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
            f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"

--- a/lm_eval/api/metrics.py
+++ b/lm_eval/api/metrics.py
@@ -145,9 +145,10 @@ def ter(items):
 @register_aggregation("brier_score")
 def brier_score(items):  # This is a passthrough function
    gold, predictions = list(zip(*items))
+    bs, num_class = np.array(predictions).shape
+
    gold = list(gold)
-    gold_one_hot = np.eye(np.max(gold) + 1)[gold]
-    predictions = list(zip(*items))[1]
+    gold_one_hot = np.eye(num_class)[gold]
    return np.mean(np.sum((predictions - gold_one_hot) ** 2, axis=1))


@@ -463,7 +464,11 @@ def bootstrap_stderr(f, xs, iters):
    return sample_stddev(res)


-def stderr_for_metric(metric, bootstrap_iters):
+def stderr_for_metric(metric, bootstrap_iters: int):
+    if bootstrap_iters <= 0:
+        # return no function (don't compute stderr) if bootstrap iters = 0
+        return None
+
    bootstrappable = [
        median,
        matthews_corrcoef,

--- a/lm_eval/api/model.py
+++ b/lm_eval/api/model.py
@@ -3,7 +3,7 @@ import hashlib
 import json
 import logging
 import os
-from typing import List, Optional, Tuple, Type, TypeVar
+from typing import Dict, List, Optional, Tuple, Type, TypeVar

 import transformers
 from sqlitedict import SqliteDict
@@ -114,6 +114,20 @@ class LM(abc.ABC):
        """
        pass

+    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
+        """
+        Render a chat history into the single string that is fed to the LM.
+
+        This base implementation only raises; a model type has to implement the method to be usable with chat templates.
+
+        :param chat_history: list[dict[str, str]]
+            The messages, each a dictionary with the keys 'role' and 'content'.
+            Both values are strings: the role name and the content of the message.
+        :return: str
+            The chat history formatted so that it can be used as input to the LM.
+        :raises NotImplementedError: always, in this base implementation.
+        """
+        message = "To use this model with chat templates, please implement the 'apply_chat_template' method for your model type."
+        raise NotImplementedError(message)
+
    @classmethod
    def create_from_arg_string(
        cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
@@ -169,6 +183,26 @@ class LM(abc.ABC):
        # not support multi-device parallelism nor expect it.
        return self._world_size

+    @property
+    def tokenizer_name(self) -> str:
+        """Name of the tokenizer or chat template in use.
+
+        LM subclasses which implement Chat Templating must define this property.
+        The value is used only to fingerprint caches when requests are cached with `--cache_requests`.
+
+        :raises NotImplementedError: always, in this base implementation.
+        """
+        message = "To use this model with chat templates, please implement the 'tokenizer_name' property."
+        raise NotImplementedError(message)
+
+    @property
+    def chat_template(self) -> str:
+        """Structure of the chat template applied to user/assistant messages.
+
+        LM subclasses that implement Chat Templating must define this property.
+        The value is only saved in the experiment results for reproducibility.
+
+        :raises NotImplementedError: always, in this base implementation.
+        """
+        message = "To use this model with chat templates, please implement the 'chat_template' property."
+        raise NotImplementedError(message)
+
    def set_cache_hook(self, cache_hook) -> None:
        self.cache_hook = cache_hook


--- a/lm_eval/api/samplers.py
+++ b/lm_eval/api/samplers.py
+import datasets
+
+
 class ContextSampler:
    def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
        self.rnd = rnd
@@ -18,6 +21,10 @@ class ContextSampler:

        self.docs = docs  # HF dataset split, provided by task._fewshot_docs()
        if fewshot_indices:  # subset few-shot docs from
+            if not isinstance(self.docs, datasets.Dataset):
+                raise ValueError(
+                    "Got `fewshot_indices` but fewshot_docs are not a HF dataset. Don't use both `fewshot_indices` and a user-defined few-shot sample list simultaneously"
+                )
            self.docs = self.docs.select(fewshot_indices)

    def get_context(self, doc, num_fewshot):
@@ -35,37 +42,79 @@ class ContextSampler:
        # TODO: should we just stop people from using fewshot from same split as evaluating?
        selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]

-        labeled_examples = (
-            self.fewshot_delimiter.join(
-                [
-                    # TODO: is separating doc_to_text and doc_to_target by one space always desired?
-                    (
-                        self.doc_to_text(doc)
-                        if (
-                            self.config.doc_to_choice is None
-                            or isinstance(self.doc_to_text(doc), str)
-                        )
-                        else self.doc_to_choice(doc)[self.doc_to_text(doc)]
-                    )
-                    + self.target_delimiter
-                    + (
-                        str(self.doc_to_target(doc)[0])
-                        if isinstance(self.doc_to_target(doc), list)
-                        else self.doc_to_target(doc)
-                        if (
-                            self.config.doc_to_choice is None
-                            or isinstance(self.doc_to_target(doc), str)
-                        )
-                        else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])
-                    )
-                    for doc in selected_docs
-                ]
+        labeled_examples = ""
+        for doc in selected_docs:
+            doc_content = self.doc_to_text(doc)
+            doc_target = self.doc_to_target(doc)
+            labeled_examples += (
+                doc_content
+                if self.config.doc_to_choice is None or isinstance(doc_content, str)
+                else self.doc_to_choice(doc)[doc_content]
            )
-            + self.fewshot_delimiter
-        )
+            labeled_examples += self.target_delimiter
+            labeled_examples += (
+                str(doc_target[0])
+                if isinstance(doc_target, list)
+                else doc_target
+                if self.config.doc_to_choice is None or isinstance(doc_target, str)
+                else str(self.doc_to_choice(doc)[doc_target])
+            )
+            labeled_examples += self.fewshot_delimiter

        return labeled_examples

+    def get_chat_context(
+        self,
+        doc,
+        num_fewshot,
+        fewshot_as_multiturn: bool = False,
+    ):
+        """Return the few-shot examples for `doc` as a list of chat messages.
+
+        :param doc: the document being evaluated; it is filtered out of the drawn examples.
+        :param num_fewshot: number of few-shot examples to use.
+        :param fewshot_as_multiturn: bool
+            If True, each example yields a "user" message followed by an "assistant" message.
+            If False, the output of `get_context` is wrapped in a single "user" message.
+        :return: list of dicts with the keys "role" and "content".
+        """
+        # draw an extra fewshot sample if using same split as evaluating on, so that
+        # removing `doc` itself can still leave `num_fewshot` examples
+        shares_eval_split = self.config.fewshot_split == self.config.test_split
+        n_samples = num_fewshot + 1 if shares_eval_split else num_fewshot
+        drawn = self.sample(n_samples)
+
+        # TODO: should we just stop people from using fewshot from same split as evaluating?
+        selected_docs = [x for x in drawn if x != doc][:num_fewshot]
+
+        if not fewshot_as_multiturn:
+            # NOTE(review): `selected_docs` is unused on this path; the single user turn
+            # is built entirely by `get_context` -- confirm the draw above is intended.
+            return [{"role": "user", "content": self.get_context(doc, num_fewshot)}]
+
+        chat_history = []
+        for shot in selected_docs:
+            shot_text = self.doc_to_text(shot)
+            shot_target = self.doc_to_target(shot)
+
+            # a non-string `doc_to_text` result indexes into the choices when choices are configured
+            if self.config.doc_to_choice is None or isinstance(shot_text, str):
+                user_content = shot_text
+            else:
+                user_content = self.doc_to_choice(shot)[shot_text]
+            chat_history.append({"role": "user", "content": user_content})
+
+            # list targets contribute their first element; non-string targets index into the choices
+            if isinstance(shot_target, list):
+                assistant_content = str(shot_target[0])
+            elif self.config.doc_to_choice is None or isinstance(shot_target, str):
+                assistant_content = shot_target
+            else:
+                assistant_content = str(self.doc_to_choice(shot)[shot_target])
+            chat_history.append({"role": "assistant", "content": assistant_content})
+
+        return chat_history
+
    def sample(self, n):
        """
        Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.

--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -68,9 +68,9 @@ class TaskConfig(dict):
    training_split: Optional[str] = None
    validation_split: Optional[str] = None
    test_split: Optional[str] = None
-    fewshot_split: Optional[
-        str
-    ] = None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
+    fewshot_split: Optional[str] = (
+        None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
+    )
    # formatting / prompting options.
    # see docs/advanced_task_guide.md for more info
    process_docs: Optional[Callable] = None
@@ -93,9 +93,9 @@ class TaskConfig(dict):
    filter_list: Optional[Union[str, list]] = None
    should_decontaminate: bool = False
    doc_to_decontamination_query: Optional[str] = None
-    metadata: Optional[
-        dict
-    ] = None  # by default, not used in the code. allows for users to pass arbitrary info to tasks
+    metadata: Optional[dict] = (
+        None  # by default, not used in the code. allows for users to pass arbitrary info to tasks
+    )

    def __post_init__(self) -> None:
        if self.generation_kwargs is not None:
@@ -230,9 +230,9 @@ class Task(abc.ABC):
        self._config: TaskConfig = TaskConfig({**config}) if config else TaskConfig()

        self._filters = [build_filter_ensemble("none", [["take_first", None]])]
-        self.fewshot_rnd: Optional[
-            random.Random
-        ] = None  # purposely induce errors in case of improper usage
+        self.fewshot_rnd: Optional[random.Random] = (
+            None  # purposely induce errors in case of improper usage
+        )

    def download(
        self,
@@ -369,11 +369,16 @@ class Task(abc.ABC):
    def build_all_requests(
        self,
        *,
-        limit=None,
-        rank=None,
-        world_size=None,
-        cache_requests=False,
-        rewrite_requests_cache=False,
+        limit: Union[int, None] = None,
+        rank: int = 0,
+        world_size: int = 1,
+        cache_requests: bool = False,
+        rewrite_requests_cache: bool = False,
+        system_instruction: Optional[str] = None,
+        apply_chat_template: bool = False,
+        fewshot_as_multiturn: bool = False,
+        chat_template: Optional[Callable] = None,
+        tokenizer_name: str = "",
    ) -> None:
        """Build a set of Instances for a task, and store them in task.instances"""

@@ -381,6 +386,14 @@ class Task(abc.ABC):
        og_limit = limit

        cache_key = f"requests-{self._config.task}-{self.config.num_fewshot}shot-rank{rank}-world_size{world_size}"
+        cache_key += "-chat_template" if apply_chat_template else ""
+        cache_key += "-fewshot_as_multiturn" if fewshot_as_multiturn else ""
+        cache_key += (
+            f"-system_prompt_hash{utils.hash_string(system_instruction)}"
+            if system_instruction is not None
+            else ""
+        )
+        cache_key += f"-tokenizer{tokenizer_name}"

        cached_instances = load_from_cache(file_name=cache_key)

@@ -422,6 +435,10 @@ class Task(abc.ABC):
            fewshot_ctx = self.fewshot_context(
                doc,
                0 if self.config.num_fewshot is None else self.config.num_fewshot,
+                system_instruction,
+                apply_chat_template,
+                fewshot_as_multiturn,
+                chat_template,
            )

            # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
@@ -949,6 +966,18 @@ class ConfigurableTask(Task):
            if self.config.process_docs is not None:
                return self.config.process_docs(self.dataset[self.config.fewshot_split])
            return self.dataset[self.config.fewshot_split]
+        elif (
+            self.config.fewshot_config is not None
+            and self.config.fewshot_config.get("samples", None) is not None
+        ):
+            if isinstance(self.config.fewshot_config["samples"], list):
+                return self.config.fewshot_config["samples"]
+            elif callable(self.config.fewshot_config["samples"]):
+                return self.config.fewshot_config["samples"]()
+            else:
+                raise Exception(
+                    "`fewshot_config['samples']` was incorrectly defined in the configuration. It should be either a list of samples as a dict, or function returning this list."
+                )
        else:
            if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0):
                eval_logger.warning(
@@ -958,8 +987,37 @@ class ConfigurableTask(Task):
                )
            return super().fewshot_docs()

+    @staticmethod
+    def append_target_question(
+        labeled_examples: List[Dict[str, str]],
+        question: str,
+        fewshot_as_multiturn: bool = False,
+    ) -> None:
+        """Place the target question at the end of the chat history, modifying the list in place.
+
+        A new user entry is appended when fewshot_as_multiturn is True, when labeled_examples is empty,
+        or when the last entry is a system turn. In every other case the question is concatenated onto
+        the content of the last entry, so that two user messages never follow one another.
+        """
+        opens_new_turn = (
+            fewshot_as_multiturn
+            or len(labeled_examples) == 0
+            or labeled_examples[-1]["role"] == "system"
+        )
+        if opens_new_turn:
+            labeled_examples.append({"role": "user", "content": question})
+            return
+        # last message is the user turn holding the few-shot examples: extend it
+        labeled_examples[-1]["content"] += question
+
    @utils.positional_deprecated
-    def fewshot_context(self, doc: str, num_fewshot: int) -> str:
+    def fewshot_context(
+        self,
+        doc: str,
+        num_fewshot: int,
+        system_instruction: Optional[str] = None,
+        apply_chat_template: bool = False,
+        fewshot_as_multiturn: bool = False,
+        chat_template: Optional[Callable] = None,
+    ) -> str:
        """Returns a fewshot context string that is made up of a prepended description
        (if provided), the `num_fewshot` number of examples, and an appended prompt example.

@@ -967,22 +1025,90 @@ class ConfigurableTask(Task):
            The document as returned from training_docs, validation_docs, or test_docs.
        :param num_fewshot: int
            The number of fewshot examples to provide in the returned context string.
+        :param system_instruction: str
+            System instruction to be applied to the prompt.
+        :param apply_chat_template: bool
+            Whether to apply the chat template to the fewshot context.
+        :param fewshot_as_multiturn: bool
+            Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
+        :param chat_template: Callable
+            Chat template to be applied to the fewshot context.
        :returns: str
            The fewshot context.
        """
+
+        if apply_chat_template:
+            labeled_examples = []
+        else:
+            labeled_examples = ""
+
+        # get task description
        if description := self.config.description:
            description = utils.apply_template(self.config.description, doc)

-        if num_fewshot == 0:
-            # always prepend the (possibly empty) task description
-            labeled_examples = description
+        # create system prompt based on the provided system instruction and description
+        if system_instruction is not None and description:
+            system_prompt = (
+                f"{system_instruction}{self.sampler.fewshot_delimiter}{description}"
+            )
+        elif system_instruction is not None:
+            system_prompt = system_instruction
+        elif description:
+            system_prompt = description
        else:
-            labeled_examples = description + self.sampler.get_context(doc, num_fewshot)
+            system_prompt = ""
+
+        # add system prompt if specified
+        if system_prompt:
+            if apply_chat_template:
+                labeled_examples.append({"role": "system", "content": system_prompt})
+            else:
+                labeled_examples = system_prompt
+
+        # if few-shot - append examples after the system prompt
+        if num_fewshot > 0:
+            if apply_chat_template:
+                labeled_examples.extend(
+                    self.sampler.get_chat_context(
+                        doc, num_fewshot, fewshot_as_multiturn
+                    )
+                )
+            else:
+                labeled_examples += self.sampler.get_context(doc, num_fewshot)

        example = self.doc_to_text(doc)
-        if self.multiple_input:
-            return labeled_examples
+        if apply_chat_template:
+            if self.multiple_input:
+                return chat_template(labeled_examples)
+            if isinstance(example, str):
+                self.append_target_question(
+                    labeled_examples, example, fewshot_as_multiturn
+                )
+            # for loglikelihood create a list of questions with appended choices
+            elif isinstance(example, list):
+                labeled_examples_list = []
+                # copy chat history for each example and append the answer
+                for ex in example:
+                    chat = deepcopy(labeled_examples)
+                    self.append_target_question(chat, ex, fewshot_as_multiturn)
+                    labeled_examples_list.append(chat_template(chat))
+                return labeled_examples_list
+            # if example is an integer, append the choice or convert to string
+            elif isinstance(example, int):
+                if self.config.doc_to_choice is not None:
+                    choices = self.doc_to_choice(doc)
+                    self.append_target_question(
+                        labeled_examples, choices[example], fewshot_as_multiturn
+                    )
+                else:
+                    self.append_target_question(
+                        labeled_examples, str(example), fewshot_as_multiturn
+                    )
+                # return lm.apply_chat_template(labeled_examples)
+            return chat_template(labeled_examples)
        else:
+            if self.multiple_input:
+                return labeled_examples
            if isinstance(example, str):
                return labeled_examples + example
            elif isinstance(example, list):

--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -21,7 +21,8 @@ from lm_eval.evaluator_utils import (
    print_writeout,
    run_task_tests,
 )
-from lm_eval.logging.utils import add_env_info, get_git_commit_hash
+from lm_eval.loggers import EvaluationTracker
+from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash
 from lm_eval.tasks import TaskManager, get_task_dict
 from lm_eval.utils import (
    eval_logger,
@@ -55,6 +56,10 @@ def simple_evaluate(
    check_integrity: bool = False,
    write_out: bool = False,
    log_samples: bool = True,
+    evaluation_tracker: Optional[EvaluationTracker] = None,
+    system_instruction: Optional[str] = None,
+    apply_chat_template: bool = False,
+    fewshot_as_multiturn: bool = False,
    gen_kwargs: Optional[str] = None,
    task_manager: Optional[TaskManager] = None,
    verbosity: str = "INFO",
@@ -92,13 +97,19 @@ def simple_evaluate(
    :param limit: int or float, optional
        Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples.
    :param bootstrap_iters:
-        Number of iterations for bootstrap statistics
+        Number of iterations for bootstrap statistics, used when calculating stderrs. Set to 0 for no stderr calculations to be performed.
    :param check_integrity: bool
        Whether to run the relevant part of the test suite for the tasks
    :param write_out: bool
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
+    :param system_instruction: str
+        System instruction to be applied to the prompt
+    :param apply_chat_template: bool
+        If True, apply chat template to the prompt
+    :param fewshot_as_multiturn: bool
+        Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
    :param gen_kwargs: str
        String arguments for model generation
        Ignored for all tasks with loglikelihood output_type
@@ -241,18 +252,28 @@ def simple_evaluate(
                    f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
                )
                task_obj.set_config(key="num_fewshot", value=num_fewshot)
-            task_obj.set_fewshot_seed(seed=fewshot_random_seed)
-            eval_logger.info(
-                f"Setting fewshot random generator seed to {fewshot_random_seed}"
-            )
        else:
            # if num_fewshot not provided, and the task does not define a default one, default to 0
            if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None:
                task_obj.set_config(key="num_fewshot", value=0)
+        # fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file)
+        task_obj.set_fewshot_seed(seed=fewshot_random_seed)
+        eval_logger.info(
+            f"Setting fewshot random generator seed to {fewshot_random_seed}"
+        )

    if check_integrity:
        run_task_tests(task_list=tasks)

+    if evaluation_tracker is not None:
+        evaluation_tracker.general_config_tracker.log_experiment_args(
+            model_source=model,
+            model_args=model_args,
+            system_instruction=system_instruction,
+            chat_template=lm.chat_template if apply_chat_template else None,
+            fewshot_as_multiturn=fewshot_as_multiturn,
+        )
+
    results = evaluate(
        lm=lm,
        task_dict=task_dict,
@@ -262,6 +283,9 @@ def simple_evaluate(
        bootstrap_iters=bootstrap_iters,
        write_out=write_out,
        log_samples=log_samples,
+        system_instruction=system_instruction,
+        apply_chat_template=apply_chat_template,
+        fewshot_as_multiturn=fewshot_as_multiturn,
        verbosity=verbosity,
    )

@@ -302,6 +326,7 @@ def simple_evaluate(
        results["git_hash"] = get_git_commit_hash()
        results["date"] = start_date
        add_env_info(results)  # additional environment info to results
+        add_tokenizer_info(results, lm)  # additional info about tokenizer
        return results
    else:
        return None
@@ -317,6 +342,9 @@ def evaluate(
    bootstrap_iters: Optional[int] = 100000,
    write_out: bool = False,
    log_samples: bool = True,
+    system_instruction: Optional[str] = None,
+    apply_chat_template: bool = False,
+    fewshot_as_multiturn: bool = False,
    verbosity: str = "INFO",
 ):
    """Instantiate and evaluate a model on a list of tasks.
@@ -328,11 +356,17 @@ def evaluate(
    :param limit: int, optional
        Limit the number of examples per task (only use this for testing)
    :param bootstrap_iters:
-        Number of iterations for bootstrap statistics
+        Number of iterations for bootstrap statistics, used when calculating stderr. Set to 0 for skipping all stderr calculations.
    :param write_out: bool
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
+    :param system_instruction: str
+        System instruction to be applied to the prompt
+    :param apply_chat_template: bool
+        If True, apply chat template to the prompt
+    :param fewshot_as_multiturn: bool
+        Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
    :return
        Dictionary of results
    """
@@ -362,6 +396,15 @@ def evaluate(
            world_size=lm.world_size,
            cache_requests=cache_requests,
            rewrite_requests_cache=rewrite_requests_cache,
+            system_instruction=system_instruction,
+            apply_chat_template=apply_chat_template,
+            fewshot_as_multiturn=fewshot_as_multiturn,
+            chat_template=getattr(lm, "apply_chat_template")
+            if apply_chat_template
+            else None,
+            tokenizer_name=getattr(lm, "tokenizer_name", "")
+            if apply_chat_template
+            else "",
        )
        eval_logger.debug(
            f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
@@ -503,9 +546,14 @@ def evaluate(
        # aggregate results ; run bootstrap CIs
        for task_output in eval_tasks:
            task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters)
-        results, samples, configs, versions, num_fewshot = consolidate_results(
-            eval_tasks
-        )
+        (
+            results,
+            samples,
+            configs,
+            versions,
+            num_fewshot,
+            higher_is_better,
+        ) = consolidate_results(eval_tasks)

        ### Calculate group metrics ###
        if bool(results):
@@ -516,6 +564,27 @@ def evaluate(
                    # or `task_name: []`.
                    # we only want to operate on groups here.
                    continue
+
+                # collect all higher_is_better values for metrics
+                # in the group's subtasks.
+                # TODO: clean this up ; unify with the below metric_list loop?
+                _higher_is_better = {}
+                for task in task_list:
+                    for m, h in higher_is_better[task].items():
+                        if m not in _higher_is_better:
+                            _higher_is_better[m] = h
+                        # check every metric of every subtask; a conflicting metric stays None
+                        if (
+                            _higher_is_better[m] is not None
+                            and _higher_is_better[m] != h
+                        ):
+                            eval_logger.warning(
+                                f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
+                            )
+                            _higher_is_better[m] = None
+                higher_is_better[group] = _higher_is_better
+
+                # collect all metric keys used by a subtask in the group.
                metric_list = list(
                    {
                        key
@@ -545,16 +614,16 @@ def evaluate(
                    ]

                    # compute group's pooled metric and stderr
-                    results[group][
-                        metric
-                    ] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
+                    results[group][metric] = (
+                        lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
+                    )
                    # TODO: calculate grouped metric using aggregation fn
                    if "N/A" in stderrs:
                        results[group][stderr] = "N/A"
                    else:
-                        results[group][
-                            stderr
-                        ] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
+                        results[group][stderr] = (
+                            lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
+                        )
                        # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
                        # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
                        # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
@@ -591,6 +660,7 @@ def evaluate(
            "configs": dict(sorted(configs.items())),
            "versions": dict(sorted(versions.items())),
            "n-shot": dict(sorted(num_fewshot.items())),
+            "higher_is_better": dict(sorted(higher_is_better.items())),
            "n-samples": {
                task_output.task_name: {
                    "original": len(task_output.task.eval_docs),

--- a/lm_eval/evaluator_utils.py
+++ b/lm_eval/evaluator_utils.py
@@ -97,7 +97,7 @@ class TaskOutput:
            metric_key = f"{metric},{filter_key}"
            self.agg_metrics[metric_key] = agg_fn(items)
            self.sample_len = len(items)  # TODO: same sample size for each metric?
-            if bootstrap_iters:
+            if isinstance(bootstrap_iters, int):
                stderr_fn = metrics.stderr_for_metric(
                    metric=agg_fn,
                    bootstrap_iters=min(bootstrap_iters, 100)
@@ -107,6 +107,10 @@ class TaskOutput:
                self.agg_metrics[f"{metric}_stderr,{filter_key}"] = (
                    stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A"
                )
+            else:
+                raise ValueError(
+                    f"Received bootstrap_iters '{bootstrap_iters}' but expected an integer. Set to 0 to turn off stderr calculations."
+                )

    def __repr__(self):
        return (
@@ -219,7 +223,7 @@ def prepare_print_tasks(

 def consolidate_results(
    eval_tasks: List[TaskOutput],
-) -> Tuple[dict, dict, dict, dict, dict]:
+) -> Tuple[dict, dict, dict, dict, dict, dict]:
    """
    @param eval_tasks: list(TaskOutput).
    @return: A tuple containing the consolidated results, samples, configs, versions, and num_fewshot.
@@ -236,6 +240,8 @@ def consolidate_results(
    - configs: A defaultdict with task names as keys and task configurations as values.
    - versions: A defaultdict with task names as keys and task versions as values.
    - num_fewshot: A defaultdict with task names as keys and number of few-shot samples as values.
+    - higher_is_better: A defaultdict with task names as keys and indicators of whether higher values are better
+    for each metric as values.

    The method then returns the consolidated results, samples, configs, versions, and num_fewshot as a tuple.
    """
@@ -249,6 +255,9 @@ def consolidate_results(
    configs = collections.defaultdict(dict)
    # Tracks each task's version.
    versions = collections.defaultdict(dict)
+    # Track `higher_is_better` for each metric
+    higher_is_better = collections.defaultdict(dict)
+
    for task_output in eval_tasks:
        if "task_alias" in (task_config := task_output.task_config):
            results[task_output.task_name]["alias"] = task_config["task_alias"]
@@ -259,16 +268,17 @@ def consolidate_results(
        configs[task_output.task_name] = task_output.task_config
        versions[task_output.task_name] = task_output.version
        samples[task_output.task_name] = task_output.logged_samples
+        higher_is_better[task_output.task_name] = task_output.task.higher_is_better()
        for (metric, filter_key), items in task_output.sample_metrics.items():
            metric_key = f"{metric},{filter_key}"
            results[task_output.task_name][metric_key] = task_output.agg_metrics[
                metric_key
            ]
            results[task_output.task_name]["samples"] = task_output.sample_len
-            results[task_output.task_name][
-                f"{metric}_stderr,{filter_key}"
-            ] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
-    return results, samples, configs, versions, num_fewshot
+            results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = (
+                task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
+            )
+    return results, samples, configs, versions, num_fewshot, higher_is_better


 @positional_deprecated

--- a/lm_eval/filters/decontamination.py
+++ b/lm_eval/filters/decontamination.py
@@ -4,7 +4,6 @@ from lm_eval.api.registry import register_filter

 @register_filter("decontaminate")
 class DecontaminationFilter(Filter):
-
    """
    A filter which evaluates
    """

--- a/lm_eval/logging/__init__.py
+++ b/lm_eval/logging/__init__.py
--- a/lm_eval/logging/evaluation_tracker.py
+++ b/lm_eval/logging/evaluation_tracker.py
 import json
 import re
 import time
+from collections import defaultdict
 from dataclasses import asdict, dataclass
 from datetime import datetime
 from pathlib import Path

-from huggingface_hub import HfApi
+from datasets import load_dataset
+from datasets.utils.metadata import MetadataConfigs
+from huggingface_hub import (
+    DatasetCard,
+    DatasetCardData,
+    HfApi,
+    hf_hub_url,
+)

 from lm_eval.utils import (
    eval_logger,
+    get_file_datetime,
+    get_file_task_name,
+    get_results_filenames,
+    get_sample_results_filenames,
    handle_non_serializable,
    hash_string,
+    sanitize_list,
+    sanitize_model_name,
+    sanitize_task_name,
 )


@@ -31,6 +46,11 @@ class GeneralConfigTracker:
    model_source: str = None
    model_name: str = None
    model_name_sanitized: str = None
+    system_instruction: str = None
+    system_instruction_sha: str = None
+    fewshot_as_multiturn: bool = None
+    chat_template: str = None
+    chat_template_sha: str = None
    start_time: float = None
    end_time: float = None
    total_evaluation_time_seconds: str = None
@@ -59,13 +79,21 @@ class GeneralConfigTracker:
        self,
        model_source: str,
        model_args: str,
+        system_instruction: str,
+        chat_template: str,
+        fewshot_as_multiturn: bool,
    ) -> None:
        """Logs model parameters and job ID."""
        self.model_source = model_source
        self.model_name = GeneralConfigTracker._get_model_name(model_args)
-        self.model_name_sanitized = re.sub(
-            r"[\"<>:/\|\\?\*\[\]]+", "__", self.model_name
+        self.model_name_sanitized = sanitize_model_name(self.model_name)
+        self.system_instruction = system_instruction
+        self.system_instruction_sha = (
+            hash_string(system_instruction) if system_instruction else None
        )
+        self.chat_template = chat_template
+        self.chat_template_sha = hash_string(chat_template) if chat_template else None
+        self.fewshot_as_multiturn = fewshot_as_multiturn

    def log_end_time(self) -> None:
        """Logs the end time of the evaluation and calculates the total evaluation time."""
@@ -88,31 +116,53 @@ class EvaluationTracker:
        push_samples_to_hub: bool = False,
        public_repo: bool = False,
        token: str = "",
+        leaderboard_url: str = "",
+        point_of_contact: str = "",
    ) -> None:
        """
        Creates all the necessary loggers for evaluation tracking.

        Args:
            output_path (str): Path to save the results. If not provided, the results won't be saved.
-            hub_results_org (str): The Hugging Face organisation to push the results to. If not provided, the results won't be pushed.
+            hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token.
            hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
            push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
            push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
            public_repo (bool): Whether to push the results to a public or private repository.
            token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
+            leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card.
+            point_of_contact (str): Contact information on the Hugging Face hub dataset card.
        """
        self.general_config_tracker = GeneralConfigTracker()

        self.output_path = output_path
-        self.hub_results_org = hub_results_org
-        hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results"
-        self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}"
-        self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private"
        self.push_results_to_hub = push_results_to_hub
        self.push_samples_to_hub = push_samples_to_hub
        self.public_repo = public_repo
+        self.leaderboard_url = leaderboard_url
+        self.point_of_contact = point_of_contact
        self.api = HfApi(token=token) if token else None

+        if not self.api and (push_results_to_hub or push_samples_to_hub):
+            raise ValueError(
+                "Hugging Face token is not defined, but 'push_results_to_hub' or 'push_samples_to_hub' is set to True. "
+                "Please provide a valid Hugging Face token by setting the HF_TOKEN environment variable."
+            )
+
+        if (
+            self.api
+            and hub_results_org == ""
+            and (push_results_to_hub or push_samples_to_hub)
+        ):
+            hub_results_org = self.api.whoami()["name"]
+            eval_logger.warning(
+                f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'."
+            )
+
+        hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results"
+        self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}"
+        self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private"
+
    def save_results_aggregated(
        self,
        results: dict,
@@ -160,23 +210,28 @@ class EvaluationTracker:
                file_results_aggregated.open("w", encoding="utf-8").write(dumped)

                if self.api and self.push_results_to_hub:
-                    self.api.create_repo(
-                        repo_id=self.hub_results_repo
+                    repo_id = (
+                        self.hub_results_repo
                        if self.public_repo
-                        else self.hub_results_repo_private,
+                        else self.hub_results_repo_private
+                    )
+                    self.api.create_repo(
+                        repo_id=repo_id,
                        repo_type="dataset",
                        private=not self.public_repo,
                        exist_ok=True,
                    )
                    self.api.upload_folder(
-                        repo_id=self.hub_results_repo
-                        if self.public_repo
-                        else self.hub_results_repo_private,
+                        repo_id=repo_id,
                        folder_path=str(path),
                        path_in_repo=self.general_config_tracker.model_name_sanitized,
                        repo_type="dataset",
                        commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
                    )
+                    eval_logger.info(
+                        "Successfully pushed aggregated results to the Hugging Face Hub. "
+                        f"You can find them at: {repo_id}"
+                    )

            except Exception as e:
                eval_logger.warning("Could not save results aggregated")
@@ -200,44 +255,268 @@ class EvaluationTracker:
        """
        if self.output_path:
            try:
-                eval_logger.info("Saving samples results")
-                samples_dumped = json.dumps(
-                    samples,
-                    indent=2,
-                    default=handle_non_serializable,
-                    ensure_ascii=False,
-                )
+                eval_logger.info(f"Saving per-sample results for: {task_name}")

                path = Path(self.output_path if self.output_path else Path.cwd())
                path = path.joinpath(self.general_config_tracker.model_name_sanitized)
                path.mkdir(parents=True, exist_ok=True)

                file_results_samples = path.joinpath(
-                    f"samples_{task_name}_{self.date_id}.json"
+                    f"samples_{task_name}_{self.date_id}.jsonl"
                )
-                file_results_samples.write_text(samples_dumped, encoding="utf-8")
+
+                for sample in samples:
+                    # we first need to sanitize arguments and resps
+                    # otherwise we won't be able to load the dataset
+                    # using the datasets library
+                    arguments = {}
+                    for i, arg in enumerate(sample["arguments"]):
+                        arguments[f"gen_args_{i}"] = {}
+                        for j, tmp in enumerate(arg):
+                            arguments[f"gen_args_{i}"][f"arg_{j}"] = tmp
+
+                    sample["resps"] = sanitize_list(sample["resps"])
+                    sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
+                    sample["arguments"] = arguments
+
+                    # one JSON document per line (JSON Lines)
+                    sample_dump = (
+                        json.dumps(
+                            sample,
+                            default=handle_non_serializable,
+                            ensure_ascii=False,
+                        )
+                        + "\n"
+                    )
+
+                    # ensure_ascii=False leaves non-ASCII characters unescaped, so the
+                    # encoding must be pinned instead of relying on the locale default.
+                    with open(file_results_samples, "a", encoding="utf-8") as f:
+                        f.write(sample_dump)

                if self.api and self.push_samples_to_hub:
-                    self.api.create_repo(
+                    repo_id = (
                        self.hub_results_repo
                        if self.public_repo
-                        else self.hub_results_repo_private,
+                        else self.hub_results_repo_private
+                    )
+                    self.api.create_repo(
+                        repo_id=repo_id,
                        repo_type="dataset",
                        private=not self.public_repo,
                        exist_ok=True,
                    )
                    self.api.upload_folder(
-                        repo_id=self.hub_results_repo
-                        if self.public_repo
-                        else self.hub_results_repo_private,
+                        repo_id=repo_id,
                        folder_path=str(path),
                        path_in_repo=self.general_config_tracker.model_name_sanitized,
                        repo_type="dataset",
                        commit_message=f"Adding samples results for {task_name} to {self.general_config_tracker.model_name}",
                    )
+                    eval_logger.info(
+                        f"Successfully pushed sample results for task: {task_name} to the Hugging Face Hub. "
+                        f"You can find them at: {repo_id}"
+                    )

            except Exception as e:
                eval_logger.warning("Could not save sample results")
                eval_logger.info(repr(e))
        else:
            eval_logger.info("Output path not provided, skipping saving sample results")
+
+    def recreate_metadata_card(self) -> None:
+        """
+        Creates a metadata card for the evaluation results dataset and pushes it to the Hugging Face hub.
+
+        The card is rebuilt from the files currently listed in the results repository.
+        Each aggregated-results file is added as a split of the `<model>__results` config and
+        each per-sample file as a split of the `<model>__<task>` config, the split being named
+        after the sanitized datetime taken from the file name. A file whose datetime equals the
+        latest datetime recorded for its config is additionally listed under the "latest" split.
+        """
+
+        eval_logger.info("Recreating metadata card")
+        repo_id = (
+            self.hub_results_repo if self.public_repo else self.hub_results_repo_private
+        )
+
+        files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
+        results_files = get_results_filenames(files_in_repo)
+        sample_files = get_sample_results_filenames(files_in_repo)
+
+        # Build a dictionary to store the latest evaluation datetime for:
+        # - Each tested model and its aggregated results
+        # - Each task and sample results, if existing
+        # i.e. {
+        #     "org__model_name__gsm8k": "2021-09-01T12:00:00",
+        #     "org__model_name__ifeval": "2021-09-01T12:00:00",
+        #     "org__model_name__results": "2021-09-01T12:00:00"
+        # }
+        latest_task_results_datetime = defaultdict(lambda: datetime.min.isoformat())
+
+        for file_path in sample_files:
+            file_path = Path(file_path)
+            filename = file_path.name
+            model_name = file_path.parent
+            task_name = get_file_task_name(filename)
+            results_datetime = get_file_datetime(filename)
+            task_name_sanitized = sanitize_task_name(task_name)
+            # Results and sample results for the same model and task will have the same datetime
+            samples_key = f"{model_name}__{task_name_sanitized}"
+            results_key = f"{model_name}__results"
+            latest_datetime = max(
+                latest_task_results_datetime[samples_key],
+                results_datetime,
+            )
+            latest_task_results_datetime[samples_key] = latest_datetime
+            # Keep the maximum over all tasks of the model: a plain assignment would let
+            # whichever sample file is processed last decide the model's latest datetime.
+            latest_task_results_datetime[results_key] = max(
+                latest_task_results_datetime[results_key],
+                latest_datetime,
+            )
+
+        # Create metadata card
+        card_metadata = MetadataConfigs()
+
+        # Add the latest aggregated results to the metadata card for easy access
+        for file_path in results_files:
+            file_path = Path(file_path)
+            results_filename = file_path.name
+            model_name = file_path.parent
+            eval_date = get_file_datetime(results_filename)
+            eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date)
+            results_filename = Path("**") / Path(results_filename).name
+            config_name = f"{model_name}__results"
+            sanitized_last_eval_date_results = re.sub(
+                r"[^\w\.]", "_", latest_task_results_datetime[config_name]
+            )
+            # Ensure that all results files are listed in the metadata card
+            current_results = card_metadata.get(config_name, {"data_files": []})
+            current_results["data_files"].append(
+                {"split": eval_date_sanitized, "path": [str(results_filename)]}
+            )
+            card_metadata[config_name] = current_results
+            # If the results file is the newest, update the "latest" field in the metadata card
+            if eval_date_sanitized == sanitized_last_eval_date_results:
+                card_metadata[config_name]["data_files"].append(
+                    {"split": "latest", "path": [str(results_filename)]}
+                )
+
+        # Task names that get one extra, combined config (originally written for MMLU):
+        # every sample file whose config name contains one of these names is also listed
+        # under `<model>__<name>`, so that all of its files can be inspected together.
+        SPECIAL_TASKS = ["mmlu", "gpqa", "minerva_math"]
+
+        # Add the tasks details configs
+        for file_path in sample_files:
+            file_path = Path(file_path)
+            filename = file_path.name
+            model_name = file_path.parent
+            task_name = get_file_task_name(filename)
+            eval_date = get_file_datetime(filename)
+            task_name_sanitized = sanitize_task_name(task_name)
+            eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date)
+            results_filename = Path("**") / Path(filename).name
+            config_name = f"{model_name}__{task_name_sanitized}"
+            sanitized_last_eval_date_results = re.sub(
+                r"[^\w\.]", "_", latest_task_results_datetime[config_name]
+            )
+            # Ensure that all sample results files are listed in the metadata card
+            current_details_for_task = card_metadata.get(
+                config_name, {"data_files": []}
+            )
+            current_details_for_task["data_files"].append(
+                {"split": eval_date_sanitized, "path": [str(results_filename)]}
+            )
+            card_metadata[config_name] = current_details_for_task
+            # If the samples results file is the newest, update the "latest" field in the metadata card
+            if eval_date_sanitized == sanitized_last_eval_date_results:
+                card_metadata[config_name]["data_files"].append(
+                    {"split": "latest", "path": [str(results_filename)]}
+                )
+
+            # Also register the file in the combined config of each matching special task
+            for special_task in SPECIAL_TASKS:
+                if special_task in config_name:
+                    special_task = f"{model_name}__{special_task}"
+                    former_entry = card_metadata.get(special_task, {"data_files": []})
+
+                    # Look for an existing split of the same run to extend its path list
+                    former_split = [
+                        (i, entry)
+                        for i, entry in enumerate(former_entry["data_files"])
+                        if entry.get("split", None) == eval_date_sanitized
+                    ]
+
+                    if len(former_split) == 0:
+                        former_entry["data_files"].append(
+                            {
+                                "split": eval_date_sanitized,
+                                "path": [str(results_filename)],
+                            }
+                        )
+                    else:
+                        split_index, _ = former_split[0]
+                        former_entry["data_files"][split_index]["path"].append(
+                            str(results_filename)
+                        )
+
+                    if eval_date_sanitized == sanitized_last_eval_date_results:
+                        latest_split = [
+                            (i, entry)
+                            for i, entry in enumerate(former_entry["data_files"])
+                            if entry.get("split", None) == "latest"
+                        ]
+                        if len(latest_split) == 0:
+                            former_entry["data_files"].append(
+                                {"split": "latest", "path": [str(results_filename)]}
+                            )
+                        else:
+                            latest_index, _ = latest_split[0]
+                            former_entry["data_files"][latest_index]["path"].append(
+                                str(results_filename)
+                            )
+
+                    card_metadata[special_task] = former_entry
+
+        # Get latest results and extract info to update metadata card examples
+        # NOTE(review): max() and the [0] lookup below raise if no datetime was recorded or
+        # no results file name contains the latest datetime - confirm callers guarantee both.
+        latest_datetime = max(latest_task_results_datetime.values())
+        latest_model_name = max(
+            latest_task_results_datetime, key=lambda k: latest_task_results_datetime[k]
+        )
+        last_results_file = [
+            f for f in results_files if latest_datetime.replace(":", "-") in f
+        ][0]
+        last_results_file_path = hf_hub_url(
+            repo_id=repo_id, filename=last_results_file, repo_type="dataset"
+        )
+        latest_results_file = load_dataset(
+            "json", data_files=last_results_file_path, split="train"
+        )
+        results_dict = latest_results_file["results"][0]
+        new_dictionary = {"all": results_dict}
+        new_dictionary.update(results_dict)
+        results_string = json.dumps(new_dictionary, indent=4)
+
+        dataset_summary = (
+            "Dataset automatically created during the evaluation run of model "
+        )
+        if self.general_config_tracker.model_source == "hf":
+            dataset_summary += f"[{self.general_config_tracker.model_name}](https://huggingface.co/{self.general_config_tracker.model_name})\n"
+        else:
+            dataset_summary += f"{self.general_config_tracker.model_name}\n"
+        # The configs built above only define timestamp splits and a "latest" split,
+        # so the card text refers to "latest" (the same split the usage example loads).
+        dataset_summary += (
+            f"The dataset is composed of {len(card_metadata)-1} configuration(s), each one corresponding to one of the evaluated task.\n\n"
+            f"The dataset has been created from {len(results_files)} run(s). Each run can be found as a specific split in each "
+            'configuration, the split being named using the timestamp of the run. The "latest" split is always pointing to the latest results.\n\n'
+            'An additional configuration "results" store all the aggregated results of the run.\n\n'
+            "To load the details from a run, you can for instance do the following:\n"
+        )
+        if self.general_config_tracker.model_source == "hf":
+            dataset_summary += (
+                "```python\nfrom datasets import load_dataset\n"
+                f'data = load_dataset(\n\t"{repo_id}",\n\tname="{latest_model_name}",\n\tsplit="latest"\n)\n```\n\n'
+            )
+        dataset_summary += (
+            "## Latest results\n\n"
+            f'These are the [latest results from run {latest_datetime}]({last_results_file_path.replace("/resolve/", "/blob/")}) '
+            "(note that there might be results for other tasks in the repos if successive evals didn't cover the same tasks. "
+            'You find each in the results and the "latest" split for each eval):\n\n'
+            f"```python\n{results_string}\n```"
+        )
+        card_data = DatasetCardData(
+            dataset_summary=dataset_summary,
+            repo_url=f"https://huggingface.co/{self.general_config_tracker.model_name}",
+            pretty_name=f"Evaluation run of {self.general_config_tracker.model_name}",
+            leaderboard_url=self.leaderboard_url,
+            point_of_contact=self.point_of_contact,
+        )
+        card_metadata.to_dataset_card_data(card_data)
+        card = DatasetCard.from_template(
+            card_data,
+            pretty_name=card_data.pretty_name,
+        )
+        card.push_to_hub(repo_id, repo_type="dataset")
--- a/lm_eval/logging/utils.py
+++ b/lm_eval/logging/utils.py
@@ -110,3 +110,20 @@ def add_env_info(storage: Dict[str, Any]):
        "upper_git_hash": upper_dir_commit,  # in case this repo is submodule
    }
    storage.update(added_info)
+
+
+def add_tokenizer_info(storage: Dict[str, Any], lm):
+    """
+    Add tokenizer metadata of `lm` to `storage` (updated in place).
+
+    For each of the pad/eos/bos special tokens a `[token, token_id]` pair is stored under
+    `tokenizer_<name>_token`, together with `lm.eot_token_id` and `lm.max_length`.
+    Any attribute that the tokenizer or the LM does not define is recorded as None.
+    Nothing is stored when `lm` has no (truthy) `tokenizer` attribute.
+    """
+    tokenizer = getattr(lm, "tokenizer", False)
+    if tokenizer:
+        # Read the attributes with getattr: a tokenizer object lacking one of them
+        # should not abort the logging of the results with an AttributeError.
+        tokenizer_info = {
+            f"tokenizer_{name}": [
+                getattr(tokenizer, name, None),
+                getattr(tokenizer, f"{name}_id", None),
+            ]
+            for name in ("pad_token", "eos_token", "bos_token")
+        }
+        tokenizer_info["eot_token_id"] = getattr(lm, "eot_token_id", None)
+        tokenizer_info["max_length"] = getattr(lm, "max_length", None)
+        storage.update(tokenizer_info)
+    # seems gguf and textsynth do not have tokenizer
+    else:
+        logger.debug(
+            "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."
+        )
--- a/lm_eval/logging/wandb_logger.py
+++ b/lm_eval/logging/wandb_logger.py
@@ -7,7 +7,7 @@ import numpy as np
 import pandas as pd
 from packaging.version import Version

-from lm_eval.logging.utils import _handle_non_serializable, remove_none_pattern
+from lm_eval.loggers.utils import _handle_non_serializable, remove_none_pattern


 logger = logging.getLogger(__name__)