Merge branch 'EleutherAI:main' into main

a2af2101 · Yen-Ting Lin · GitHub · 82cb25c1 · d5f39bf8 · a2af2101
Unverified Commit a2af2101 authored Jul 12, 2024 by Yen-Ting Lin Committed by GitHub Jul 12, 2024
20 changed files
--- a/lm_eval/filters/extraction.py
+++ b/lm_eval/filters/extraction.py
@@ -62,11 +62,8 @@ class WhitespaceFilter(Filter):
        def filter_set(inst):
            filtered_resp = []
            for resp in inst:
-                if resp.startswith(" "):
-                    resp = resp[1:]
-
+                resp = resp.lstrip()
                filtered_resp.append(resp)
-
            return filtered_resp

        filtered_resps = [filter_set(resp) for resp in resps]

--- a/lm_eval/logging/__init__.py
+++ b/lm_eval/logging/__init__.py
--- a/lm_eval/logging/evaluation_tracker.py
+++ b/lm_eval/logging/evaluation_tracker.py
 import json
+import os
 import re
 import time
+from collections import defaultdict
 from dataclasses import asdict, dataclass
 from datetime import datetime
 from pathlib import Path

-from huggingface_hub import HfApi
+from datasets import load_dataset
+from datasets.utils.metadata import MetadataConfigs
+from huggingface_hub import (
+    DatasetCard,
+    DatasetCardData,
+    HfApi,
+    hf_hub_url,
+)
+from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status

 from lm_eval.utils import (
    eval_logger,
+    get_file_datetime,
+    get_file_task_name,
+    get_results_filenames,
+    get_sample_results_filenames,
    handle_non_serializable,
    hash_string,
+    sanitize_list,
+    sanitize_model_name,
+    sanitize_task_name,
 )


@@ -31,6 +48,11 @@ class GeneralConfigTracker:
    model_source: str = None
    model_name: str = None
    model_name_sanitized: str = None
+    system_instruction: str = None
+    system_instruction_sha: str = None
+    fewshot_as_multiturn: bool = None
+    chat_template: str = None
+    chat_template_sha: str = None
    start_time: float = None
    end_time: float = None
    total_evaluation_time_seconds: str = None
@@ -59,13 +81,21 @@ class GeneralConfigTracker:
        self,
        model_source: str,
        model_args: str,
+        system_instruction: str,
+        chat_template: str,
+        fewshot_as_multiturn: bool,
    ) -> None:
        """Logs model parameters and job ID."""
        self.model_source = model_source
        self.model_name = GeneralConfigTracker._get_model_name(model_args)
-        self.model_name_sanitized = re.sub(
-            r"[\"<>:/\|\\?\*\[\]]+", "__", self.model_name
+        self.model_name_sanitized = sanitize_model_name(self.model_name)
+        self.system_instruction = system_instruction
+        self.system_instruction_sha = (
+            hash_string(system_instruction) if system_instruction else None
        )
+        self.chat_template = chat_template
+        self.chat_template_sha = hash_string(chat_template) if chat_template else None
+        self.fewshot_as_multiturn = fewshot_as_multiturn

    def log_end_time(self) -> None:
        """Logs the end time of the evaluation and calculates the total evaluation time."""
@@ -81,37 +111,81 @@ class EvaluationTracker:

    def __init__(
        self,
-        output_path: str = "",
+        output_path: str = None,
        hub_results_org: str = "",
        hub_repo_name: str = "",
+        details_repo_name: str = "",
+        results_repo_name: str = "",
        push_results_to_hub: bool = False,
        push_samples_to_hub: bool = False,
        public_repo: bool = False,
        token: str = "",
+        leaderboard_url: str = "",
+        point_of_contact: str = "",
+        gated: bool = False,
    ) -> None:
        """
        Creates all the necessary loggers for evaluation tracking.

        Args:
            output_path (str): Path to save the results. If not provided, the results won't be saved.
-            hub_results_org (str): The Hugging Face organisation to push the results to. If not provided, the results won't be pushed.
+            hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token.
            hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
+            details_repo_name (str): The name of the Hugging Face repository to push the details to. If not provided, the results will be pushed to `lm-eval-results`.
+            result_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will not be pushed and will be found in the details_hub_repo.
            push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
            push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
            public_repo (bool): Whether to push the results to a public or private repository.
            token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
+            leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card.
+            point_of_contact (str): Contact information on the Hugging Face hub dataset card.
+            gated (bool): Whether to gate the repository.
        """
        self.general_config_tracker = GeneralConfigTracker()

        self.output_path = output_path
-        self.hub_results_org = hub_results_org
-        hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results"
-        self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}"
-        self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private"
        self.push_results_to_hub = push_results_to_hub
        self.push_samples_to_hub = push_samples_to_hub
        self.public_repo = public_repo
+        self.leaderboard_url = leaderboard_url
+        self.point_of_contact = point_of_contact
        self.api = HfApi(token=token) if token else None
+        self.gated_repo = gated
+
+        if not self.api and (push_results_to_hub or push_samples_to_hub):
+            raise ValueError(
+                "Hugging Face token is not defined, but 'push_results_to_hub' or 'push_samples_to_hub' is set to True. "
+                "Please provide a valid Hugging Face token by setting the HF_TOKEN environment variable."
+            )
+
+        if (
+            self.api
+            and hub_results_org == ""
+            and (push_results_to_hub or push_samples_to_hub)
+        ):
+            hub_results_org = self.api.whoami()["name"]
+            eval_logger.warning(
+                f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'."
+            )
+
+        if hub_repo_name == "":
+            details_repo_name = (
+                details_repo_name if details_repo_name != "" else "lm-eval-results"
+            )
+            results_repo_name = (
+                results_repo_name if results_repo_name != "" else details_repo_name
+            )
+        else:
+            details_repo_name = hub_repo_name
+            results_repo_name = hub_repo_name
+            eval_logger.warning(
+                "hub_repo_name was specified. Both details and results will be pushed to the same repository. Using hub_repo_name is no longer recommended, details_repo_name and results_repo_name should be used instead."
+            )
+
+        self.details_repo = f"{hub_results_org}/{details_repo_name}"
+        self.details_repo_private = f"{hub_results_org}/{details_repo_name}-private"
+        self.results_repo = f"{hub_results_org}/{results_repo_name}"
+        self.results_repo_private = f"{hub_results_org}/{results_repo_name}-private"

    def save_results_aggregated(
        self,
@@ -160,23 +234,33 @@ class EvaluationTracker:
                file_results_aggregated.open("w", encoding="utf-8").write(dumped)

                if self.api and self.push_results_to_hub:
-                    self.api.create_repo(
-                        repo_id=self.hub_results_repo
+                    repo_id = (
+                        self.results_repo
                        if self.public_repo
-                        else self.hub_results_repo_private,
+                        else self.results_repo_private
+                    )
+                    self.api.create_repo(
+                        repo_id=repo_id,
                        repo_type="dataset",
                        private=not self.public_repo,
                        exist_ok=True,
                    )
-                    self.api.upload_folder(
-                        repo_id=self.hub_results_repo
-                        if self.public_repo
-                        else self.hub_results_repo_private,
-                        folder_path=str(path),
-                        path_in_repo=self.general_config_tracker.model_name_sanitized,
+                    self.api.upload_file(
+                        repo_id=repo_id,
+                        path_or_fileobj=str(
+                            path.joinpath(f"results_{self.date_id}.json")
+                        ),
+                        path_in_repo=os.path.join(
+                            self.general_config_tracker.model_name,
+                            f"results_{self.date_id}.json",
+                        ),
                        repo_type="dataset",
                        commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
                    )
+                    eval_logger.info(
+                        "Successfully pushed aggregated results to the Hugging Face Hub. "
+                        f"You can find them at: {repo_id}"
+                    )

            except Exception as e:
                eval_logger.warning("Could not save results aggregated")
@@ -200,44 +284,238 @@ class EvaluationTracker:
        """
        if self.output_path:
            try:
-                eval_logger.info("Saving samples results")
-                samples_dumped = json.dumps(
-                    samples,
-                    indent=2,
-                    default=handle_non_serializable,
-                    ensure_ascii=False,
-                )
+                eval_logger.info(f"Saving per-sample results for: {task_name}")

                path = Path(self.output_path if self.output_path else Path.cwd())
                path = path.joinpath(self.general_config_tracker.model_name_sanitized)
                path.mkdir(parents=True, exist_ok=True)

                file_results_samples = path.joinpath(
-                    f"samples_{task_name}_{self.date_id}.json"
+                    f"samples_{task_name}_{self.date_id}.jsonl"
                )
-                file_results_samples.write_text(samples_dumped, encoding="utf-8")
+
+                for sample in samples:
+                    # we first need to sanitize arguments and resps
+                    # otherwise we won't be able to load the dataset
+                    # using the datasets library
+                    arguments = {}
+                    for i, arg in enumerate(sample["arguments"]):
+                        arguments[f"gen_args_{i}"] = {}
+                        for j, tmp in enumerate(arg):
+                            arguments[f"gen_args_{i}"][f"arg_{j}"] = tmp
+
+                    sample["resps"] = sanitize_list(sample["resps"])
+                    sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
+                    sample["arguments"] = arguments
+                    sample["target"] = str(sample["target"])
+
+                    sample_dump = (
+                        json.dumps(
+                            sample,
+                            default=handle_non_serializable,
+                            ensure_ascii=False,
+                        )
+                        + "\n"
+                    )
+
+                    with open(file_results_samples, "a", encoding="utf-8") as f:
+                        f.write(sample_dump)

                if self.api and self.push_samples_to_hub:
-                    self.api.create_repo(
-                        self.hub_results_repo
+                    repo_id = (
+                        self.details_repo
                        if self.public_repo
-                        else self.hub_results_repo_private,
+                        else self.details_repo_private
+                    )
+                    self.api.create_repo(
+                        repo_id=repo_id,
                        repo_type="dataset",
                        private=not self.public_repo,
                        exist_ok=True,
                    )
+                    try:
+                        if self.gated_repo:
+                            headers = build_hf_headers()
+                            r = get_session().put(
+                                url=f"https://huggingface.co/api/datasets/{repo_id}/settings",
+                                headers=headers,
+                                json={"gated": "auto"},
+                            )
+                            hf_raise_for_status(r)
+                    except Exception as e:
+                        eval_logger.warning("Could not gate the repository")
+                        eval_logger.info(repr(e))
                    self.api.upload_folder(
-                        repo_id=self.hub_results_repo
-                        if self.public_repo
-                        else self.hub_results_repo_private,
+                        repo_id=repo_id,
                        folder_path=str(path),
                        path_in_repo=self.general_config_tracker.model_name_sanitized,
                        repo_type="dataset",
                        commit_message=f"Adding samples results for {task_name} to {self.general_config_tracker.model_name}",
                    )
+                    eval_logger.info(
+                        f"Successfully pushed sample results for task: {task_name} to the Hugging Face Hub. "
+                        f"You can find them at: {repo_id}"
+                    )

            except Exception as e:
                eval_logger.warning("Could not save sample results")
                eval_logger.info(repr(e))
        else:
            eval_logger.info("Output path not provided, skipping saving sample results")
+
+    def recreate_metadata_card(self) -> None:
+        """
+        Creates a metadata card for the evaluation results dataset and pushes it to the Hugging Face hub.
+        """
+
+        eval_logger.info("Recreating metadata card")
+        repo_id = self.details_repo if self.public_repo else self.details_repo_private
+
+        files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
+        results_files = get_results_filenames(files_in_repo)
+        sample_files = get_sample_results_filenames(files_in_repo)
+
+        # Build a dictionary to store the latest evaluation datetime for:
+        # - Each tested model and its aggregated results
+        # - Each task and sample results, if existing
+        # i.e. {
+        #     "org__model_name__gsm8k": "2021-09-01T12:00:00",
+        #     "org__model_name__ifeval": "2021-09-01T12:00:00",
+        #     "org__model_name__results": "2021-09-01T12:00:00"
+        # }
+        latest_task_results_datetime = defaultdict(lambda: datetime.min.isoformat())
+
+        for file_path in sample_files:
+            file_path = Path(file_path)
+            filename = file_path.name
+            model_name = file_path.parent
+            task_name = get_file_task_name(filename)
+            results_datetime = get_file_datetime(filename)
+            task_name_sanitized = sanitize_task_name(task_name)
+            # Results and sample results for the same model and task will have the same datetime
+            samples_key = f"{model_name}__{task_name_sanitized}"
+            results_key = f"{model_name}__results"
+            latest_datetime = max(
+                latest_task_results_datetime[samples_key],
+                results_datetime,
+            )
+            latest_task_results_datetime[samples_key] = latest_datetime
+            latest_task_results_datetime[results_key] = max(
+                latest_task_results_datetime[results_key],
+                latest_datetime,
+            )
+
+        # Create metadata card
+        card_metadata = MetadataConfigs()
+
+        # Add the latest aggregated results to the metadata card for easy access
+        for file_path in results_files:
+            file_path = Path(file_path)
+            results_filename = file_path.name
+            model_name = file_path.parent
+            eval_date = get_file_datetime(results_filename)
+            eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date)
+            results_filename = Path("**") / Path(results_filename).name
+            config_name = f"{model_name}__results"
+            sanitized_last_eval_date_results = re.sub(
+                r"[^\w\.]", "_", latest_task_results_datetime[config_name]
+            )
+
+            if eval_date_sanitized == sanitized_last_eval_date_results:
+                # Ensure that all results files are listed in the metadata card
+                current_results = card_metadata.get(config_name, {"data_files": []})
+                current_results["data_files"].append(
+                    {"split": eval_date_sanitized, "path": [str(results_filename)]}
+                )
+                card_metadata[config_name] = current_results
+                # If the results file is the newest, update the "latest" field in the metadata card
+                card_metadata[config_name]["data_files"].append(
+                    {"split": "latest", "path": [str(results_filename)]}
+                )
+
+        # Add the tasks details configs
+        for file_path in sample_files:
+            file_path = Path(file_path)
+            filename = file_path.name
+            model_name = file_path.parent
+            task_name = get_file_task_name(filename)
+            eval_date = get_file_datetime(filename)
+            task_name_sanitized = sanitize_task_name(task_name)
+            eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date)
+            results_filename = Path("**") / Path(filename).name
+            config_name = f"{model_name}__{task_name_sanitized}"
+            sanitized_last_eval_date_results = re.sub(
+                r"[^\w\.]", "_", latest_task_results_datetime[config_name]
+            )
+            if eval_date_sanitized == sanitized_last_eval_date_results:
+                # Ensure that all sample results files are listed in the metadata card
+                current_details_for_task = card_metadata.get(
+                    config_name, {"data_files": []}
+                )
+                current_details_for_task["data_files"].append(
+                    {"split": eval_date_sanitized, "path": [str(results_filename)]}
+                )
+                card_metadata[config_name] = current_details_for_task
+                # If the samples results file is the newest, update the "latest" field in the metadata card
+                card_metadata[config_name]["data_files"].append(
+                    {"split": "latest", "path": [str(results_filename)]}
+                )
+
+        # Get latest results and extract info to update metadata card examples
+        latest_datetime = max(latest_task_results_datetime.values())
+        latest_model_name = max(
+            latest_task_results_datetime, key=lambda k: latest_task_results_datetime[k]
+        )
+        last_results_file = [
+            f for f in results_files if latest_datetime.replace(":", "-") in f
+        ][0]
+        last_results_file_path = hf_hub_url(
+            repo_id=repo_id, filename=last_results_file, repo_type="dataset"
+        )
+        latest_results_file = load_dataset(
+            "json", data_files=last_results_file_path, split="train"
+        )
+        results_dict = latest_results_file["results"][0]
+        new_dictionary = {"all": results_dict}
+        new_dictionary.update(results_dict)
+        results_string = json.dumps(new_dictionary, indent=4)
+
+        dataset_summary = (
+            "Dataset automatically created during the evaluation run of model "
+        )
+        if self.general_config_tracker.model_source == "hf":
+            dataset_summary += f"[{self.general_config_tracker.model_name}](https://huggingface.co/{self.general_config_tracker.model_name})\n"
+        else:
+            dataset_summary += f"{self.general_config_tracker.model_name}\n"
+        dataset_summary += (
+            f"The dataset is composed of {len(card_metadata)-1} configuration(s), each one corresponding to one of the evaluated task.\n\n"
+            f"The dataset has been created from {len(results_files)} run(s). Each run can be found as a specific split in each "
+            'configuration, the split being named using the timestamp of the run.The "train" split is always pointing to the latest results.\n\n'
+            'An additional configuration "results" store all the aggregated results of the run.\n\n'
+            "To load the details from a run, you can for instance do the following:\n"
+        )
+        if self.general_config_tracker.model_source == "hf":
+            dataset_summary += (
+                "```python\nfrom datasets import load_dataset\n"
+                f'data = load_dataset(\n\t"{repo_id}",\n\tname="{latest_model_name}",\n\tsplit="latest"\n)\n```\n\n'
+            )
+        dataset_summary += (
+            "## Latest results\n\n"
+            f'These are the [latest results from run {latest_datetime}]({last_results_file_path.replace("/resolve/", "/blob/")}) '
+            "(note that there might be results for other tasks in the repos if successive evals didn't cover the same tasks. "
+            'You find each in the results and the "latest" split for each eval):\n\n'
+            f"```python\n{results_string}\n```"
+        )
+        card_data = DatasetCardData(
+            dataset_summary=dataset_summary,
+            repo_url=f"https://huggingface.co/{self.general_config_tracker.model_name}",
+            pretty_name=f"Evaluation run of {self.general_config_tracker.model_name}",
+            leaderboard_url=self.leaderboard_url,
+            point_of_contact=self.point_of_contact,
+        )
+        card_metadata.to_dataset_card_data(card_data)
+        card = DatasetCard.from_template(
+            card_data,
+            pretty_name=card_data.pretty_name,
+        )
+        card.push_to_hub(repo_id, repo_type="dataset")
--- a/lm_eval/logging/utils.py
+++ b/lm_eval/logging/utils.py
@@ -110,3 +110,34 @@ def add_env_info(storage: Dict[str, Any]):
        "upper_git_hash": upper_dir_commit,  # in case this repo is submodule
    }
    storage.update(added_info)
+
+
+def add_tokenizer_info(storage: Dict[str, Any], lm):
+    if getattr(lm, "tokenizer", False):
+        try:
+            tokenizer_info = {
+                "tokenizer_pad_token": [
+                    lm.tokenizer.pad_token,
+                    str(lm.tokenizer.pad_token_id),
+                ],
+                "tokenizer_eos_token": [
+                    lm.tokenizer.eos_token,
+                    str(lm.tokenizer.eos_token_id),
+                ],
+                "tokenizer_bos_token": [
+                    lm.tokenizer.bos_token,
+                    str(lm.tokenizer.bos_token_id),
+                ],
+                "eot_token_id": getattr(lm, "eot_token_id", None),
+                "max_length": getattr(lm, "max_length", None),
+            }
+            storage.update(tokenizer_info)
+        except Exception as err:
+            logger.debug(
+                f"Logging detailed tokenizer info failed with {err}, skipping..."
+            )
+        # seems gguf and textsynth do not have tokenizer
+    else:
+        logger.debug(
+            "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."
+        )
--- a/lm_eval/logging/wandb_logger.py
+++ b/lm_eval/logging/wandb_logger.py
@@ -7,7 +7,7 @@ import numpy as np
 import pandas as pd
 from packaging.version import Version

-from lm_eval.logging.utils import _handle_non_serializable, remove_none_pattern
+from lm_eval.loggers.utils import _handle_non_serializable, remove_none_pattern


 logger = logging.getLogger(__name__)

--- a/lm_eval/models/anthropic_llms.py
+++ b/lm_eval/models/anthropic_llms.py
@@ -307,7 +307,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
        # defaults to os.environ.get("ANTHROPIC_API_KEY")
        self.client = anthropic.Anthropic()
        self.temperature = temperature
-        self.max_token = max_tokens
+        self.max_tokens = max_tokens
        self.tokenizer = self.client.get_tokenizer()
        self.kwargs = kwargs


--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -2,7 +2,7 @@ import copy
 import os
 from datetime import timedelta
 from pathlib import Path
-from typing import List, Literal, Optional, Tuple, Union
+from typing import Dict, List, Literal, Optional, Tuple, Union

 import torch
 import torch.nn.functional as F
@@ -30,6 +30,7 @@ from lm_eval.api.registry import register_model
 from lm_eval.models.utils import (
    Collator,
    clear_torch_cache,
+    configure_pad_token,
    get_dtype,
    pad_and_concat,
    stop_sequences_criteria,
@@ -44,13 +45,13 @@ def _get_accelerate_args(
    max_memory_per_gpu: Optional[Union[int, str]] = None,
    max_cpu_memory: Optional[Union[int, str]] = None,
    offload_folder: Optional[str] = "./offload",
+    gpus: Optional[int] = None,
 ) -> dict:
    """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
    max_memory = {}
    if max_memory_per_gpu is not None:
        max_memory_per_gpu_map = {
-            device_idx: max_memory_per_gpu
-            for device_idx in range(torch.cuda.device_count())
+            device_idx: max_memory_per_gpu for device_idx in range(gpus)
        }
        max_memory.update(max_memory_per_gpu_map)
    if max_cpu_memory is not None:
@@ -153,12 +154,16 @@ class HFLM(TemplateLM):
            if accelerator.num_processes > 1:
                self.accelerator = accelerator

+            if "npu" in accelerator.device.type:
+                gpus = torch.npu.device_count()
+
            if not (parallelize or accelerator.num_processes > 1):
                # use user-passed device
                device_list = set(
                    ["cuda", "cpu"]
-                    + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
+                    + [f"cuda:{i}" for i in range(gpus)]
                    + ["mps", "mps:0"]
+                    + [f"npu:{i}" for i in range(gpus)]
                )
                if device and device in device_list:
                    self._device = torch.device(device)
@@ -199,6 +204,15 @@ class HFLM(TemplateLM):
            config=self.config, backend=backend, trust_remote_code=trust_remote_code
        )

+        # load tokenizer so we know tokenizer vocabulary size before loading model and PEFT
+        self._create_tokenizer(
+            pretrained,
+            tokenizer,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            use_fast_tokenizer=use_fast_tokenizer,
+        )
+
        # if we passed `pretrained` as a string, initialize our model now
        if isinstance(pretrained, str):
            self._create_model(
@@ -207,6 +221,7 @@ class HFLM(TemplateLM):
                dtype=dtype,
                trust_remote_code=trust_remote_code,
                parallelize=parallelize,
+                gpus=gpus,
                device_map_option=device_map_option,
                max_memory_per_gpu=max_memory_per_gpu,
                max_cpu_memory=max_cpu_memory,
@@ -235,44 +250,14 @@ class HFLM(TemplateLM):
                        "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
                    )

-        self._create_tokenizer(
-            pretrained,
-            tokenizer,
-            revision=revision,
-            trust_remote_code=trust_remote_code,
-            use_fast_tokenizer=use_fast_tokenizer,
-        )
-
        self.truncation = truncation
        self.logits_cache = logits_cache
        self.vocab_size = self.tokenizer.vocab_size
        # select (or create) a pad token to use
-        if self.tokenizer.pad_token:
-            pass
-        elif self.tokenizer.unk_token:
-            self.tokenizer.pad_token_id = self.tokenizer.unk_token_id
-        elif self.tokenizer.eos_token:
-            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
-        else:
-            if getattr(self.config, "model_type", None) == "qwen":
-                # Qwen's trust_remote_code tokenizer does not allow for adding special tokens
-                self.tokenizer.pad_token = "<|endoftext|>"
-            elif (
-                self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
-                or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
-            ):
-                # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0)
-                # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer
-                # ---
-                # Note that the world tokenizer class name, might change in the future for the final huggingface merge
-                # https://github.com/huggingface/transformers/pull/26963
-                assert self.tokenizer.pad_token_id == 0
-            else:
-                self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
+        self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config)

-        # TODO: override this for Gemma
        self.add_bos_token = add_bos_token
-        if getattr(self.config, "model_type", None) == "gemma":
+        if getattr(self.config, "model_type", None) in ["gemma", "gemma2"]:
            self.add_bos_token = True
            eval_logger.info(
                f"Model type is '{self.config.model_type}', a BOS token will be used as Gemma underperforms without it."
@@ -321,6 +306,7 @@ class HFLM(TemplateLM):
                        in [
                            DistributedType.FSDP,
                            DistributedType.MULTI_GPU,
+                            DistributedType.MULTI_NPU,
                        ]
                    ), "Unsupported distributed type provided. Only DDP and FSDP are supported."
                    if accelerator.distributed_type == DistributedType.FSDP:
@@ -329,9 +315,7 @@ class HFLM(TemplateLM):
                        self._model = accelerator.prepare_model(
                            self.model, evaluation_mode=True
                        )
-                    self._device = torch.device(
-                        f"cuda:{accelerator.local_process_index}"
-                    )
+                    self._device = torch.device(f"{accelerator.device}")
                    self.accelerator = accelerator

                    if self.accelerator.is_local_main_process:
@@ -414,6 +398,16 @@ class HFLM(TemplateLM):
    def world_size(self):
        return self._world_size

+    @property
+    def tokenizer_name(self) -> str:
+        return self.tokenizer.name_or_path.replace("/", "__")
+
+    @property
+    def chat_template(self) -> str:
+        if self.tokenizer.chat_template is not None:
+            return self.tokenizer.chat_template
+        return self.tokenizer.default_chat_template
+
    def _get_backend(
        self,
        config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
@@ -488,6 +482,7 @@ class HFLM(TemplateLM):
        # only used if `parallelize=True`.
        # (accelerate naive PP (device_map) options)
        parallelize: Optional[bool] = False,
+        gpus: Optional[int] = None,
        device_map_option: Optional[str] = "auto",
        max_memory_per_gpu: Optional[Union[int, str]] = None,
        max_cpu_memory: Optional[Union[int, str]] = None,
@@ -519,6 +514,7 @@ class HFLM(TemplateLM):
                    max_memory_per_gpu,
                    max_cpu_memory,
                    offload_folder,
+                    gpus,
                )
            )
        elif "device_map" not in model_kwargs:
@@ -527,9 +523,7 @@ class HFLM(TemplateLM):
            # for quantized models now seems to be device_map="auto"
            # which breaks data-parallel mode.
            if hasattr(self, "accelerator"):
-                model_kwargs.update(
-                    {"device_map": {"": f"cuda:{self.accelerator.local_process_index}"}}
-                )
+                model_kwargs.update({"device_map": {"": f"{self.accelerator.device}"}})
            else:
                model_kwargs.update({"device_map": {"": str(self.device)}})

@@ -579,6 +573,12 @@ class HFLM(TemplateLM):
            if model_kwargs.get("load_in_4bit", None):
                if version.parse(PEFT_VERSION) < version.parse("0.4.0"):
                    raise AssertionError("load_in_4bit requires peft >= 0.4.0")
+            if self._model.config.vocab_size != len(self.tokenizer):
+                # resize model for LoRAs with added tokens
+                self._model.resize_token_embeddings(len(self.tokenizer))
+                eval_logger.info(
+                    f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
+                )
            self._model = PeftModel.from_pretrained(
                self._model, peft, revision=revision
            )
@@ -667,6 +667,8 @@ class HFLM(TemplateLM):
            max_cont_enc = len(continuation_enc[-(self.max_length + 1) :])
        else:
            max_length = self.max_length
+            max_context_enc = max_length
+            max_cont_enc = max_length

        # if OOM, then halves batch_size and tries again
        @find_executable_batch_size(starting_batch_size=self.max_batch_size)
@@ -1277,6 +1279,14 @@ class HFLM(TemplateLM):

        return res

+    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
+        """
+        Method to apply a chat template to a list of chat history between user and model.
+        """
+        return self.tokenizer.apply_chat_template(
+            chat_history, tokenize=False, add_generation_prompt=True
+        )
+
    def get_model_info(self) -> dict:
        """
        Method to get Hugging Face model information for experiment reproducibility.

--- a/lm_eval/models/neuron_optimum.py
+++ b/lm_eval/models/neuron_optimum.py
@@ -288,7 +288,7 @@ class NEURON_HF(TemplateLM):

        self.vocab_size = self.tokenizer.vocab_size
        self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
-        self.add_bos_token = self.add_bos_token
+        self.add_bos_token = add_bos_token

        self._max_length = max_length


--- a/lm_eval/models/textsynth.py
+++ b/lm_eval/models/textsynth.py
-""" TextSynth API
+"""TextSynth API
 Implementation provided by Fabrice Bellard:
    https://github.com/EleutherAI/lm-evaluation-harness/issues/295

@@ -11,6 +11,7 @@ Example usage:

 Homepage: https://textsynth.com/index.html
 """
+
 import logging
 import os


--- a/lm_eval/models/utils.py
+++ b/lm_eval/models/utils.py
@@ -5,6 +5,7 @@ import itertools
 import time
 from functools import wraps
 from typing import (
+    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
@@ -24,6 +25,11 @@ import transformers
 from lm_eval.utils import eval_logger


+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizerBase
+    from transformers.configuration_utils import PretrainedConfig
+
+
 def chunks(iter, n: int = 0, fn=None):
    """
    Divides an iterable into chunks of specified size or based on a given function.
@@ -613,3 +619,48 @@ class Collator:

        if arr:
            yield arr
+
+
+def configure_pad_token(
+    tokenizer: "PreTrainedTokenizerBase",
+    model_config: Optional["PretrainedConfig"] = None,
+) -> "PreTrainedTokenizerBase":
+    """
+    This function checks if the (Hugging Face) tokenizer has a padding token and sets it if not present.
+    Some tokenizers require special handling.
+
+    Args:
+        tokenizer: The tokenizer for which the padding token is to be handled.
+        model_config: The configuration of the model. Default is None.
+
+    Returns:
+        The tokenizer after the padding token has been handled.
+
+    Raises:
+        AssertionError: If the tokenizer is of type RWKVWorldTokenizer or Rwkv5Tokenizer and the padding token id is not 0.
+    """
+    if tokenizer.pad_token:
+        pass
+    elif tokenizer.unk_token:
+        tokenizer.pad_token_id = tokenizer.unk_token_id
+    elif tokenizer.eos_token:
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+    else:
+        # handle special cases
+        if model_config and getattr(model_config, "model_type", None) == "qwen":
+            # Qwen's trust_remote_code tokenizer does not allow for adding special tokens
+            tokenizer.pad_token = "<|endoftext|>"
+        elif (
+            tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
+            or tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
+        ):
+            # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0)
+            # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer
+            # ---
+            # Note that the world tokenizer class name, might change in the future for the final huggingface merge
+            # https://github.com/huggingface/transformers/pull/26963
+            assert tokenizer.pad_token_id == 0
+        else:
+            tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
+
+    return tokenizer
--- a/lm_eval/models/vllm_causallms.py
+++ b/lm_eval/models/vllm_causallms.py
 import copy
 from importlib.metadata import version
 from importlib.util import find_spec
-from typing import List, Literal, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union

 from more_itertools import distribute
 from packaging.version import parse as parse_version
@@ -10,7 +10,7 @@ from tqdm import tqdm
 from lm_eval.api.instance import Instance
 from lm_eval.api.model import TemplateLM
 from lm_eval.api.registry import register_model
-from lm_eval.models.utils import Collator, undistribute
+from lm_eval.models.utils import Collator, configure_pad_token, undistribute
 from lm_eval.utils import (
    eval_logger,
    get_rolling_token_windows,
@@ -21,13 +21,13 @@ from lm_eval.utils import (
 try:
    import ray
    from vllm import LLM, SamplingParams
-
-    if parse_version(version("vllm")) > parse_version("0.3.0"):
-        from vllm.lora.request import LoRARequest
+    from vllm.lora.request import LoRARequest
    from vllm.transformers_utils.tokenizer import get_tokenizer
 except ModuleNotFoundError:
    pass

+if TYPE_CHECKING:
+    pass

 eval_logger = eval_logger

@@ -102,9 +102,6 @@ class VLLM(TemplateLM):
        if self.data_parallel_size <= 1:
            self.model = LLM(**self.model_args)
        else:
-            assert parse_version(version("vllm")) < parse_version(
-                "0.3.3"
-            ), "data_parallel is only compatible with vllm < v0.3.3."
            eval_logger.warning(
                "You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
            )
@@ -123,7 +120,14 @@ class VLLM(TemplateLM):
            trust_remote_code=trust_remote_code,
            tokenizer_revision=tokenizer_revision,
        )
+        self.tokenizer = configure_pad_token(self.tokenizer)
        self.add_bos_token = add_bos_token
+        if "gemma" in pretrained.lower():
+            self.add_bos_token = True
+            eval_logger.info(
+                "Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
+            )
+
        self.custom_prefix_token_id = prefix_token_id
        if prefix_token_id is not None:
            eval_logger.info(
@@ -175,23 +179,46 @@ class VLLM(TemplateLM):
    def max_gen_toks(self):
        return self._max_gen_toks

+    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
+        """
+        Method to apply a chat template to a list of chat history between user and model.
+        """
+        return self.tokenizer.apply_chat_template(
+            chat_history, tokenize=False, add_generation_prompt=True
+        )
+
+    @property
+    def chat_template(self) -> str:
+        if self.tokenizer.chat_template is not None:
+            return self.tokenizer.chat_template
+        return self.tokenizer.default_chat_template
+
+    @property
+    def tokenizer_name(self) -> str:
+        return self.tokenizer.name_or_path.replace("/", "__")
+
    def tok_encode(
        self,
-        string: str,
-        left_truncate_len=None,
-        add_special_tokens=None,
-        truncation=False,
-    ):
-        """ """
+        string: Union[str, List[str]],
+        left_truncate_len: int = None,
+        add_special_tokens: bool = False,
+        truncation: bool = False,
+    ) -> Union[List[int], List[List[int]]]:
        if not add_special_tokens:
            add_special_tokens = False or self.add_bos_token
-        encoding = self.tokenizer.encode(
-            string, add_special_tokens=add_special_tokens, truncation=truncation
-        )
+        encoding: Union[List[List[int]], List[int]] = self.tokenizer(
+            string,
+            add_special_tokens=add_special_tokens,
+            truncation=truncation,
+            return_attention_mask=False,
+        ).input_ids

        # left-truncate the encoded context to be at most `left_truncate_len` tokens long
        if left_truncate_len:
-            encoding = encoding[-left_truncate_len:]
+            if not isinstance(string, str):
+                encoding = [enc[-left_truncate_len:] for enc in encoding]
+            else:
+                encoding = encoding[-left_truncate_len:]

        return encoding

@@ -208,7 +235,7 @@ class VLLM(TemplateLM):
            sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
        else:
            sampling_params = SamplingParams(
-                temperature=0, prompt_logprobs=1, max_tokens=1
+                temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
            )
        if self.data_parallel_size > 1:
            # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
@@ -289,7 +316,9 @@ class VLLM(TemplateLM):

        # batch tokenize contexts
        context, all_gen_kwargs = zip(*(req.args for req in requests))
-        context_encoding = self.tokenizer(context, add_special_tokens=False).input_ids
+        context_encoding: List[List[int]] = self.tok_encode(
+            context, add_special_tokens=self.add_bos_token
+        )
        requests = [
            ((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs)
        ]
@@ -498,7 +527,10 @@ class VLLM(TemplateLM):
    def modify_gen_kwargs(kwargs: dict) -> dict:
        # sampling_params
        do_sample = kwargs.pop("do_sample", None)
-        if do_sample is False or "temperature" not in kwargs:
+        if do_sample is False and "temperature" not in kwargs:
+            eval_logger.debug(
+                "Got `do_sample=False` and no temperature value, setting VLLM temperature to 0.0 ..."
+            )
            kwargs["temperature"] = 0.0
        # hf defaults
        kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)

--- a/lm_eval/tasks/README.md
+++ b/lm_eval/tasks/README.md
+
+# Tasks
+
+ A list of supported tasks and task groupings can be viewed with `lm-eval --tasks list`.
+
+ For more information, including a full list of task names and their precise meanings or sources, follow the links provided to the individual README.md files for each subfolder.
+
+| Task Family | Description | Language(s) |
+|-------------|-------------|-------------|
+| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
+| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
+| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
+| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
+| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
+| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions.  | English |
+| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
+| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
+| [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English |
+| [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque |
+| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
+| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
+| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
+| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
+| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
+| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
+| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
+| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
+| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
+| [commonsense_qa](commmonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
+| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
+| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
+| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
+| csatqa | Tasks related to SAT and other standardized testing questions for academic assessment. | Korean |
+| [drop](drop/README.md) | Tasks requiring numerical reasoning, reading comprehension, and question answering. | English |
+| [eq_bench](eq_bench/README.md) | Tasks focused on equality and ethics in question answering and decision-making. | English |
+| [eus_exams](eus_exams/README.md) | Tasks based on various professional and academic exams in the Basque language. | Basque |
+| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque |
+| [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque |
+| [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque |
+| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
+| [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English |
+| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French|
+| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
+| [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English |
+| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
+| [haerae](haerae/README.md) | Tasks focused on assessing detailed factual and historical knowledge. | Korean |
+| [headqa](headqa/README.md) | A high-level education-based question answering dataset to test specialized knowledge. | Spanish, English |
+| [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English |
+| [hendrycks_ethics](hendrycks_ethics/README.md)     | Tasks designed to evaluate the ethical reasoning capabilities of models. | English |
+| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
+| [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English |
+| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English |
+| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean |
+| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean |
+| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
+| [lambada](lambada/README.md) | Tasks designed to predict the endings of text passages, testing language prediction skills. | English |
+| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English |
+| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
+| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
+| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). Those tasks are static and will not change through time | English |
+| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
+| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
+| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
+| [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English |
+| medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English |
+| medqa | Multiple choice question answering based on the United States Medical License Exams. | |
+| [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu |
+| [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English |
+| mmlu | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
+| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigourous. | English |
+| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
+| [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English |
+| [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English |
+| [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
+| [okapi/hellaswag_multilingual](okapi/hellaswag_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (30 languages) |
+| okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) |
+| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) |
+| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
+| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
+| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
+| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
+| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
+| [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English |
+| [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish |
+| [prost](prost/README.md) | Tasks requiring understanding of professional standards and ethics in various domains. | English |
+| [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English |
+| [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English |
+| [qasper](qasper/README.md) | Question Answering dataset based on academic papers, testing in-depth scientific knowledge. | English |
+| [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English |
+| realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. | |
+| [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English |
+| [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English |
+| [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning.  | English |
+| [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. | English |
+| [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English |
+| [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English |
+| [super_glue](super_glue/README.md) | A suite of challenging tasks designed to test a range of language understanding skills. | English |
+| [swag](swag/README.md) | Situations With Adversarial Generations, predicting the next event in videos. | English |
+| [swde](swde/README.md) | Information extraction tasks from semi-structured web pages. | English |
+| [tinyBenchmarks](tinyBenchmarks/README.md) | Evaluation of large language models with fewer examples using tiny versions of popular benchmarks. | English |
+| [tmmluplus](tmmluplus/README.md) | An extended set of tasks under the TMMLU framework for broader academic assessments. | Traditional Chinese |
+| [toxigen](toxigen/README.md) | Tasks designed to evaluate language models on their propensity to generate toxic content. | English |
+| [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese |
+| [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English |
+| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English |
+| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English |
+| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English |
+| [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English |
+| [wikitext](wikitext/README.md) | Tasks based on text from Wikipedia articles to assess language modeling and generation. | English |
+| [winogrande](winogrande/README.md) | A large-scale dataset for coreference resolution, inspired by the Winograd Schema Challenge. | English |
+| [wmdp](wmdp/README.md) | A benchmark with the objective of minimizing performance, based on potentially-sensitive multiple-choice knowledge questions. | English |
+| [wmt2016](wmt2016/README.md) | Tasks from the WMT 2016 shared task, focusing on translation between multiple languages. | English, Czech, German, Finnish, Russian, Romanian, Turkish |
+| [wsc273](wsc273/README.md) | The Winograd Schema Challenge, a test of commonsense reasoning and coreference resolution. | English |
+| [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese |
+| [xnli](xnli/README.md) | Cross-Lingual Natural Language Inference to test understanding across different languages. | Arabic, Bulgarian, German, Greekm English, Spanish, French, Hindi, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese |
+| [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque |
+| [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese |
+| [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese |
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
 import collections
+import inspect
 import logging
 import os
 from functools import partial
 from typing import Dict, List, Mapping, Optional, Union

 from lm_eval import utils
+from lm_eval.api.group import ConfigurableGroup, GroupConfig
 from lm_eval.api.task import ConfigurableTask, Task
+from lm_eval.evaluator_utils import get_subtask_list
+
+
+GROUP_ONLY_KEYS = list(GroupConfig().to_dict().keys())


 class TaskManager:
@@ -14,27 +20,53 @@ class TaskManager:

    """

-    def __init__(self, verbosity="INFO", include_path: Optional[str] = None) -> None:
+    def __init__(
+        self,
+        verbosity="INFO",
+        include_path: Optional[Union[str, List]] = None,
+        include_defaults: bool = True,
+    ) -> None:
        self.verbosity = verbosity
        self.include_path = include_path
        self.logger = utils.eval_logger
        self.logger.setLevel(getattr(logging, f"{verbosity}"))

-        self._task_index = self.initialize_tasks(include_path=include_path)
+        self._task_index = self.initialize_tasks(
+            include_path=include_path, include_defaults=include_defaults
+        )
        self._all_tasks = sorted(list(self._task_index.keys()))

+        self._all_groups = sorted(
+            [x for x in self._all_tasks if self._task_index[x]["type"] == "group"]
+        )
+        self._all_subtasks = sorted(
+            [x for x in self._all_tasks if self._task_index[x]["type"] == "task"]
+        )
+        self._all_tags = sorted(
+            [x for x in self._all_tasks if self._task_index[x]["type"] == "tag"]
+        )
+
        self.task_group_map = collections.defaultdict(list)

-    def initialize_tasks(self, include_path: Optional[str] = None):
+    def initialize_tasks(
+        self,
+        include_path: Optional[Union[str, List]] = None,
+        include_defaults: bool = True,
+    ):
        """Creates a dictionary of tasks index.

-        :param include_path: str = None
-            An additional path to be searched for tasks
-
+        :param include_path: Union[str, List] = None
+            An additional path to be searched for tasks recursively.
+            Can provide more than one such path as a list.
+        :param include_defaults: bool = True
+            If set to false, default tasks (those in lm_eval/tasks/) are not indexed.
        :return
            Dictionary of task names as key and task metadata
        """
-        all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
+        if include_defaults:
+            all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
+        else:
+            all_paths = []
        if include_path is not None:
            if isinstance(include_path, str):
                include_path = [include_path]
@@ -51,10 +83,88 @@ class TaskManager:
    def all_tasks(self):
        return self._all_tasks

+    @property
+    def all_groups(self):
+        return self._all_groups
+
+    @property
+    def all_subtasks(self):
+        return self._all_subtasks
+
+    @property
+    def all_tags(self):
+        return self._all_tags
+
    @property
    def task_index(self):
        return self._task_index

+    def list_all_tasks(
+        self, list_groups=True, list_tags=True, list_subtasks=True
+    ) -> str:
+        from pytablewriter import MarkdownTableWriter
+
+        def sanitize_path(path):
+            # don't print full path if we are within the lm_eval/tasks dir !
+            # if we aren't though, provide the full path.
+            if "lm_eval/tasks/" in path:
+                return "lm_eval/tasks/" + path.split("lm_eval/tasks/")[-1]
+            else:
+                return path
+
+        group_table = MarkdownTableWriter()
+        group_table.headers = ["Group", "Config Location"]
+        gt_values = []
+        for g in self.all_groups:
+            path = self.task_index[g]["yaml_path"]
+            if path == -1:
+                path = "---"
+            else:
+                path = sanitize_path(path)
+            gt_values.append([g, path])
+        group_table.value_matrix = gt_values
+
+        tag_table = MarkdownTableWriter()
+        tag_table.headers = ["Tag"]
+        tag_table.value_matrix = [[t] for t in self.all_tags]
+
+        subtask_table = MarkdownTableWriter()
+        subtask_table.headers = ["Task", "Config Location", "Output Type"]
+        st_values = []
+        for t in self.all_subtasks:
+            path = self.task_index[t]["yaml_path"]
+
+            output_type = ""
+
+            # read the yaml file to determine the output type
+            if path != -1:
+                config = utils.load_yaml_config(path, mode="simple")
+                if "output_type" in config:
+                    output_type = config["output_type"]
+                elif (
+                    "include" in config
+                ):  # if no output type, check if there is an include with an output type
+                    include_path = path.split("/")[:-1] + config["include"]
+                    include_config = utils.load_yaml_config(include_path, mode="simple")
+                    if "output_type" in include_config:
+                        output_type = include_config["output_type"]
+
+            if path == -1:
+                path = "---"
+            else:
+                path = sanitize_path(path)
+            st_values.append([t, path, output_type])
+        subtask_table.value_matrix = st_values
+
+        result = "\n"
+        if list_groups:
+            result += group_table.dumps() + "\n\n"
+        if list_tags:
+            result += tag_table.dumps() + "\n\n"
+        if list_subtasks:
+            result += subtask_table.dumps() + "\n\n"
+        return result
+
    def match_tasks(self, task_list):
        return utils.pattern_match(task_list, self.all_tasks)

@@ -64,7 +174,12 @@ class TaskManager:
        return False

    def _name_is_task(self, name) -> bool:
-        if self._name_is_registered(name) and ("task" in self.task_index[name]["type"]):
+        if self._name_is_registered(name) and (self.task_index[name]["type"] == "task"):
+            return True
+        return False
+
+    def _name_is_tag(self, name) -> bool:
+        if self._name_is_registered(name) and (self.task_index[name]["type"] == "tag"):
            return True
        return False

@@ -125,89 +240,126 @@ class TaskManager:
                config["group_alias"] = None
        return config

+    def _class_has_config_in_constructor(self, cls):
+        constructor = getattr(cls, "__init__", None)
+        return (
+            "config" in inspect.signature(constructor).parameters
+            if constructor
+            else False
+        )
+
    def _load_individual_task_or_group(
        self,
        name_or_config: Optional[Union[str, dict]] = None,
        parent_name: Optional[str] = None,
        update_config: Optional[dict] = None,
-        yaml_path: Optional[str] = None,
    ) -> Mapping:
-        def load_task(config, task, group=None, yaml_path=None):
+        def _load_task(config, task):
            if "include" in config:
-                if yaml_path is None:
-                    raise ValueError
                config = {
                    **utils.load_yaml_config(
-                        yaml_path,
+                        yaml_path=None,
                        yaml_config={"include": config.pop("include")},
                        mode="full",
                    ),
                    **config,
                }
            if self._config_is_python_task(config):
-                task_object = config["class"]()
+                if self._class_has_config_in_constructor(config["class"]):
+                    task_object = config["class"](config=config)
+                else:
+                    task_object = config["class"]()
+                if isinstance(task_object, ConfigurableTask):
+                    # very scuffed: set task name here. TODO: fixme?
+                    task_object.config.task = config["task"]
            else:
-                config = self._process_alias(config, group=group)
                task_object = ConfigurableTask(config=config)
-            if group is not None:
-                task_object = (group, task_object)
+
            return {task: task_object}

+        def _get_group_and_subtask_from_config(config):
+            group_name = ConfigurableGroup(config=config)
+            subtask_list = []
+            for task in group_name.config["task"]:
+                if isinstance(task, str) and self._name_is_tag(task):
+                    subtask_list.extend(self._get_tasklist(task))
+                else:
+                    subtask_list.append(task)
+            return group_name, subtask_list
+
+        def _process_group_config(config, update_config=None):
+            if update_config is not None:
+                config = {**config, **update_config}
+            _update_config = {
+                k: v for k, v in config.items() if k not in GROUP_ONLY_KEYS
+            }
+            if not bool(_update_config):
+                _update_config = None
+
+            group_config = {k: v for k, v in config.items() if k in GROUP_ONLY_KEYS}
+            return group_config, _update_config
+
        if isinstance(name_or_config, str):
            if update_config is not None:
                # Process name_or_config as a dict instead
                name_or_config = {"task": name_or_config, **update_config}
-            elif self._name_is_task(name_or_config):
+            elif self._name_is_task(name_or_config) or self._name_is_python_task(
+                name_or_config
+            ):
                task_config = self._get_config(name_or_config)
-                return load_task(task_config, task=name_or_config, group=parent_name)
+                return _load_task(task_config, task=name_or_config)
            else:
-                group_name = name_or_config
                subtask_list = self._get_tasklist(name_or_config)
                if subtask_list == -1:
                    group_config = self._get_config(name_or_config)
-                    subtask_list = group_config["task"]
-
-                # This checks if we're at the root.
-                if parent_name is None:
-                    group_config = self._get_config(name_or_config)
-                    if set(group_config.keys()) > {"task", "group"}:
-                        update_config = {
-                            k: v
-                            for k, v in group_config.items()
-                            if k not in ["task", "group"]
-                        }
-                    yaml_path = self._get_yaml_path(group_name)
-
-                    if (update_config is not None) and ("group_alias" in update_config):
-                        group_name = update_config["group_alias"]
-                        update_config.pop("group_alias")
+                    group_config, update_config = _process_group_config(group_config)
+                    group_name, subtask_list = _get_group_and_subtask_from_config(
+                        group_config
+                    )
+                else:
+                    if self._name_is_tag(name_or_config):
+                        fn = partial(
+                            self._load_individual_task_or_group,
+                            update_config=name_or_config
+                            if isinstance(name_or_config, dict)
+                            else None,
+                        )
+                        return dict(
+                            collections.ChainMap(*map(fn, reversed(subtask_list)))
+                        )
+                    else:
+                        group_name = ConfigurableGroup(
+                            config={"group": name_or_config, "task": subtask_list}
+                        )

        if isinstance(name_or_config, dict):
-            if update_config is not None:
-                name_or_config = {
-                    **name_or_config,
-                    **update_config,
-                }
-
            if self._config_is_task(name_or_config):
-                name = name_or_config["task"]
+                name = name_or_config.pop("task")
+                if update_config is not None:
+                    name_or_config = {**name_or_config, **update_config}
                # If the name is registered as a group
-                # if self._name_is_task(name) is False:
                if self._name_is_group(name):
-                    group_name = name
-                    update_config = {
-                        k: v for k, v in name_or_config.items() if k != "task"
-                    }
+                    group_config = self._get_config(name)
+
+                    group_config, update_config = _process_group_config(
+                        group_config, name_or_config
+                    )
+                    group_name, subtask_list = _get_group_and_subtask_from_config(
+                        group_config
+                    )
+                elif self._name_is_tag(name):
                    subtask_list = self._get_tasklist(name)
-                    if subtask_list == -1:
-                        subtask_list = self._get_config(name)["task"]
+                    fn = partial(
+                        self._load_individual_task_or_group,
+                        update_config=name_or_config,
+                    )
+                    return dict(collections.ChainMap(*map(fn, reversed(subtask_list))))
                else:
                    if self._name_is_registered(name):
                        base_task_config = self._get_config(name)

                        # Check if this is a duplicate.
                        if parent_name is not None:
-                            name_or_config["group"] = parent_name
                            num_duplicate = len(
                                list(
                                    filter(
@@ -226,34 +378,21 @@ class TaskManager:
                        }
                    else:
                        task_config = name_or_config
-                    return load_task(
-                        task_config, task=name, group=parent_name, yaml_path=yaml_path
-                    )
+                    return _load_task(task_config, task=name)
            else:
-                group_name = name_or_config["group"]
-                subtask_list = name_or_config["task"]
-                if set(name_or_config.keys()) > {"task", "group"}:
-                    update_config = {
-                        k: v
-                        for k, v in name_or_config.items()
-                        if k not in ["task", "group"]
-                    }
-
-        all_subtasks = {}
-        if parent_name is not None:
-            all_subtasks = {group_name: (parent_name, None)}
+                group_config, update_config = _process_group_config(name_or_config)
+                group_name, subtask_list = _get_group_and_subtask_from_config(
+                    group_config
+                )

        fn = partial(
            self._load_individual_task_or_group,
            parent_name=group_name,
            update_config=update_config,
-            yaml_path=yaml_path,
        )
-        all_subtasks = {
-            **all_subtasks,
-            **dict(collections.ChainMap(*map(fn, subtask_list))),
+        return {
+            group_name: dict(collections.ChainMap(*map(fn, reversed(subtask_list))))
        }
-        return all_subtasks

    def load_task_or_group(self, task_list: Optional[Union[str, list]] = None) -> dict:
        """Loads a dictionary of task objects from a list
@@ -277,10 +416,11 @@ class TaskManager:

    def _get_task_and_group(self, task_dir: str):
        """Creates a dictionary of tasks index with the following metadata,
-        - `type`, that can be either `task`, `python_task`, or `group`.
+        - `type`, that can be either `task`, `python_task`, `group` or `tags`.
            `task` refer to regular task configs, `python_task` are special
            yaml files that only consists of `task` and `class` parameters.
-            `group` are group configs.
+            `group` are group configs. `tags` are labels that can be assigned
+            to tasks to assist in sorting and calling tasks of certain themes.
        - `yaml_path`, path to the yaml file. If the entry is a `group` that
            was configured through a task config, the yaml_path will be -1
            and all subtasks will be listed in `task` (see below)
@@ -296,8 +436,15 @@ class TaskManager:
        :return
            Dictionary of task names as key and task metadata
        """
+        # TODO: remove group in next release
+        print_info = True
+        ignore_dirs = [
+            "__pycache__",
+            ".ipynb_checkpoints",
+        ]
        tasks_and_groups = collections.defaultdict()
-        for root, _, file_list in os.walk(task_dir):
+        for root, dirs, file_list in os.walk(task_dir):
+            dirs[:] = [d for d in dirs if d not in ignore_dirs]
            for f in file_list:
                if f.endswith(".yaml"):
                    yaml_path = os.path.join(root, f)
@@ -337,20 +484,38 @@ class TaskManager:
                            "yaml_path": yaml_path,
                        }

-                        if "group" in config:
-                            groups = config["group"]
-                            if isinstance(config["group"], str):
-                                groups = [groups]
-
-                            for group in groups:
-                                if group not in tasks_and_groups:
-                                    tasks_and_groups[group] = {
-                                        "type": "group",
-                                        "task": [task],
-                                        "yaml_path": -1,
-                                    }
-                                else:
-                                    tasks_and_groups[group]["task"].append(task)
+                        # TODO: remove group in next release
+                        for attr in ["tag", "group"]:
+                            if attr in config:
+                                if attr == "group" and print_info:
+                                    self.logger.info(
+                                        "`group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. "
+                                        "`tag` will be used to allow to call a collection of tasks just like `group`. "
+                                        "`group` will be removed in order to not cause confusion with the new ConfigurableGroup "
+                                        "which will be the offical way to create groups with addition of group-wide configuations."
+                                    )
+                                    print_info = False
+                                    # attr = "tag"
+
+                                attr_list = config[attr]
+                                if isinstance(attr_list, str):
+                                    attr_list = [attr_list]
+
+                                for tag in attr_list:
+                                    if tag not in tasks_and_groups:
+                                        tasks_and_groups[tag] = {
+                                            "type": "tag",
+                                            "task": [task],
+                                            "yaml_path": -1,
+                                        }
+                                    elif tasks_and_groups[tag]["type"] != "tag":
+                                        self.logger.info(
+                                            f"The tag {tag} is already registered as a group, this tag will not be registered. "
+                                            "This may affect tasks you want to call."
+                                        )
+                                        break
+                                    else:
+                                        tasks_and_groups[tag]["task"].append(task)
                    else:
                        self.logger.debug(f"File {f} in {root} could not be loaded")

@@ -379,6 +544,33 @@ def get_task_name_from_object(task_object):
    )


+def _check_duplicates(task_dict: dict) -> List[str]:
+    """helper function solely used in validating get_task_dict output.
+    Takes the output of lm_eval.evaluator_utils.get_subtask_list and
+    returns a list of all leaf subtasks contained within, and errors if any such leaf subtasks are
+    "oversubscribed" to several disjoint groups.
+    """
+    subtask_names = []
+    for key, value in task_dict.items():
+        subtask_names.extend(value)
+
+    duplicate_tasks = {
+        task_name for task_name in subtask_names if subtask_names.count(task_name) > 1
+    }
+
+    # locate the potentially problematic groups that seem to 'compete' for constituent subtasks
+    competing_groups = [
+        group
+        for group in task_dict.keys()
+        if len(set(task_dict[group]).intersection(duplicate_tasks)) > 0
+    ]
+
+    if len(duplicate_tasks) > 0:
+        raise ValueError(
+            f"Found 1 or more tasks while trying to call get_task_dict() that were members of more than 1 called group: {list(duplicate_tasks)}. Offending groups: {competing_groups}. Please call groups which overlap their constituent tasks in separate evaluation runs."
+        )
+
+
 def get_task_dict(
    task_name_list: Union[str, List[Union[str, Dict, Task]]],
    task_manager: Optional[TaskManager] = None,
@@ -396,6 +588,7 @@ def get_task_dict(
    :return
        Dictionary of task objects
    """
+
    task_name_from_string_dict = {}
    task_name_from_config_dict = {}
    task_name_from_object_dict = {}
@@ -413,7 +606,9 @@ def get_task_dict(
        )

    string_task_name_list = [task for task in task_name_list if isinstance(task, str)]
-    others_task_name_list = [task for task in task_name_list if ~isinstance(task, str)]
+    others_task_name_list = [
+        task for task in task_name_list if not isinstance(task, str)
+    ]
    if len(string_task_name_list) > 0:
        if task_manager is None:
            task_manager = TaskManager()
@@ -440,8 +635,16 @@ def get_task_dict(
    ):
        raise ValueError

-    return {
+    final_task_dict = {
        **task_name_from_string_dict,
        **task_name_from_config_dict,
        **task_name_from_object_dict,
    }
+
+    # behavior can get odd if one tries to invoke several groups that "compete" for the same task.
+    # (notably, because one could request several num_fewshot values at once in GroupConfig overrides for the subtask
+    # and we'd be unsure which to use and report.)
+    # we explicitly check and error in this case.
+    _check_duplicates(get_subtask_list(final_task_dict))
+
+    return final_task_dict
--- a/lm_eval/tasks/aclue/README.md
+++ b/lm_eval/tasks/aclue/README.md
@@ -26,7 +26,7 @@ Homepage: https://github.com/isen-zhang/ACLUE
 }
 ```

-### Groups and Tasks
+### Groups, Tags, and Tasks

 #### Groups


--- a/lm_eval/tasks/aclue/_aclue.yaml
+++ b/lm_eval/tasks/aclue/_aclue.yaml
+group: aclue
+task:
+  - aclue_ancient_chinese_culture
+  - aclue_ancient_literature
+  - aclue_ancient_medical
+  - aclue_ancient_phonetics
+  - aclue_basic_ancient_chinese
+  - aclue_couplet_prediction
+  - aclue_homographic_character_resolution
+  - aclue_named_entity_recognition
+  - aclue_poetry_appreciate
+  - aclue_poetry_context_prediction
+  - aclue_poetry_quality_assessment
+  - aclue_poetry_sentiment_analysis
+  - aclue_polysemy_resolution
+  - aclue_reading_comprehension
+  - aclue_sentence_segmentation
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: true
+  - metric: acc_norm
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 0.0
--- a/lm_eval/tasks/aclue/_default_template_yaml
+++ b/lm_eval/tasks/aclue/_default_template_yaml
-group: aclue
 dataset_path: tyouisen/aclue
 test_split: test
 fewshot_split: dev

--- a/lm_eval/tasks/aclue/_generate_configs.py
+++ b/lm_eval/tasks/aclue/_generate_configs.py
 """
 Take in a YAML, and output all other splits with this YAML
 """
+
 import argparse
 import os


--- a/lm_eval/tasks/aexams/README.md
+++ b/lm_eval/tasks/aexams/README.md
@@ -24,11 +24,11 @@ Homepage for Arabic EXAMS: [EXAMS Arabic Homepage](https://github.com/FreedomInt
 ### Citation


-### Groups and Tasks
+### Groups, Tags, and Tasks

 #### Groups

- `EXAMS Arabic`: include IslamicStudies, Biology, Science, Physics, Social.
+- `aexams`: Arabic EXAMS dataset, including IslamicStudies, Biology, Science, Physics, Social subjects.

 #### Tasks


--- a/lm_eval/tasks/aexams/_aexams.yaml
+++ b/lm_eval/tasks/aexams/_aexams.yaml
+group: aexams
+task:
+  - aexams_Biology
+  - aexams_IslamicStudies
+  - aexams_Physics
+  - aexams_Science
+  - aexams_Social
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: true
+  - metric: acc_norm
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 0.0
--- a/lm_eval/tasks/aexams/_default_template_yaml
+++ b/lm_eval/tasks/aexams/_default_template_yaml
-group: aexams
 dataset_path: Hennara/aexams
 test_split: test
 fewshot_split: dev