Commit c1e63555 authored by Yu Shi Jie

Merge branch 'upstream' into 'mmlu-pro'

add tokenizer logs info (#1731)

See merge request shijie.yu/lm-evaluation-harness!4
parents e361687c 42dc2448
import json
import os
import re
import time
from collections import defaultdict
@@ -14,6 +15,7 @@ from huggingface_hub import (
    HfApi,
    hf_hub_url,
)
from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status

from lm_eval.utils import (
    eval_logger,
@@ -112,12 +114,15 @@ class EvaluationTracker:
        output_path: str = None,
        hub_results_org: str = "",
        hub_repo_name: str = "",
        details_repo_name: str = "",
        results_repo_name: str = "",
        push_results_to_hub: bool = False,
        push_samples_to_hub: bool = False,
        public_repo: bool = False,
        token: str = "",
        leaderboard_url: str = "",
        point_of_contact: str = "",
        gated: bool = False,
    ) -> None:
""" """
Creates all the necessary loggers for evaluation tracking. Creates all the necessary loggers for evaluation tracking.
...@@ -126,12 +131,15 @@ class EvaluationTracker: ...@@ -126,12 +131,15 @@ class EvaluationTracker:
output_path (str): Path to save the results. If not provided, the results won't be saved. output_path (str): Path to save the results. If not provided, the results won't be saved.
hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token. hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token.
hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`. hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
details_repo_name (str): The name of the Hugging Face repository to push the details to. If not provided, the results will be pushed to `lm-eval-results`.
result_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will not be pushed and will be found in the details_hub_repo.
push_results_to_hub (bool): Whether to push the results to the Hugging Face hub. push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub. push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
public_repo (bool): Whether to push the results to a public or private repository. public_repo (bool): Whether to push the results to a public or private repository.
token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`. token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card. leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card.
point_of_contact (str): Contact information on the Hugging Face hub dataset card. point_of_contact (str): Contact information on the Hugging Face hub dataset card.
gated (bool): Whether to gate the repository.
""" """
        self.general_config_tracker = GeneralConfigTracker()
@@ -142,6 +150,7 @@ class EvaluationTracker:
        self.leaderboard_url = leaderboard_url
        self.point_of_contact = point_of_contact
        self.api = HfApi(token=token) if token else None
        self.gated_repo = gated

        if not self.api and (push_results_to_hub or push_samples_to_hub):
            raise ValueError(
@@ -159,9 +168,24 @@
                f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'."
            )

        if hub_repo_name == "":
            details_repo_name = (
                details_repo_name if details_repo_name != "" else "lm-eval-results"
            )
            results_repo_name = (
                results_repo_name if results_repo_name != "" else details_repo_name
            )
        else:
            details_repo_name = hub_repo_name
            results_repo_name = hub_repo_name
            eval_logger.warning(
                "hub_repo_name was specified. Both details and results will be pushed to the same repository. Using hub_repo_name is no longer recommended, details_repo_name and results_repo_name should be used instead."
            )

        self.details_repo = f"{hub_results_org}/{details_repo_name}"
        self.details_repo_private = f"{hub_results_org}/{details_repo_name}-private"
        self.results_repo = f"{hub_results_org}/{results_repo_name}"
        self.results_repo_private = f"{hub_results_org}/{results_repo_name}-private"
    def save_results_aggregated(
        self,
@@ -211,9 +235,9 @@ class EvaluationTracker:
        if self.api and self.push_results_to_hub:
            repo_id = (
                self.results_repo
                if self.public_repo
                else self.results_repo_private
            )
            self.api.create_repo(
                repo_id=repo_id,
@@ -221,10 +245,15 @@
                private=not self.public_repo,
                exist_ok=True,
            )
            self.api.upload_file(
                repo_id=repo_id,
                path_or_fileobj=str(
                    path.joinpath(f"results_{self.date_id}.json")
                ),
                path_in_repo=os.path.join(
                    self.general_config_tracker.model_name,
                    f"results_{self.date_id}.json",
                ),
                repo_type="dataset",
                commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
            )
@@ -278,6 +307,7 @@ class EvaluationTracker:
                sample["resps"] = sanitize_list(sample["resps"])
                sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
                sample["arguments"] = arguments
                sample["target"] = str(sample["target"])

                sample_dump = (
                    json.dumps(
@@ -288,14 +318,14 @@
                    + "\n"
                )

                with open(file_results_samples, "a", encoding="utf-8") as f:
                    f.write(sample_dump)

        if self.api and self.push_samples_to_hub:
            repo_id = (
                self.details_repo
                if self.public_repo
                else self.details_repo_private
            )
            self.api.create_repo(
                repo_id=repo_id,
@@ -303,6 +333,18 @@
                private=not self.public_repo,
                exist_ok=True,
            )
            try:
                if self.gated_repo:
                    headers = build_hf_headers()
                    r = get_session().put(
                        url=f"https://huggingface.co/api/datasets/{repo_id}/settings",
                        headers=headers,
                        json={"gated": "auto"},
                    )
                    hf_raise_for_status(r)
            except Exception as e:
                eval_logger.warning("Could not gate the repository")
                eval_logger.info(repr(e))
            self.api.upload_folder(
                repo_id=repo_id,
                folder_path=str(path),
@@ -327,9 +369,7 @@
        """
        eval_logger.info("Recreating metadata card")
        repo_id = self.details_repo if self.public_repo else self.details_repo_private

        files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        results_files = get_results_filenames(files_in_repo)
@@ -360,7 +400,10 @@
                    results_datetime,
                )
                latest_task_results_datetime[samples_key] = latest_datetime
                latest_task_results_datetime[results_key] = max(
                    latest_task_results_datetime[results_key],
                    latest_datetime,
                )
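To illustrate the switch to `max(...)` above: the tracker compares result timestamps as sortable strings, so an older per-task results file can no longer overwrite a newer entry. A tiny sketch with made-up ISO-style timestamps:

```
# Made-up values; the point is that lexicographic max keeps the newest timestamp.
latest = {"results_task": "2024-06-01T10:00:00"}
latest["results_task"] = max(latest["results_task"], "2024-05-31T23:59:59")
assert latest["results_task"] == "2024-06-01T10:00:00"  # newest timestamp wins
```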
        # Create metadata card
        card_metadata = MetadataConfigs()
@@ -377,14 +420,15 @@
            sanitized_last_eval_date_results = re.sub(
                r"[^\w\.]", "_", latest_task_results_datetime[config_name]
            )
            # Ensure that all results files are listed in the metadata card
            current_results = card_metadata.get(config_name, {"data_files": []})
            current_results["data_files"].append(
                {"split": eval_date_sanitized, "path": [str(results_filename)]}
            )
            card_metadata[config_name] = current_results
            # If the results file is the newest, update the "latest" field in the metadata card
            if eval_date_sanitized == sanitized_last_eval_date_results:
                card_metadata[config_name]["data_files"].append(
                    {"split": "latest", "path": [str(results_filename)]}
                )
@@ -403,65 +447,20 @@
            sanitized_last_eval_date_results = re.sub(
                r"[^\w\.]", "_", latest_task_results_datetime[config_name]
            )
            # Ensure that all sample results files are listed in the metadata card
            current_details_for_task = card_metadata.get(
                config_name, {"data_files": []}
            )
            current_details_for_task["data_files"].append(
                {"split": eval_date_sanitized, "path": [str(results_filename)]}
            )
            card_metadata[config_name] = current_details_for_task
            # If the samples results file is the newest, update the "latest" field in the metadata card
            if eval_date_sanitized == sanitized_last_eval_date_results:
                card_metadata[config_name]["data_files"].append(
                    {"split": "latest", "path": [str(results_filename)]}
                )
            # Special case for MMLU with a single split covering it all
            # We add another config with all MMLU splits results together for easy inspection
            SPECIAL_TASKS = ["mmlu", "gpqa", "minerva_math"]
            for special_task in SPECIAL_TASKS:
                if special_task in config_name:
                    special_task = f"{model_name}__{special_task}"
                    former_entry = card_metadata.get(special_task, {"data_files": []})

                    former_split = [
                        (i, entry)
                        for i, entry in enumerate(former_entry["data_files"])
                        if entry.get("split", None) == eval_date_sanitized
                    ]

                    if len(former_split) == 0:
                        former_entry["data_files"].append(
                            {
                                "split": eval_date_sanitized,
                                "path": [str(results_filename)],
                            }
                        )
                    else:
                        split_index, _ = former_split[0]
                        former_entry["data_files"][split_index]["path"].append(
                            str(results_filename)
                        )

                    if eval_date_sanitized == sanitized_last_eval_date_results:
                        latest_split = [
                            (i, entry)
                            for i, entry in enumerate(former_entry["data_files"])
                            if entry.get("split", None) == "latest"
                        ]
                        if len(latest_split) == 0:
                            former_entry["data_files"].append(
                                {"split": "latest", "path": [str(results_filename)]}
                            )
                        else:
                            latest_index, _ = latest_split[0]
                            former_entry["data_files"][latest_index]["path"].append(
                                str(results_filename)
                            )

                    card_metadata[special_task] = former_entry
        # Get latest results and extract info to update metadata card examples
        latest_datetime = max(latest_task_results_datetime.values())
        latest_model_name = max(
...
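For orientation, a hedged usage sketch of the new constructor arguments (not part of the diff itself); the organization name, repository names, and token below are placeholders:

```
# Illustrative only: push per-sample details and aggregated results to two
# separate, gated dataset repos. "my-org" and the token are placeholders.
tracker = EvaluationTracker(
    output_path="./eval_out",
    hub_results_org="my-org",               # placeholder organization
    details_repo_name="lm-eval-details",    # per-sample details repo
    results_repo_name="lm-eval-results",    # aggregated results repo
    push_results_to_hub=True,
    push_samples_to_hub=True,
    public_repo=False,
    token="hf_xxx",                         # placeholder token
    gated=True,
)
```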
@@ -110,3 +110,34 @@ def add_env_info(storage: Dict[str, Any]):
        "upper_git_hash": upper_dir_commit,  # in case this repo is submodule
    }
    storage.update(added_info)

def add_tokenizer_info(storage: Dict[str, Any], lm):
    if getattr(lm, "tokenizer", False):
        try:
            tokenizer_info = {
                "tokenizer_pad_token": [
                    lm.tokenizer.pad_token,
                    str(lm.tokenizer.pad_token_id),
                ],
                "tokenizer_eos_token": [
                    lm.tokenizer.eos_token,
                    str(lm.tokenizer.eos_token_id),
                ],
                "tokenizer_bos_token": [
                    lm.tokenizer.bos_token,
                    str(lm.tokenizer.bos_token_id),
                ],
                "eot_token_id": getattr(lm, "eot_token_id", None),
                "max_length": getattr(lm, "max_length", None),
            }
            storage.update(tokenizer_info)
        except Exception as err:
            logger.debug(
                f"Logging detailed tokenizer info failed with {err}, skipping..."
            )
            # seems gguf and textsynth do not have tokenizer
    else:
        logger.debug(
            "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."
        )
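A minimal sketch (not from the merge request) of how the new helper populates a results dict; it assumes `transformers` is installed and uses `gpt2` purely as an illustrative checkpoint:

```
# Hypothetical example: any object exposing a `tokenizer` attribute
# (plus optional eot_token_id / max_length) works with add_tokenizer_info.
from transformers import AutoTokenizer


class _DummyLM:
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    eot_token_id = tokenizer.eos_token_id
    max_length = 1024


results = {}
add_tokenizer_info(results, _DummyLM())
# results now holds tokenizer_pad_token, tokenizer_eos_token,
# tokenizer_bos_token, eot_token_id and max_length entries.
```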
from . import (
    anthropic_llms,
    api_models,
    dummy,
    gguf,
    huggingface,
...
import os
from functools import cached_property
from typing import Any, Dict, List, Tuple, Union

from tqdm import tqdm

from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.openai_completions import LocalCompletionsAPI
from lm_eval.models.utils import retry_on_specific_exceptions
@@ -138,7 +141,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
        return messages()


@register_model("anthropic-completions")
class AnthropicLM(LM):
    REQ_CHUNK_SIZE = 20  # TODO: not used
@@ -271,90 +274,89 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
@register_model("anthropic-chat", "anthropic-chat-completions")
class AnthropicChat(LocalCompletionsAPI):
    def __init__(
        self,
        base_url="https://api.anthropic.com/v1/messages",
        tokenizer_backend=None,
        **kwargs,
    ):
        super().__init__(
            base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
        )
        eval_logger.warning(
            "Chat completions does not support batching. Defaulting to batch size 1."
        )
        self._batch_size = 1
        self.anthropic_version = "2023-06-01"
        eval_logger.warning(
            f"Using Anthropic Version: {self.anthropic_version}. Confirm the current version here: https://docs.anthropic.com/en/api/versioning"
        )

    @cached_property
    def api_key(self):
        """Override this property to return the API key for the API request."""
        key = os.environ.get("ANTHROPIC_API_KEY", None)
        if key is None:
            raise ValueError(
                "API key not found. Please set the ANTHROPIC_API_KEY environment variable."
            )
        return key

    @cached_property
    def header(self):
        return {
            "x-api-key": f"{self.api_key}",
            "anthropic-version": self.anthropic_version,
        }

    def _create_payload(
        self, messages: List[Dict], generate=True, gen_kwargs: dict = None, **kwargs
    ) -> dict:
        system = (
            messages[0].get("content") if messages[0].get("role") == "system" else None
        )
        if system:
            messages = messages[1:]
        gen_kwargs.pop("do_sample", False)
        max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
        temperature = gen_kwargs.pop("temperature", 0)
        stop = gen_kwargs.pop("until", ["\n\nHuman:"])
        if not isinstance(stop, list):
            stop = [stop]
        out = {
            "messages": messages,
            "model": self.model,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stop_sequences": stop,
            **gen_kwargs,
        }
        if system:
            out["system"] = system
        return out

    def parse_generations(
        self, outputs: Union[Dict, List[Dict]], **kwargs
    ) -> List[str]:
        res = []
        if not isinstance(outputs, list):
            outputs = [outputs]
        for out in outputs:
            for choices in out["content"]:
                res.append(choices["text"])
        return res

    def tok_encode(
        self,
        string: str,
        left_truncate_len=None,
        add_special_tokens=None,
        **kwargs,
    ) -> List[str]:
        return [string]

    def _loglikelihood_tokens(self, requests, **kwargs):
        raise NotImplementedError(
            "Anthropic Chat Completions API does not support the return of loglikelihoods."
        )
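For orientation, a hypothetical sketch of the request body `_create_payload` assembles from a chat history; the model name shown is a placeholder, not something this merge sets:

```
messages = [
    {"role": "system", "content": "You are terse."},
    {"role": "user", "content": "Name a prime number."},
]
gen_kwargs = {"max_gen_toks": 64, "until": "\n\nHuman:", "do_sample": False}
# With self.model set to a placeholder such as "claude-3-haiku-20240307",
# _create_payload would return roughly:
payload = {
    "messages": [{"role": "user", "content": "Name a prime number."}],
    "model": "claude-3-haiku-20240307",  # placeholder
    "max_tokens": 64,
    "temperature": 0,
    "stop_sequences": ["\n\nHuman:"],
    "system": "You are terse.",
}
```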
@@ -30,6 +30,7 @@ from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
    Collator,
    clear_torch_cache,
    configure_pad_token,
    get_dtype,
    pad_and_concat,
    stop_sequences_criteria,
@@ -253,35 +254,13 @@ class HFLM(TemplateLM):
        self.logits_cache = logits_cache
        self.vocab_size = self.tokenizer.vocab_size
        # select (or create) a pad token to use
        self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config)

        self.add_bos_token = add_bos_token
        if "gemma" in getattr(self.config, "model_type", ""):
            self.add_bos_token = True
            eval_logger.info(
                f"Model type is '{self.config.model_type}', part of the Gemma family--a BOS token will be used as Gemma underperforms without it."
            )

        self._max_length = max_length
...
@@ -5,6 +5,7 @@ import itertools
import time
from functools import wraps
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
@@ -24,6 +25,11 @@ import transformers
from lm_eval.utils import eval_logger


if TYPE_CHECKING:
    from transformers import PreTrainedTokenizerBase
    from transformers.configuration_utils import PretrainedConfig


def chunks(iter, n: int = 0, fn=None):
    """
    Divides an iterable into chunks of specified size or based on a given function.
@@ -613,3 +619,48 @@ class Collator:
            if arr:
                yield arr

def configure_pad_token(
    tokenizer: "PreTrainedTokenizerBase",
    model_config: Optional["PretrainedConfig"] = None,
) -> "PreTrainedTokenizerBase":
    """
    This function checks if the (Hugging Face) tokenizer has a padding token and sets it if not present.
    Some tokenizers require special handling.

    Args:
        tokenizer: The tokenizer for which the padding token is to be handled.
        model_config: The configuration of the model. Default is None.

    Returns:
        The tokenizer after the padding token has been handled.

    Raises:
        AssertionError: If the tokenizer is of type RWKVWorldTokenizer or Rwkv5Tokenizer and the padding token id is not 0.
    """
    if tokenizer.pad_token:
        pass
    elif tokenizer.unk_token:
        tokenizer.pad_token_id = tokenizer.unk_token_id
    elif tokenizer.eos_token:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    else:
        # handle special cases
        if model_config and getattr(model_config, "model_type", None) == "qwen":
            # Qwen's trust_remote_code tokenizer does not allow for adding special tokens
            tokenizer.pad_token = "<|endoftext|>"
        elif (
            tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
            or tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
        ):
            # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0)
            # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer
            # ---
            # Note that the world tokenizer class name, might change in the future for the final huggingface merge
            # https://github.com/huggingface/transformers/pull/26963
            assert tokenizer.pad_token_id == 0
        else:
            tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

    return tokenizer
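A minimal usage sketch of the extracted helper, assuming a standard Hugging Face checkpoint (`gpt2` is only an example):

```
from transformers import AutoConfig, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")   # GPT-2 ships without a pad token
cfg = AutoConfig.from_pretrained("gpt2")
tok = configure_pad_token(tok, model_config=cfg)
print(tok.pad_token_id)  # now set; GPT-2 falls back to its unk/eos token (id 50256)
```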
import copy
from importlib.metadata import version
from importlib.util import find_spec
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union

from more_itertools import distribute
from packaging.version import parse as parse_version
@@ -10,7 +10,7 @@ from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator, configure_pad_token, undistribute
from lm_eval.utils import (
    eval_logger,
    get_rolling_token_windows,
@@ -26,6 +26,8 @@ try:
except ModuleNotFoundError:
    pass

if TYPE_CHECKING:
    pass

eval_logger = eval_logger
@@ -118,11 +120,12 @@ class VLLM(TemplateLM):
            trust_remote_code=trust_remote_code,
            tokenizer_revision=tokenizer_revision,
        )
        self.tokenizer = configure_pad_token(self.tokenizer)
        self.add_bos_token = add_bos_token
        if "gemma" in pretrained.lower():
            self.add_bos_token = True
            eval_logger.info(
                "Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
            )

        self.custom_prefix_token_id = prefix_token_id
@@ -176,23 +179,46 @@ class VLLM(TemplateLM):
    def max_gen_toks(self):
        return self._max_gen_toks
    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
        """
        Method to apply a chat template to a list of chat history between user and model.
        """
        return self.tokenizer.apply_chat_template(
            chat_history, tokenize=False, add_generation_prompt=True
        )

    @property
    def chat_template(self) -> str:
        if self.tokenizer.chat_template is not None:
            return self.tokenizer.chat_template
        return self.tokenizer.default_chat_template

    @property
    def tokenizer_name(self) -> str:
        return self.tokenizer.name_or_path.replace("/", "__")
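A small, hypothetical usage sketch of the new chat-template hooks; `lm` stands for an already-constructed VLLM instance:

```
history = [{"role": "user", "content": "Hello!"}]
prompt = lm.apply_chat_template(history)  # templated string with generation prompt appended
print(lm.tokenizer_name)                  # tokenizer path with "/" replaced by "__"
```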
    def tok_encode(
        self,
        string: Union[str, List[str]],
        left_truncate_len: int = None,
        add_special_tokens: bool = False,
        truncation: bool = False,
    ) -> Union[List[int], List[List[int]]]:
        """ """
        if not add_special_tokens:
            add_special_tokens = False or self.add_bos_token
        encoding: Union[List[List[int]], List[int]] = self.tokenizer(
            string,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            return_attention_mask=False,
        ).input_ids

        # left-truncate the encoded context to be at most `left_truncate_len` tokens long
        if left_truncate_len:
            if not isinstance(string, str):
                encoding = [enc[-left_truncate_len:] for enc in encoding]
            else:
                encoding = encoding[-left_truncate_len:]

        return encoding
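Behavior sketch for the updated `tok_encode` (hypothetical `lm` VLLM instance, arbitrary strings): it now accepts either a single string or a batch and left-truncates each sequence independently:

```
single = lm.tok_encode("one two three", left_truncate_len=2)                # List[int], last 2 ids kept
batch = lm.tok_encode(["one two three", "four five"], left_truncate_len=2)  # List[List[int]]
```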
@@ -209,7 +235,7 @@ class VLLM(TemplateLM):
            sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
        else:
            sampling_params = SamplingParams(
                temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
            )
        if self.data_parallel_size > 1:
            # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
@@ -290,7 +316,9 @@
        # batch tokenize contexts
        context, all_gen_kwargs = zip(*(req.args for req in requests))
        context_encoding: List[List[int]] = self.tok_encode(
            context, add_special_tokens=self.add_bos_token
        )
        requests = [
            ((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs)
        ]
...
@@ -26,6 +26,7 @@
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
@@ -48,6 +49,7 @@
| [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English |
| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
| [ifeval](ifeval/README.md) | Instruction-following evaluation tasks that check whether responses satisfy explicit, verifiable constraints. | English |
| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English |
| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean |
| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean |
| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
@@ -55,15 +57,18 @@
| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English |
| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). These tasks are static and will not change over time. | English |
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
| [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English |
| [med_concepts_qa](med_concepts_qa/README.md) | Benchmark for evaluating LLMs on their ability to interpret medical codes and distinguish between medical concepts. | English |
| medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English |
| medqa | Multiple choice question answering based on the United States Medical License Exams. | |
| [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu |
| [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English |
| mmlu | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English |
| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
| [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English |
| [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English |
...
@@ -26,7 +26,7 @@ Homepage: https://github.com/isen-zhang/ACLUE
}
```

### Groups, Tags, and Tasks

#### Groups
...
group: aclue
task:
- aclue_ancient_chinese_culture
- aclue_ancient_literature
- aclue_ancient_medical
- aclue_ancient_phonetics
- aclue_basic_ancient_chinese
- aclue_couplet_prediction
- aclue_homographic_character_resolution
- aclue_named_entity_recognition
- aclue_poetry_appreciate
- aclue_poetry_context_prediction
- aclue_poetry_quality_assessment
- aclue_poetry_sentiment_analysis
- aclue_polysemy_resolution
- aclue_reading_comprehension
- aclue_sentence_segmentation
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: true
  - metric: acc_norm
    aggregation: mean
    weight_by_size: true
metadata:
  version: 0.0
group: aclue
dataset_path: tyouisen/aclue
test_split: test
fewshot_split: dev
...
@@ -24,11 +24,11 @@ Homepage for Arabic EXAMS: [EXAMS Arabic Homepage](https://github.com/FreedomInt
### Citation

### Groups, Tags, and Tasks

#### Groups
- `aexams`: Arabic EXAMS dataset, including IslamicStudies, Biology, Science, Physics, Social subjects.

#### Tasks
...
group: aexams
task:
- aexams_Biology
- aexams_IslamicStudies
- aexams_Physics
- aexams_Science
- aexams_Social
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: true
  - metric: acc_norm
    aggregation: mean
    weight_by_size: true
metadata:
  version: 0.0
group: aexams
dataset_path: Hennara/aexams
test_split: test
fewshot_split: dev
...
# AfriMGSM (IrokoBench)
### Paper
IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models
https://arxiv.org/pdf/2406.03368
IrokoBench is a human-translated benchmark dataset for 16 typologically diverse
low-resource African languages covering three tasks: natural language inference (AfriXNLI),
mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU).
### Citation
```
@misc{adelani2024irokobenchnewbenchmarkafrican,
title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models},
author={David Ifeoluwa Adelani and Jessica Ojo and Israel Abebe Azime and Jian Yun Zhuang and Jesujoba O. Alabi and Xuanli He and Millicent Ochieng and Sara Hooker and Andiswa Bukula and En-Shiun Annie Lee and Chiamaka Chukwuneke and Happy Buzaaba and Blessing Sibanda and Godson Kalipe and Jonathan Mukiibi and Salomon Kabongo and Foutse Yuehgoh and Mmasibidi Setaka and Lolwethu Ndolela and Nkiruka Odu and Rooweither Mabuya and Shamsuddeen Hassan Muhammad and Salomey Osei and Sokhar Samb and Tadesse Kebede Guge and Pontus Stenetorp},
year={2024},
eprint={2406.03368},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2406.03368},
}
```
### Groups and Tasks
#### Groups
* `afrimgsm`: All afrimgsm tasks
* `afrimgsm_direct`: evaluates model performance on the curated dataset
* `afrimgsm_en_cot`: includes 5 few-shot exemplars for the chain-of-thought approach
* `afrimgsm_translate`: evaluates models in the translate-test setting
#### Tasks
* `afrimgsm_direct_{language_code}`: each task evaluates for one language
* `afrimgsm_en_cot_{language_code}`: each task evaluates for one language
* `afrimgsm_translate_{language_code}`: each task evaluates for one language
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
* [x] Checked for equivalence with v0.3.0 LM Evaluation Harness
# Generated by utils.py
dataset_name: amh
doc_to_target: '{% if answer is not none %}{{answer[15:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
  do_sample: false
  until:
    - 'Question:'
    - </s>
    - <|im_end|>
include: direct_yaml
task: afrimgsm_direct_amh
# Generated by utils.py
dataset_name: eng
doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
  do_sample: false
  until:
    - 'Question:'
    - </s>
    - <|im_end|>
include: direct_yaml
task: afrimgsm_direct_eng