Revert "batch commit"

This reverts commit d859d1ca.

Revert "batch commit"
This reverts commit d859d1ca.
137b5423 · Nathan Habib · d859d1ca · 137b5423 · 137b5423 · 137b5423
Commit 137b5423 authored Jun 26, 2024 by Nathan Habib
20 changed files
--- a/docs/interface.md
+++ b/docs/interface.md
@@ -102,12 +102,10 @@ results = lm_eval.simple_evaluate( # call simple_evaluate
 )
 ```
-See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L35 for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously.
+See the `simple_evaluate()` and `evaluate()` functions in [lm_eval/evaluator.py](../lm_eval/evaluator.py#:~:text=simple_evaluate) for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously.
 Additionally, the `evaluate()` function offers the core evaluation functionality provided by the library, but without some of the special handling and simplification + abstraction provided by `simple_evaluate()`.
-See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L173 for more details.
 As a brief example usage of `evaluate()`:
 ```python
@@ -147,7 +145,7 @@ task_dict = lm_eval.tasks.get_task_dict(
    task_manager # A task manager that allows lm_eval to
                 # load the task during evaluation.
                 # If none is provided, `get_task_dict`
-                 # will instantiated one itself, but this
+                 # will instantiate one itself, but this
                 # only includes the stock tasks so users
                 # will need to set this if including
                 # custom paths is required.

--- a/lm_eval/__main__.py
+++ b/lm_eval/__main__.py
@@ -5,7 +5,6 @@ import os
 import sys
 from functools import partial
 from typing import Union
-from accelerate import Accelerator
 from lm_eval import evaluator, utils
 from lm_eval.evaluator import request_caching_arg_to_dict
@@ -293,6 +292,13 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
            "If fewshot_as_multiturn is set, apply_chat_template must be set to True."
        )
+    if (
+        args.num_fewshot is None or args.num_fewshot == 0
+    ) and args.fewshot_as_multiturn:
+        raise ValueError(
+            "If fewshot_as_multiturn is set, num_fewshot must be greater than 0."
+        )
    if args.include_path is not None:
        eval_logger.info(f"Including path: {args.include_path}")
    task_manager = TaskManager(args.verbosity, include_path=args.include_path)
@@ -348,11 +354,17 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
    # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
    if args.trust_remote_code:
-        os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = str(args.trust_remote_code)
+        eval_logger.info(
-        args.model_args = (
+            "Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true`"
-            args.model_args
-            + f",trust_remote_code={os.environ['HF_DATASETS_TRUST_REMOTE_CODE']}"
        )
+        # HACK: import datasets and override its HF_DATASETS_TRUST_REMOTE_CODE value internally,
+        # because it's already been determined based on the prior env var before launching our
+        # script--`datasets` gets imported by lm_eval internally before these lines can update the env.
+        import datasets
+        datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
+        args.model_args = args.model_args + ",trust_remote_code=True"
    eval_logger.info(f"Selected Tasks: {task_names}")
@@ -388,9 +400,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
        **request_caching_args,
    )
-    accelerator = Accelerator()
+    if results is not None:
-    if results is not None and accelerator.is_main_process:
        if args.log_samples:
            samples = results.pop("samples")
        dumped = json.dumps(

--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -67,9 +67,9 @@ class TaskConfig(dict):
    training_split: Optional[str] = None
    validation_split: Optional[str] = None
    test_split: Optional[str] = None
-    fewshot_split: Optional[
+    fewshot_split: Optional[str] = (
-        str
+        None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
-    ] = None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
+    )
    # formatting / prompting options.
    # see docs/advanced_task_guide.md for more info
    process_docs: Optional[Callable] = None
@@ -92,9 +92,9 @@ class TaskConfig(dict):
    filter_list: Optional[Union[str, list]] = None
    should_decontaminate: bool = False
    doc_to_decontamination_query: Optional[str] = None
-    metadata: Optional[
+    metadata: Optional[dict] = (
-        dict
+        None  # by default, not used in the code. allows for users to pass arbitrary info to tasks
-    ] = None  # by default, not used in the code. allows for users to pass arbitrary info to tasks
+    )
    def __post_init__(self) -> None:
        if self.generation_kwargs is not None:
@@ -229,9 +229,9 @@ class Task(abc.ABC):
        self._config: TaskConfig = TaskConfig({**config}) if config else TaskConfig()
        self._filters = [build_filter_ensemble("none", [["take_first", None]])]
-        self.fewshot_rnd: Optional[
+        self.fewshot_rnd: Optional[random.Random] = (
-            random.Random
+            None  # purposely induce errors in case of improper usage
-        ] = None  # purposely induce errors in case of improper usage
+        )
    def download(
        self,
@@ -368,15 +368,16 @@ class Task(abc.ABC):
    def build_all_requests(
        self,
        *,
-        limit=None,
+        limit: Union[int, None] = None,
-        rank=None,
+        rank: int = 0,
-        world_size=None,
+        world_size: int = 1,
-        cache_requests=False,
+        cache_requests: bool = False,
-        rewrite_requests_cache=False,
+        rewrite_requests_cache: bool = False,
-        system_instruction=None,
+        system_instruction: Optional[str] = None,
-        apply_chat_template=False,
+        apply_chat_template: bool = False,
-        fewshot_as_multiturn=False,
+        fewshot_as_multiturn: bool = False,
-        lm=None,
+        chat_template: Optional[Callable] = None,
+        tokenizer_name: str = "",
    ) -> None:
        """Build a set of Instances for a task, and store them in task.instances"""
@@ -391,7 +392,7 @@ class Task(abc.ABC):
            if system_instruction is not None
            else ""
        )
-        cache_key += f"-tokenizer{lm.tokenizer_name}" if apply_chat_template else ""
+        cache_key += f"-tokenizer{tokenizer_name}"
        cached_instances = load_from_cache(file_name=cache_key)
@@ -436,7 +437,7 @@ class Task(abc.ABC):
                system_instruction,
                apply_chat_template,
                fewshot_as_multiturn,
-                lm,
+                chat_template,
            )
            # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
@@ -444,7 +445,6 @@ class Task(abc.ABC):
                doc=doc,
                ctx=fewshot_ctx,
                metadata=(self.config["task"], doc_id, self.config.repeats),
-                apply_chat_template=apply_chat_template
            )
            if not isinstance(inst, list):
@@ -987,28 +987,6 @@ class ConfigurableTask(Task):
            return super().fewshot_docs()
    @staticmethod
-    def append_target_question(
-        labeled_examples: List[Dict[str, str]],
-        question: str,
-        fewshot_as_multiturn: bool = False,
-    ) -> None:
-        """Adds a target question to the labeled examples list.
-        If fewshot_as_multiturn is True, or labeled_examples is empty, or the last entry is a system turn, appends the question as a new user entry.
-        Otherwise, it is appended to the last user entry, ensuring that the conversation alternates between the user and the assistant.
-        """
-        if not fewshot_as_multiturn:
-            # if no messages or last message is system, append as new user entry
-            if len(labeled_examples) == 0 or labeled_examples[-1]["role"] == "system":
-                labeled_examples.append({"role": "user", "content": question})
-            # if last message is user, append to it to avoid two user messages in a row
-            else:
-                labeled_examples[-1]["content"] += question
-        else:
-            return self.sampler.fewshot_delimiter + "".join(
-                f"{s['role']}: {s['content']}" + self.sampler.fewshot_delimiter
-                for s in chat_history
-            )
-    @staticmethod
    def append_target_question(
        labeled_examples: List[Dict[str, str]],
        question: str,
@@ -1037,7 +1015,7 @@ class ConfigurableTask(Task):
        system_instruction: Optional[str] = None,
        apply_chat_template: bool = False,
        fewshot_as_multiturn: bool = False,
-        lm=None,
+        chat_template: Optional[Callable] = None,
    ) -> str:
        """Returns a fewshot context string that is made up of a prepended description
        (if provided), the `num_fewshot` number of examples, and an appended prompt example.
@@ -1050,12 +1028,10 @@ class ConfigurableTask(Task):
            System instruction to be applied to the prompt.
        :param apply_chat_template: bool
            Whether to apply the chat template to the fewshot context.
-        :param tokenizer:
-            The tokenizer to use for applying the chat template.
        :param fewshot_as_multiturn: bool
            Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
-        :param lm:
+        :param chat_template: Callable
-            Language model with definition of the tokenizer/function to use for applying the chat template.
+            Chat template to be applied to the fewshot context.
        :returns: str
            The fewshot context.
        """
@@ -1102,7 +1078,7 @@ class ConfigurableTask(Task):
        example = self.doc_to_text(doc)
        if apply_chat_template:
            if self.multiple_input:
-                return lm.apply_chat_template(labeled_examples)
+                return chat_template(labeled_examples)
            if isinstance(example, str):
                self.append_target_question(
                    labeled_examples, example, fewshot_as_multiturn
@@ -1114,7 +1090,7 @@ class ConfigurableTask(Task):
                for ex in example:
                    chat = deepcopy(labeled_examples)
                    self.append_target_question(chat, ex, fewshot_as_multiturn)
-                    labeled_examples_list.append(lm.apply_chat_template(chat))
+                    labeled_examples_list.append(chat_template(chat))
                return labeled_examples_list
            # if example is an integer, append the choice or convert to string
            elif isinstance(example, int):
@@ -1128,7 +1104,7 @@ class ConfigurableTask(Task):
                        labeled_examples, str(example), fewshot_as_multiturn
                    )
                # return lm.apply_chat_template(labeled_examples)
-            return lm.apply_chat_template(labeled_examples)
+            return chat_template(labeled_examples)
        else:
            if self.multiple_input:
                return labeled_examples
@@ -1295,8 +1271,6 @@ class ConfigurableTask(Task):
        elif self.OUTPUT_TYPE == "multiple_choice":
            choices = self.doc_to_choice(doc)
            target_delimiter = self.config.target_delimiter
-            if kwargs.get("apply_chat_template", False) is True:
-                target_delimiter = ""
            if self.multiple_input:
                # If there are multiple inputs, choices are placed in the ctx
                cont = self.doc_to_target(doc)
@@ -1306,7 +1280,6 @@ class ConfigurableTask(Task):
            else:
                # Otherwise they are placed in the continuation
                arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]
-            kwargs.pop("apply_chat_template")
            request_list = [
                Instance(
@@ -1343,7 +1316,6 @@ class ConfigurableTask(Task):
        elif self.OUTPUT_TYPE == "generate_until":
            arguments = (ctx, deepcopy(self.config.generation_kwargs))
-        kwargs.pop("apply_chat_template")
        return Instance(
            request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs
        )

--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -22,7 +22,7 @@ from lm_eval.evaluator_utils import (
    run_task_tests,
 )
 from lm_eval.loggers import EvaluationTracker
-from lm_eval.loggers.utils import add_env_info, get_git_commit_hash
+from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash
 from lm_eval.tasks import TaskManager, get_task_dict
 from lm_eval.utils import (
    eval_logger,
@@ -271,6 +271,7 @@ def simple_evaluate(
            model_args=model_args,
            system_instruction=system_instruction,
            chat_template=lm.chat_template if apply_chat_template else None,
+            fewshot_as_multiturn=fewshot_as_multiturn,
        )
    results = evaluate(
@@ -325,6 +326,7 @@ def simple_evaluate(
        results["git_hash"] = get_git_commit_hash()
        results["date"] = start_date
        add_env_info(results)  # additional environment info to results
+        add_tokenizer_info(results, lm)  # additional info about tokenizer
        return results
    else:
        return None
@@ -397,7 +399,12 @@ def evaluate(
            system_instruction=system_instruction,
            apply_chat_template=apply_chat_template,
            fewshot_as_multiturn=fewshot_as_multiturn,
-            lm=lm,
+            chat_template=getattr(lm, "apply_chat_template")
+            if apply_chat_template
+            else None,
+            tokenizer_name=getattr(lm, "tokenizer_name", "")
+            if apply_chat_template
+            else "",
        )
        eval_logger.debug(
            f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
@@ -607,16 +614,16 @@ def evaluate(
                    ]
                    # compute group's pooled metric and stderr
-                    results[group][
+                    results[group][metric] = (
-                        metric
+                        lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
-                    ] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
+                    )
                    # TODO: calculate grouped metric using aggregation fn
                    if "N/A" in stderrs:
                        results[group][stderr] = "N/A"
                    else:
-                        results[group][
+                        results[group][stderr] = (
-                            stderr
+                            lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
-                        ] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
+                        )
                        # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
                        # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
                        # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)

--- a/lm_eval/evaluator_utils.py
+++ b/lm_eval/evaluator_utils.py
@@ -275,9 +275,9 @@ def consolidate_results(
                metric_key
            ]
            results[task_output.task_name]["samples"] = task_output.sample_len
-            results[task_output.task_name][
+            results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = (
-                f"{metric}_stderr,{filter_key}"
+                task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
-            ] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
+            )
    return results, samples, configs, versions, num_fewshot, higher_is_better

--- a/lm_eval/filters/decontamination.py
+++ b/lm_eval/filters/decontamination.py
@@ -4,7 +4,6 @@ from lm_eval.api.registry import register_filter
 @register_filter("decontaminate")
 class DecontaminationFilter(Filter):
    """
    A filter which evaluates
    """

--- a/lm_eval/filters/extraction.py
+++ b/lm_eval/filters/extraction.py
@@ -62,8 +62,11 @@ class WhitespaceFilter(Filter):
        def filter_set(inst):
            filtered_resp = []
            for resp in inst:
-                resp = resp.lstrip()
+                if resp.startswith(" "):
+                    resp = resp[1:]
                filtered_resp.append(resp)
            return filtered_resp
        filtered_resps = [filter_set(resp) for resp in resps]

--- a/lm_eval/loggers/evaluation_tracker.py
+++ b/lm_eval/loggers/evaluation_tracker.py
 import json
-import os
 import re
 import time
 from collections import defaultdict
 from dataclasses import asdict, dataclass
 from datetime import datetime
 from pathlib import Path
-from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
 from datasets import load_dataset
 from datasets.utils.metadata import MetadataConfigs
@@ -19,9 +17,15 @@ from huggingface_hub import (
 from lm_eval.utils import (
    eval_logger,
+    get_file_datetime,
+    get_file_task_name,
+    get_results_filenames,
+    get_sample_results_filenames,
    handle_non_serializable,
    hash_string,
    sanitize_list,
+    sanitize_model_name,
+    sanitize_task_name,
 )
@@ -44,6 +48,7 @@ class GeneralConfigTracker:
    model_name_sanitized: str = None
    system_instruction: str = None
    system_instruction_sha: str = None
+    fewshot_as_multiturn: bool = None
    chat_template: str = None
    chat_template_sha: str = None
    start_time: float = None
@@ -76,24 +81,19 @@ class GeneralConfigTracker:
        model_args: str,
        system_instruction: str,
        chat_template: str,
+        fewshot_as_multiturn: bool,
    ) -> None:
        """Logs model parameters and job ID."""
        self.model_source = model_source
        self.model_name = GeneralConfigTracker._get_model_name(model_args)
-        self.model_name_sanitized = re.sub(
+        self.model_name_sanitized = sanitize_model_name(self.model_name)
-            r"[\"<>:/\|\\?\*\[\]]+", "__", self.model_name
-        )
        self.system_instruction = system_instruction
        self.system_instruction_sha = (
            hash_string(system_instruction) if system_instruction else None
        )
        self.chat_template = chat_template
-        self.chat_template_sha = None
+        self.chat_template_sha = hash_string(chat_template) if chat_template else None
-        if chat_template:
+        self.fewshot_as_multiturn = fewshot_as_multiturn
-            if not isinstance(chat_template, str):
-                self.chat_template_sha = hash_string(str(chat_template))
-            else:
-                self.chat_template_sha = hash_string(chat_template)
    def log_end_time(self) -> None:
        """Logs the end time of the evaluation and calculates the total evaluation time."""
@@ -210,17 +210,21 @@ class EvaluationTracker:
                file_results_aggregated.open("w", encoding="utf-8").write(dumped)
                if self.api and self.push_results_to_hub:
-                    repo_id = "open-llm-leaderboard/results_v2"
+                    repo_id = (
+                        self.hub_results_repo
+                        if self.public_repo
+                        else self.hub_results_repo_private
+                    )
                    self.api.create_repo(
                        repo_id=repo_id,
                        repo_type="dataset",
                        private=not self.public_repo,
                        exist_ok=True,
                    )
-                    self.api.upload_file(
+                    self.api.upload_folder(
                        repo_id=repo_id,
-                        path_or_fileobj=str(path.joinpath(f"results_{self.date_id}.json")),
+                        folder_path=str(path),
-                        path_in_repo=os.path.join(self.general_config_tracker.model_name, f"results_{self.date_id}.json"),
+                        path_in_repo=self.general_config_tracker.model_name_sanitized,
                        repo_type="dataset",
                        commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
                    )
@@ -258,7 +262,7 @@ class EvaluationTracker:
                path.mkdir(parents=True, exist_ok=True)
                file_results_samples = path.joinpath(
-                    f"samples_{task_name}_{self.date_id}.json"
+                    f"samples_{task_name}_{self.date_id}.jsonl"
                )
                for sample in samples:
@@ -274,7 +278,6 @@ class EvaluationTracker:
                    sample["resps"] = sanitize_list(sample["resps"])
                    sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
                    sample["arguments"] = arguments
-                    sample["target"] = str(sample["target"])
                    sample_dump = (
                        json.dumps(
@@ -300,13 +303,6 @@ class EvaluationTracker:
                        private=not self.public_repo,
                        exist_ok=True,
                    )
-                    headers = build_hf_headers()
-                    r = get_session().put(
-                        url=f"https://huggingface.co/api/datasets/{repo_id}/settings",
-                        headers=headers,
-                        json={"gated": "auto"},
-                    )
-                    hf_raise_for_status(r)
                    self.api.upload_folder(
                        repo_id=repo_id,
                        folder_path=str(path),
@@ -330,23 +326,14 @@ class EvaluationTracker:
        Creates a metadata card for the evaluation results dataset and pushes it to the Hugging Face hub.
        """
-        def get_file_task_name(filename: str) -> str:
-            return filename[filename.find("_") + 1 : filename.rfind("_")]
-        def get_file_datetime(filename: str) -> str:
-            return filename[filename.rfind("_") + 1 :].replace(".json", "")
-        def sanitize_task_name(task_name: str) -> str:
-            return re.sub(r"\W", "_", task_name)
        eval_logger.info("Recreating metadata card")
        repo_id = (
            self.hub_results_repo if self.public_repo else self.hub_results_repo_private
        )
        files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
-        results_files = [f for f in files_in_repo if "/results_" in f and ".json" in f]
+        results_files = get_results_filenames(files_in_repo)
-        sample_files = [f for f in files_in_repo if "/samples_" in f and ".json" in f]
+        sample_files = get_sample_results_filenames(files_in_repo)
        # Build a dictionary to store the latest evaluation datetime for:
        # - Each tested model and its aggregated results
@@ -373,10 +360,7 @@ class EvaluationTracker:
                results_datetime,
            )
            latest_task_results_datetime[samples_key] = latest_datetime
-            latest_task_results_datetime[results_key] = max(
+            latest_task_results_datetime[results_key] = latest_datetime
-                latest_task_results_datetime[results_key],
-                latest_datetime,
-            )
        # Create metadata card
        card_metadata = MetadataConfigs()
@@ -393,15 +377,14 @@ class EvaluationTracker:
            sanitized_last_eval_date_results = re.sub(
                r"[^\w\.]", "_", latest_task_results_datetime[config_name]
            )
+            # Ensure that all results files are listed in the metadata card
+            current_results = card_metadata.get(config_name, {"data_files": []})
+            current_results["data_files"].append(
+                {"split": eval_date_sanitized, "path": [str(results_filename)]}
+            )
+            card_metadata[config_name] = current_results
+            # If the results file is the newest, update the "latest" field in the metadata card
            if eval_date_sanitized == sanitized_last_eval_date_results:
-                # Ensure that all results files are listed in the metadata card
-                current_results = card_metadata.get(config_name, {"data_files": []})
-                current_results["data_files"].append(
-                    {"split": eval_date_sanitized, "path": [str(results_filename)]}
-                )
-                card_metadata[config_name] = current_results
-                # If the results file is the newest, update the "latest" field in the metadata card
                card_metadata[config_name]["data_files"].append(
                    {"split": "latest", "path": [str(results_filename)]}
                )
@@ -420,65 +403,64 @@ class EvaluationTracker:
            sanitized_last_eval_date_results = re.sub(
                r"[^\w\.]", "_", latest_task_results_datetime[config_name]
            )
+            # Ensure that all sample results files are listed in the metadata card
+            current_details_for_task = card_metadata.get(
+                config_name, {"data_files": []}
+            )
+            current_details_for_task["data_files"].append(
+                {"split": eval_date_sanitized, "path": [str(results_filename)]}
+            )
+            card_metadata[config_name] = current_details_for_task
+            # If the samples results file is the newest, update the "latest" field in the metadata card
            if eval_date_sanitized == sanitized_last_eval_date_results:
-                print(f"adding {config_name} for {eval_date_sanitized}")
-                # Ensure that all sample results files are listed in the metadata card
-                current_details_for_task = card_metadata.get(
-                    config_name, {"data_files": []}
-                )
-                current_details_for_task["data_files"].append(
-                    {"split": eval_date_sanitized, "path": [str(results_filename)]}
-                )
-                card_metadata[config_name] = current_details_for_task
-                # If the samples results file is the newest, update the "latest" field in the metadata card
                card_metadata[config_name]["data_files"].append(
                    {"split": "latest", "path": [str(results_filename)]}
                )
-                # Special case for MMLU with a single split covering it all
+            # Special case for MMLU with a single split covering it all
-                # We add another config with all MMLU splits results together for easy inspection
+            # We add another config with all MMLU splits results together for easy inspection
-                SPECIAL_TASKS = ["leaderboard_gpqa", "leaderboard_math", "leaderboard_bbh", "leaderboard_musr"]
+            SPECIAL_TASKS = ["mmlu", "gpqa", "minerva_math"]
-                for special_task in SPECIAL_TASKS:
+            for special_task in SPECIAL_TASKS:
-                    if special_task in config_name:
+                if special_task in config_name:
-                        special_task = f"{model_name}__{special_task}"
+                    special_task = f"{model_name}__{special_task}"
-                        former_entry = card_metadata.get(special_task, {"data_files": []})
+                    former_entry = card_metadata.get(special_task, {"data_files": []})
+                    former_split = [
+                        (i, entry)
+                        for i, entry in enumerate(former_entry["data_files"])
+                        if entry.get("split", None) == eval_date_sanitized
+                    ]
+                    if len(former_split) == 0:
+                        former_entry["data_files"].append(
+                            {
+                                "split": eval_date_sanitized,
+                                "path": [str(results_filename)],
+                            }
+                        )
+                    else:
+                        split_index, _ = former_split[0]
+                        former_entry["data_files"][split_index]["path"].append(
+                            str(results_filename)
+                        )
-                        former_split = [
+                    if eval_date_sanitized == sanitized_last_eval_date_results:
+                        latest_split = [
                            (i, entry)
                            for i, entry in enumerate(former_entry["data_files"])
-                            if entry.get("split", None) == eval_date_sanitized
+                            if entry.get("split", None) == "latest"
                        ]
+                        if len(latest_split) == 0:
-                        if len(former_split) == 0:
                            former_entry["data_files"].append(
-                                {
+                                {"split": "latest", "path": [str(results_filename)]}
-                                    "split": eval_date_sanitized,
-                                    "path": [str(results_filename)],
-                                }
                            )
                        else:
-                            split_index, _ = former_split[0]
+                            latest_index, _ = latest_split[0]
-                            former_entry["data_files"][split_index]["path"].append(
+                            former_entry["data_files"][latest_index]["path"].append(
                                str(results_filename)
                            )
-                        if eval_date_sanitized == sanitized_last_eval_date_results:
+                    card_metadata[special_task] = former_entry
-                            latest_split = [
-                                (i, entry)
-                                for i, entry in enumerate(former_entry["data_files"])
-                                if entry.get("split", None) == "latest"
-                            ]
-                            if len(latest_split) == 0:
-                                former_entry["data_files"].append(
-                                    {"split": "latest", "path": [str(results_filename)]}
-                                )
-                            else:
-                                latest_index, _ = latest_split[0]
-                                former_entry["data_files"][latest_index]["path"].append(
-                                    str(results_filename)
-                                )
-                        card_metadata[special_task] = former_entry
        # Get latest results and extract info to update metadata card examples
        latest_datetime = max(latest_task_results_datetime.values())

--- a/lm_eval/loggers/utils.py
+++ b/lm_eval/loggers/utils.py
@@ -110,3 +110,20 @@ def add_env_info(storage: Dict[str, Any]):
        "upper_git_hash": upper_dir_commit,  # in case this repo is submodule
    }
    storage.update(added_info)
+def add_tokenizer_info(storage: Dict[str, Any], lm):
+    """Add tokenizer metadata read from `lm` to `storage`, in place.
+
+    Args:
+        storage: Dict that receives the tokenizer entries via `update()`.
+        lm: Model object. Its `tokenizer`, `eot_token_id` and `max_length`
+            attributes are read when present.
+
+    If `lm` has no truthy `tokenizer` attribute, `storage` is left untouched
+    and a debug message is logged instead.
+    """
+    if getattr(lm, "tokenizer", False):
+        tokenizer_info = {
+            # each special token is stored as a [token, token id] pair
+            "tokenizer_pad_token": [lm.tokenizer.pad_token, lm.tokenizer.pad_token_id],
+            "tokenizer_eos_token": [lm.tokenizer.eos_token, lm.tokenizer.eos_token_id],
+            "tokenizer_bos_token": [lm.tokenizer.bos_token, lm.tokenizer.bos_token_id],
+            # these two are optional on `lm`; a missing attribute is recorded as None
+            "eot_token_id": getattr(lm, "eot_token_id", None),
+            "max_length": getattr(lm, "max_length", None),
+        }
+        storage.update(tokenizer_info)
+    # seems gguf and textsynth do not have tokenizer
+    else:
+        logger.debug(
+            "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."
+        )
--- a/lm_eval/models/anthropic_llms.py
+++ b/lm_eval/models/anthropic_llms.py
@@ -307,7 +307,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
        # defaults to os.environ.get("ANTHROPIC_API_KEY")
        self.client = anthropic.Anthropic()
        self.temperature = temperature
-        self.max_token = max_tokens
+        self.max_tokens = max_tokens
        self.tokenizer = self.client.get_tokenizer()
        self.kwargs = kwargs

--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -3,7 +3,6 @@ import os
 from datetime import timedelta
 from pathlib import Path
 from typing import Dict, List, Literal, Optional, Tuple, Union
-import jinja2
 import torch
 import torch.nn.functional as F
@@ -14,7 +13,6 @@ from accelerate import (
    InitProcessGroupKwargs,
    find_executable_batch_size,
 )
-from accelerate.utils import get_max_memory
 from huggingface_hub import HfApi
 from packaging import version
 from peft import PeftModel
@@ -41,6 +39,31 @@ from lm_eval.models.utils import (
 eval_logger = utils.eval_logger
+def _get_accelerate_args(
+    device_map_option: Optional[str] = "auto",
+    max_memory_per_gpu: Optional[Union[int, str]] = None,
+    max_cpu_memory: Optional[Union[int, str]] = None,
+    offload_folder: Optional[str] = "./offload",
+    gpus: Optional[int] = None,
+) -> dict:
+    """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`.
+
+    Args:
+        device_map_option: Value stored under the `device_map` key.
+        max_memory_per_gpu: If given, this limit is assigned to every device
+            index in `range(gpus)`.
+        max_cpu_memory: If given, stored under the `"cpu"` key of `max_memory`.
+        offload_folder: Value stored under the `offload_folder` key.
+        gpus: Number of device indices that receive `max_memory_per_gpu`.
+
+    Returns:
+        A dict that always contains `device_map` and `offload_folder`, and
+        contains `max_memory` only when at least one memory limit was given.
+    """
+    max_memory = {}
+    if max_memory_per_gpu is not None:
+        # NOTE(review): `gpus` defaults to None and `range(None)` raises
+        # TypeError, so a caller that sets `max_memory_per_gpu` must also
+        # pass an integer `gpus`.
+        max_memory_per_gpu_map = {
+            device_idx: max_memory_per_gpu for device_idx in range(gpus)
+        }
+        max_memory.update(max_memory_per_gpu_map)
+    if max_cpu_memory is not None:
+        max_memory["cpu"] = max_cpu_memory
+    args = {}
+    # leave `max_memory` out entirely when no limit was requested
+    if max_memory:
+        args["max_memory"] = max_memory
+    args["device_map"] = device_map_option
+    args["offload_folder"] = offload_folder
+    return args
 @register_model("hf-auto", "hf", "huggingface")
 class HFLM(TemplateLM):
    """
@@ -81,6 +104,7 @@ class HFLM(TemplateLM):
        # arguments used for splitting a model across GPUs naively.
        # only used if `parallelize=True`.
        parallelize: Optional[bool] = False,
+        device_map_option: Optional[str] = "auto",
        max_memory_per_gpu: Optional[Union[int, str]] = None,
        max_cpu_memory: Optional[Union[int, str]] = None,
        offload_folder: Optional[Union[str, os.PathLike]] = "./offload",
@@ -103,6 +127,21 @@ class HFLM(TemplateLM):
            self._config = self._model.config
            gpus = 0
+            if tokenizer:
+                assert isinstance(
+                    tokenizer, transformers.PreTrainedTokenizer
+                ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
+                self.tokenizer = tokenizer
+            else:
+                # Get tokenizer
+                model_name = self._model.name_or_path
+                self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                    model_name,
+                    revision=revision,
+                    trust_remote_code=trust_remote_code,
+                    use_fast=use_fast_tokenizer,
+                )
        else:
            assert isinstance(device, str)
            assert isinstance(pretrained, str)
@@ -111,7 +150,8 @@ class HFLM(TemplateLM):
            gpus = torch.cuda.device_count()
            accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
            accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
-            self.accelerator = accelerator
+            if accelerator.num_processes > 1:
+                self.accelerator = accelerator
            if "npu" in accelerator.device.type:
                gpus = torch.npu.device_count()
@@ -141,13 +181,13 @@ class HFLM(TemplateLM):
                        if torch.cuda.is_available()
                        else torch.device("cpu")
                    )
-            else: # Parallelism managed by accelerate
+            else:
                if device != "cuda":
                    eval_logger.info(
                        f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
                    )
                # TODO: include in warning that `load_in_8bit` etc. affect this too
-                self._device = self.accelerator.device if self.accelerator is not None else torch.device(device)
+                self._device = torch.device(device)
            # TODO: update this to be less of a hack once subfolder is fixed in HF
            revision = revision + ("/" + subfolder if subfolder is not None else "")
@@ -155,7 +195,7 @@ class HFLM(TemplateLM):
            self._get_config(
                pretrained,
                revision=revision,
-                trust_remote_code=trust_remote_code, 
+                trust_remote_code=trust_remote_code,
            )
        # determine which of 'causal' and 'seq2seq' backends to use
@@ -181,6 +221,7 @@ class HFLM(TemplateLM):
                trust_remote_code=trust_remote_code,
                parallelize=parallelize,
                gpus=gpus,
+                device_map_option=device_map_option,
                max_memory_per_gpu=max_memory_per_gpu,
                max_cpu_memory=max_cpu_memory,
                offload_folder=offload_folder,
@@ -195,6 +236,19 @@ class HFLM(TemplateLM):
            self.model.eval()
            self.model.tie_weights()
+        if isinstance(pretrained, str) and (gpus >= 1 or str(self.device) == "mps"):
+            # TODO: can remove this whole snippet except in the mps case, perhaps?
+            if not (parallelize or autogptq or hasattr(self, "accelerator")):
+                # place model onto device requested manually,
+                # if not using HF Accelerate or device_map
+                # or any other option that preloads model onto device
+                try:
+                    self.model.to(self.device)
+                except ValueError:
+                    eval_logger.debug(
+                        "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
+                    )
        self.truncation = truncation
        self.logits_cache = logits_cache
        self.vocab_size = self.tokenizer.vocab_size
@@ -247,24 +301,15 @@ class HFLM(TemplateLM):
            self.batch_size_per_gpu = int(batch_size)
        if isinstance(pretrained, str):
-            if (gpus >= 1 or str(self.device) == "mps"):
-                # TODO: can remove this whole snippet except in the mps case, perhaps?
-                if not (parallelize or autogptq or hasattr(self, "accelerator")):
-                    # place model onto device requested manually,
-                    # if not using HF Accelerate or device_map
-                    # or any other option that preloads model onto device
-                    try:
-                        self.model.to(self.device)
-                    except ValueError:
-                        eval_logger.debug(
-                            "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
-                        )
            # multigpu data-parallel support when launched with accelerate
            if gpus > 1:
-                if parallelize and accelerator.num_processes > 1:
+                if parallelize:
-                    eval_logger.warning(
+                    if accelerator.num_processes > 1:
-                        "You are both using a HF Accelerate `device_map` and launching via `accelerate launch`. This will attempt to do model and data parallelism depending on the resources available."
+                        raise RuntimeError(
-                    )
+                            "Attempted to use both a HF Accelerate `device_map` and to launch via `accelerate launch`. If this is the case, please either remove `parallelize=True` from --model_args or launch outside of the Accelerate launcher."
+                        )
+                    else:
+                        pass
                elif accelerator.num_processes == 1:
                    # if we aren't launching via accelerate, ditch
                    self._rank = 0
@@ -313,77 +358,6 @@ class HFLM(TemplateLM):
                f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
            )
-    def _get_accelerate_args(
-        self,
-        parallelize: bool = None,
-        device_map: Optional[str] = "auto",
-        max_memory_per_gpu: Optional[Union[int, str]] = None,
-        max_cpu_memory: Optional[Union[int, str]] = None,
-        offload_folder: Optional[str] = "./offload",
-        gpus: Optional[int] = None,
-    ) -> dict:
-        """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
-        num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
-        num_machines = int(os.environ.get("WORLD_SIZE", 0)) // num_local_processes
-        if num_machines == 0:
-            eval_logger.info("We are not in a distributed setting. Setting model_parallel to False.")
-            parallelize = False
-        if parallelize is None: 
-            # If parallelism is unset by the user, we automatically assign model parallelism
-            # if enough extra GPUs are available
-            max_memory_all_gpus = get_max_memory()
-            # We just want gpu, not cpu, max memory
-            if "cpu" in max_memory_all_gpus:
-                del max_memory_all_gpus["cpu"]
-            model_parallel = bool(num_local_processes < len(max_memory_all_gpus))
-            eval_logger.info(
-                f"Setting model parallel to {model_parallel} since "
-                f"the number of local processes is {num_local_processes} "
-                f"and the number of GPUs is {len(max_memory_all_gpus)}"
-            )
-        args = {}
-        if parallelize: # Model parallelism will be used
-            max_memory = {}
-            if max_memory_per_gpu is not None: # Using the provided memory requirements
-                max_memory_per_gpu_map = {device_idx: max_memory_per_gpu for device_idx in range(gpus)}
-            else: # Estimating the possible memory requirements
-                max_memory_all_gpus = get_max_memory() 
-                if "cpu" in max_memory_all_gpus:
-                    del max_memory_all_gpus["cpu"]
-                max_memory_per_gpu_map = {
-                    k: v
-                    for k, v in max_memory_all_gpus.items()
-                    if k % num_local_processes == (self.accelerator.process_index % num_local_processes)
-                }
-            args["max_memory"] = max_memory_per_gpu_map
-            args["device_map"] = "auto"
-            eval_logger.info(
-                f"Model parallel was set to True, setting max memory per GPU to {max_memory_per_gpu_map} and device map to 'auto'"
-            )
-            if max_cpu_memory is not None:
-                max_memory["cpu"] = max_cpu_memory
-            args["offload_folder"] = offload_folder
-        elif device_map is None: # No model parallelism, we use the default provided device for our model
-            if hasattr(self, "accelerator"):
-                device_map = {"": f"{self.accelerator.device}"}
-            else:
-                device_map = {"": str(self.device)}
-            args["max_memory"] = None
-            args["device_map"] = device_map
-            eval_logger.info(
-                f"Model parallel was set to False, max memory was not set, and device map was set to {device_map}"
-            )
-        else:
-            args["max_memory"] = None
-            args["device_map"] = None
-            eval_logger.info("Model parallel was set to False.")
-        return args
    @property
    def config(self):
        # return the associated transformers.AutoConfig for the given pretrained model.
@@ -513,13 +487,11 @@ class HFLM(TemplateLM):
        revision: str = "main",
        trust_remote_code: bool = False,
    ) -> None:
-        with self.accelerator.main_process_first():
+        self._config = transformers.AutoConfig.from_pretrained(
-            self._config = transformers.AutoConfig.from_pretrained(
+            pretrained,
-                pretrained,
+            revision=revision,
-                revision=revision,
+            trust_remote_code=trust_remote_code,
-                trust_remote_code=trust_remote_code,
+        )
-                force_download=False,
-            )
    def _create_model(
        self,
@@ -532,6 +504,7 @@ class HFLM(TemplateLM):
        # (accelerate naive PP (device_map) options)
        parallelize: Optional[bool] = False,
        gpus: Optional[int] = None,
+        device_map_option: Optional[str] = "auto",
        max_memory_per_gpu: Optional[Union[int, str]] = None,
        max_cpu_memory: Optional[Union[int, str]] = None,
        offload_folder: Optional[str] = "./offload",
@@ -555,16 +528,25 @@ class HFLM(TemplateLM):
        model_kwargs = kwargs if kwargs else {}
-        model_kwargs.update(
+        if parallelize:
-            self._get_accelerate_args(
+            model_kwargs.update(
-                parallelize=parallelize,
+                _get_accelerate_args(
-                device_map=kwargs.get("device_map", None),
+                    device_map_option,  # TODO: phase out device_map_option?
-                max_memory_per_gpu=max_memory_per_gpu,
+                    max_memory_per_gpu,
-                max_cpu_memory=max_cpu_memory,
+                    max_cpu_memory,
-                offload_folder=offload_folder,
+                    offload_folder,
-                gpus=gpus,
+                    gpus,
+                )
            )
-        )
+        elif "device_map" not in model_kwargs:
+            # set a device_map to initialize model on the right GPU.
+            # this is needed because it seems that the default behavior
+            # for quantized models now seems to be device_map="auto"
+            # which breaks data-parallel mode.
+            if hasattr(self, "accelerator"):
+                model_kwargs.update({"device_map": {"": f"{self.accelerator.device}"}})
+            else:
+                model_kwargs.update({"device_map": {"": str(self.device)}})
        if not autogptq:
            if model_kwargs.get("load_in_4bit", None):
@@ -577,17 +559,13 @@ class HFLM(TemplateLM):
                        model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(
                            model_kwargs["bnb_4bit_compute_dtype"]
                        )
+            self._model = self.AUTO_MODEL_CLASS.from_pretrained(
-            with self.accelerator.main_process_first():
+                pretrained,
-                #model_kwargs["device_map"] = "balanced_low_0"
+                revision=revision,
-                self._model = self.AUTO_MODEL_CLASS.from_pretrained(
+                torch_dtype=get_dtype(dtype),
-                    pretrained,
+                trust_remote_code=trust_remote_code,
-                    revision=revision,
+                **model_kwargs,
-                    torch_dtype=get_dtype(dtype),
+            )
-                    trust_remote_code=trust_remote_code,
-                    force_download=False,
-                    **model_kwargs,
-                )
        else:
            try:
                from auto_gptq import AutoGPTQForCausalLM
@@ -679,7 +657,6 @@ class HFLM(TemplateLM):
                    revision=revision,
                    trust_remote_code=trust_remote_code,
                    use_fast=use_fast_tokenizer,
-                    force_download=False
                )
            else:
                assert isinstance(
@@ -701,57 +678,43 @@ class HFLM(TemplateLM):
            )
        return None
-    def _detect_batch_size(self, requests=None, pos: int = 0) -> int:
+    def _detect_batch_size(self, requests=None, pos: int = 0):
-        if len(requests[0]) == 3: # logprob evals
+        if requests:
            _, context_enc, continuation_enc = requests[pos]
            max_length = len(
                (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1]
            )
            max_context_enc = len(context_enc[-(self.max_length + 1) :])
            max_cont_enc = len(continuation_enc[-(self.max_length + 1) :])
-            security_margin_factor = 6 # batch sizes for log prob evals sometimes generate OOMs
+        else:
-        elif len(requests[0]) == 2: # generative evals
+            max_length = self.max_length
-            # using rolling window with maximum context
-            longest_context = max([len(self.tok_encode(request[0])) + request[1].get("max_gen_toks", self.max_length) for request in requests[pos:]])
-            if longest_context > self.max_length:
-                eval_logger.warning(
-                    f"Longest context length of {longest_context} exceeds max_length of {self.max_length}. Truncating to max_length."
-                )
-                longest_context = self.max_length
-            max_length = longest_context
            max_context_enc = max_length
            max_cont_enc = max_length
-            security_margin_factor = 6
        # if OOM, then halves batch_size and tries again
        @find_executable_batch_size(starting_batch_size=self.max_batch_size)
        def forward_batch(batch_size):
-            security_margin = int(0.05 * security_margin_factor * batch_size)
            if self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
                length = max(max_context_enc, max_cont_enc)
                batched_conts = torch.ones(
-                    (batch_size + security_margin, length), device=self.device
+                    (batch_size, length), device=self.device
                ).long()
-                test_batch = torch.ones((batch_size + security_margin, length), device=self.device).long()
+                test_batch = torch.ones((batch_size, length), device=self.device).long()
                call_kwargs = {
                    "attn_mask": test_batch,
                    "labels": batched_conts,
                }
            else:
                call_kwargs = {}
-                test_batch = torch.rand(
+                test_batch = torch.ones(
-                    (batch_size + security_margin, max_length), device=self.device
+                    (batch_size, max_length), device=self.device
                ).long()
+            for _ in range(5):
-            for _ in range(5*security_margin_factor):
+                out = F.log_softmax(self._model_call(test_batch, **call_kwargs), dim=-1)  # noqa: F841
-                logits = self._model_call(inps=test_batch, **call_kwargs).float()
-                scores = F.log_softmax(logits, dim=-1)  # noqa: F841
            return batch_size
        try:
-            print(f"finding batch size on process {self.accelerator.local_process_index}")
            batch_size = forward_batch()
        except RuntimeError as e:
            if "No executable batch size found" in str(e):
@@ -762,7 +725,6 @@ class HFLM(TemplateLM):
        if self.world_size > 1:
            # if multi-GPU, always take minimum over all selected batch sizes
            max_rnk_bs = torch.tensor([batch_size], device=self.device)
-            print(f"gathering on process {self.accelerator.local_process_index}")
            gathered = (
                self.accelerator.gather(max_rnk_bs).cpu().detach().numpy().tolist()
            )
@@ -983,10 +945,6 @@ class HFLM(TemplateLM):
        print(f"Determined largest batch size: {self.batch_sizes[sched]}")
        return self.batch_sizes[sched]
-    def _reset_batch_scheduler(self):
-        """When we change group in generative evaluations, we reset the batch size"""
-        self.batch_sizes = {}
    def _loglikelihood_tokens(
        self,
        requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
@@ -1044,7 +1002,7 @@ class HFLM(TemplateLM):
            else None
        )
-        chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn, accelerator=self.accelerator)
+        chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn)
        pbar = tqdm(
            total=len(requests),
            disable=(disable_tqdm or (self.rank != 0)),
@@ -1064,8 +1022,6 @@ class HFLM(TemplateLM):
            # tensors, then we pack them together into a batch, call the model, and then pick it all apart
            # again because vectorizing is annoying
-            from pprint import pprint
            for _, context_enc, continuation_enc in chunk:
                # sanity check
                assert len(context_enc) > 0
@@ -1148,7 +1104,7 @@ class HFLM(TemplateLM):
                }
            multi_logits = F.log_softmax(
-                self._model_call(batched_inps, **call_kwargs), dim=-1, dtype=torch.float16
+                self._model_call(batched_inps, **call_kwargs), dim=-1
            )  # [batch, padding_length (inp or cont), vocab]
            for (request_str, ctx_tokens, _), logits, inplen, cont_toks in zip(
@@ -1210,8 +1166,6 @@ class HFLM(TemplateLM):
    ) -> List[str]:
        res = []
-        self.accelerator.wait_for_everyone()
        def _collate(req: Tuple[str, dict]):
            """Defines the key for the sorted method"""
            # the negative sign on len(toks) sorts descending - this has a few advantages:
@@ -1228,14 +1182,24 @@ class HFLM(TemplateLM):
            disable=(disable_tqdm or (self.rank != 0)),
            desc="Running generate_until requests",
        )
+        adaptive_batch_size = None
+        if self.batch_size == "auto":
+            # using rolling window with maximum context
+            print("Passed argument batch_size = auto. Detecting largest batch size")
+            batch_size = self._detect_batch_size()
+            print(f"Determined Largest batch size: {batch_size}")
+            adaptive_batch_size = batch_size
+        # for each different set of kwargs, we execute all requests, by batch.
        batch_size = (
            self.batch_size
            if self.batch_size != "auto"
+            else adaptive_batch_size
+            if adaptive_batch_size is not None
            else 0
        )
        batch_fn = (
            self._batch_scheduler
-            if self.batch_size == "auto" #  and not adaptive_batch_size
+            if self.batch_size == "auto" and not adaptive_batch_size
            else None
        )
@@ -1249,7 +1213,7 @@ class HFLM(TemplateLM):
            group_by="gen_kwargs",
            group_fn=lambda x: x[1],
        )
-        chunks = re_ords.get_batched(n=batch_size, batch_fn=batch_fn, reset_batch_fn=self._reset_batch_scheduler)
+        chunks = re_ords.get_batched(n=batch_size, batch_fn=batch_fn)
        for chunk in chunks:
            contexts, all_gen_kwargs = zip(*chunk)
            # we assume all gen kwargs in the batch are the same
@@ -1279,8 +1243,6 @@ class HFLM(TemplateLM):
                until.append(eos)
            if "max_gen_toks" in kwargs.keys():
                max_gen_toks = kwargs.pop("max_gen_toks")
-                if max_gen_toks > self.max_length:
-                    max_gen_toks = self.max_gen_toks
            else:
                max_gen_toks = self.max_gen_toks
@@ -1288,9 +1250,6 @@ class HFLM(TemplateLM):
            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
                # max len for inputs = max length, minus room to generate the max new tokens
                max_ctx_len = self.max_length - max_gen_toks
-                while max_ctx_len <= 0:
-                    max_gen_toks = max_gen_toks // 2
-                    max_ctx_len = self.max_length - max_gen_toks
            elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
                # max len for inputs = encoder's whole max_length
                max_ctx_len = self.max_length
@@ -1345,21 +1304,9 @@ class HFLM(TemplateLM):
        """
        Method to apply a chat template to a list of chat history between user and model.
        """
-        try:
+        return self.tokenizer.apply_chat_template(
-            chat_templated = self.tokenizer.apply_chat_template(
+            chat_history, tokenize=False, add_generation_prompt=True
-                chat_history, tokenize=False, add_generation_prompt=True
+        )
-            )
-        except jinja2.exceptions.TemplateError:
-            eval_logger.warning(
-                "Failed to apply chat template. removing the system role in chat history."
-            )
-            chat_history = [msg for msg in chat_history if msg["role"] != "system"]
-            chat_templated = self.tokenizer.apply_chat_template(
-                chat_history, tokenize=False, add_generation_prompt=True
-            )
-        return chat_templated
    def get_model_info(self) -> dict:
        """

--- a/lm_eval/models/neuron_optimum.py
+++ b/lm_eval/models/neuron_optimum.py
@@ -288,7 +288,7 @@ class NEURON_HF(TemplateLM):
        self.vocab_size = self.tokenizer.vocab_size
        self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
-        self.add_bos_token = self.add_bos_token
+        self.add_bos_token = add_bos_token
        self._max_length = max_length

--- a/lm_eval/models/textsynth.py
+++ b/lm_eval/models/textsynth.py
-""" TextSynth API
+"""TextSynth API
 Implementation provided by Fabrice Bellard:
    https://github.com/EleutherAI/lm-evaluation-harness/issues/295
@@ -11,6 +11,7 @@ Example usage:
 Homepage: https://textsynth.com/index.html
 """
 import logging
 import os

--- a/lm_eval/models/utils.py
+++ b/lm_eval/models/utils.py
@@ -389,7 +389,7 @@ class Collator:
            self._arr_with_indices, fn=self._group_fn, group_by="contexts"
        )
-    def get_batched(self, n: int = 1, batch_fn: Optional[Callable] = None, reset_batch_fn: Optional[Callable] = None, accelerator=None) -> Iterator:
+    def get_batched(self, n: int = 1, batch_fn: Optional[Callable] = None) -> Iterator:
        """
        Generates and yields batches from the reordered array. The method of grouping and batching
        depends on the parameter `group_by`.
@@ -402,8 +402,6 @@ class Collator:
        - n (int): The size of each batch. Defaults to 1.
        - batch_fn ([Callable[[int, Iterable], int]] | None): A function to determine the size of
          each batch. Optional, defaults to None.
-        - reset_batch_fn ([Callable[[int, Iterable], int]] | None): A function to reset the scheduler of 
-          the batch_fn, if present, when we change group in generative mode.
        Returns:
        Iterator: An iterator over batches of reordered elements grouped as per the `group_by`
@@ -413,9 +411,10 @@ class Collator:
        List of batched elements according to the `group_by` attribute.
        """
        if self._group_by == "gen_kwargs":
-            for key, values in self._arr_with_indices.items():  # type: ignore
+            for (
-                if reset_batch_fn is not None: # with each group change, we must recompute the batch size, so we restart the scheduler
+                key,
-                    reset_batch_fn()
+                values,
+            ) in self._arr_with_indices.items():  # type: ignore
                values = self._reorder(values)
                batch = self.get_chunks(values, n=n, fn=batch_fn)
                yield from batch

--- a/lm_eval/models/vllm_causallms.py
+++ b/lm_eval/models/vllm_causallms.py
@@ -119,6 +119,12 @@ class VLLM(TemplateLM):
            tokenizer_revision=tokenizer_revision,
        )
        self.add_bos_token = add_bos_token
+        if "gemma" in pretrained.lower():
+            self.add_bos_token = True
+            eval_logger.info(
+                "Found 'gemma' in model name, a BOS token will be used as Gemma underperforms without it."
+            )
        self.custom_prefix_token_id = prefix_token_id
        if prefix_token_id is not None:
            eval_logger.info(
@@ -493,7 +499,10 @@ class VLLM(TemplateLM):
    def modify_gen_kwargs(kwargs: dict) -> dict:
        # sampling_params
        do_sample = kwargs.pop("do_sample", None)
-        if do_sample is False or "temperature" not in kwargs:
+        if do_sample is False and "temperature" not in kwargs:
+            eval_logger.debug(
+                "Got `do_sample=False` and no temperature value, setting VLLM temperature to 0.0 ..."
+            )
            kwargs["temperature"] = 0.0
        # hf defaults
        kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)

--- a/lm_eval/tasks/README.md
+++ b/lm_eval/tasks/README.md
@@ -10,8 +10,8 @@
 | [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
 | [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
 | [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
-| [ammlu](ammlu/README.md) | Arabic version of MMLU. | Arabic |
 | [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
+| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
 | [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions.  | English |
 | [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
 | [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
@@ -20,11 +20,13 @@
 | [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
 | [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
 | benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
+| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
 | [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
 | [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
 | [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
 | [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
 | code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
+| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
 | [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
 | [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
 | [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
@@ -71,6 +73,7 @@
 | okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) |
 | [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) |
 | [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
+| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
 | [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
 | [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
 | [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |

--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -14,27 +14,43 @@ class TaskManager:
    """
-    def __init__(self, verbosity="INFO", include_path: Optional[str] = None) -> None:
+    def __init__(
+        self,
+        verbosity="INFO",
+        include_path: Optional[Union[str, List]] = None,
+        include_defaults: bool = True,
+    ) -> None:
        self.verbosity = verbosity
        self.include_path = include_path
        self.logger = utils.eval_logger
        self.logger.setLevel(getattr(logging, f"{verbosity}"))
-        self._task_index = self.initialize_tasks(include_path=include_path)
+        self._task_index = self.initialize_tasks(
+            include_path=include_path, include_defaults=include_defaults
+        )
        self._all_tasks = sorted(list(self._task_index.keys()))
        self.task_group_map = collections.defaultdict(list)
-    def initialize_tasks(self, include_path: Optional[str] = None):
+    def initialize_tasks(
+        self,
+        include_path: Optional[Union[str, List]] = None,
+        include_defaults: bool = True,
+    ):
        """Creates a dictionary of tasks index.
-        :param include_path: str = None
+        :param include_path: Union[str, List] = None
-            An additional path to be searched for tasks
+            An additional path to be searched for tasks recursively.
+            Can provide more than one such path as a list.
+        :param include_defaults: bool = True
+            If set to false, default tasks (those in lm_eval/tasks/) are not indexed.
        :return
            Dictionary of task names as key and task metadata
        """
-        all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
+        if include_defaults:
+            all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
+        else:
+            all_paths = []
        if include_path is not None:
            if isinstance(include_path, str):
                include_path = [include_path]
@@ -296,8 +312,13 @@ class TaskManager:
        :return
            Dictionary of task names as key and task metadata
        """
+        ignore_dirs = [
+            "__pycache__",
+            ".ipynb_checkpoints",
+        ]
        tasks_and_groups = collections.defaultdict()
-        for root, _, file_list in os.walk(task_dir):
+        for root, dirs, file_list in os.walk(task_dir):
+            dirs[:] = [d for d in dirs if d not in ignore_dirs]
            for f in file_list:
                if f.endswith(".yaml"):
                    yaml_path = os.path.join(root, f)

--- a/lm_eval/tasks/aclue/_generate_configs.py
+++ b/lm_eval/tasks/aclue/_generate_configs.py
 """
 Take in a YAML, and output all other splits with this YAML
 """
 import argparse
 import os

--- a/lm_eval/tasks/bbh/_generate_configs.py
+++ b/lm_eval/tasks/bbh/_generate_configs.py
 """
 Take in a YAML, and output all other splits with this YAML
 """
 import argparse
 import os
 import re

--- a/lm_eval/tasks/belebele/_generate_configs.py
+++ b/lm_eval/tasks/belebele/_generate_configs.py
 """
 Take in a YAML, and output all other splits with this YAML
 """
 import argparse
 import os