Merge branch 'main' into autobatchtest

# Conflicts: # lm_eval/models/huggingface.py

Merge branch 'main' into autobatchtest
# Conflicts: # lm_eval/models/huggingface.py
948f120f · Baber · a5b1c7a8 · bd80a6c0 · 948f120f · 948f120f
Commit 948f120f authored Nov 09, 2024 by Baber
20 changed files
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -56,8 +56,7 @@ class TaskConfig(dict):
    # task naming/registry
    task: Optional[str] = None
    task_alias: Optional[str] = None
-    group: Optional[Union[str, list]] = None
-    group_alias: Optional[Union[str, list]] = None
+    tag: Optional[Union[str, list]] = None
    # HF dataset options.
    # which dataset to use,
    # and what splits for what purpose
@@ -68,13 +67,14 @@ class TaskConfig(dict):
    validation_split: Optional[str] = None
    test_split: Optional[str] = None
    fewshot_split: Optional[str] = (
-        None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
+        None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaluating (?)
    )
    # formatting / prompting options.
    # see docs/advanced_task_guide.md for more info
    process_docs: Optional[Callable] = None
    doc_to_text: Optional[Union[Callable, str]] = None
    doc_to_target: Optional[Union[Callable, str]] = None
+    doc_to_image: Union[Callable, str] = None
    doc_to_choice: Optional[Union[Callable, str, dict, list]] = None
    process_results: Optional[Union[Callable, str]] = None
    use_prompt: Optional[str] = None
@@ -365,6 +365,10 @@ class Task(abc.ABC):
    def doc_to_target(self, doc):
        pass

+    # not an abstractmethod because not every language-only task has to implement this
+    def doc_to_image(self, doc):
+        raise NotImplementedError
+
    def build_all_requests(
        self,
        *,
@@ -723,6 +727,10 @@ class ConfigurableTask(Task):
                )
            self.OUTPUT_TYPE = self.config.output_type

+        if self.config.doc_to_image is not None:
+            # mark the task as requiring multimodality.
+            self.MULTIMODAL = True
+
        if self.config.dataset_path is not None:
            self.DATASET_PATH = self.config.dataset_path

@@ -980,7 +988,7 @@ class ConfigurableTask(Task):
        else:
            if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0):
                eval_logger.warning(
-                    f"Task '{self.config.task}': "
+                    f"[Task: {self.config.task}] "
                    "num_fewshot > 0 but fewshot_split is None. "
                    "using preconfigured rule."
                )
@@ -1030,8 +1038,8 @@ class ConfigurableTask(Task):
            Whether to apply the chat template to the fewshot context.
        :param fewshot_as_multiturn: bool
            Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
-        :param chat_template: Callable
-            Chat template to be applied to the fewshot context.
+        :param chat_template:
+            callable (from lm.apply_chat_template) that takes in a list[Dict] chat transcript and renders it into a string.
        :returns: str
            The fewshot context.
        """
@@ -1159,9 +1167,11 @@ class ConfigurableTask(Task):
        """
        return doc

-    def doc_to_text(self, doc):
+    def doc_to_text(self, doc, doc_to_text=None):
        if self.prompt is not None:
            doc_to_text = self.prompt
+        elif doc_to_text is not None:
+            doc_to_text = doc_to_text
        else:
            doc_to_text = self.config.doc_to_text

@@ -1193,9 +1203,11 @@ class ConfigurableTask(Task):
            print(type(doc_to_text))
            raise TypeError

-    def doc_to_target(self, doc: Mapping) -> Union[int, str, list]:
+    def doc_to_target(self, doc: Mapping, doc_to_target=None) -> Union[int, str, list]:
        if self.prompt is not None:
            doc_to_target = self.prompt
+        elif doc_to_target is not None:
+            doc_to_target = doc_to_target
        else:
            doc_to_target = self.config.doc_to_target

@@ -1237,9 +1249,11 @@ class ConfigurableTask(Task):
        else:
            raise TypeError

-    def doc_to_choice(self, doc: Any) -> List[str]:
+    def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]:
        if self.prompt is not None:
            doc_to_choice = self.prompt
+        elif doc_to_choice is not None:
+            doc_to_choice = doc_to_choice
        elif self.config.doc_to_choice is None:
            eval_logger.error("doc_to_choice was called but not set in config")
        else:
@@ -1261,9 +1275,34 @@ class ConfigurableTask(Task):
        else:
            raise TypeError

+    def doc_to_image(self, doc: Any, doc_to_image=None) -> Union[int, str, list]:
+        if doc_to_image is not None:
+            doc_to_image = doc_to_image
+        elif self.config.doc_to_image is not None:
+            doc_to_image = self.config.doc_to_image
+        else:
+            return None
+
+        if isinstance(doc_to_image, list):
+            image_feature = [
+                self.doc_to_image(doc, feature) for feature in doc_to_image
+            ]
+            return [feature for feature in image_feature if feature is not None]
+        elif isinstance(doc_to_image, str):
+            if doc_to_image in self.features:
+                return doc[doc_to_image]
+            else:
+                return ast.literal_eval(utils.apply_template(doc_to_image, doc))
+        elif callable(doc_to_image):
+            return doc_to_image(doc)
+        else:
+            return None
+
    def construct_requests(
        self, doc: dict, ctx: str, **kwargs
    ) -> Union[List[Instance], Instance]:
+        aux_arguments = None
+
        if self.OUTPUT_TYPE == "loglikelihood":
            arguments = (ctx, self.doc_to_target(doc))
        elif self.OUTPUT_TYPE == "loglikelihood_rolling":
@@ -1281,6 +1320,37 @@ class ConfigurableTask(Task):
                # Otherwise they are placed in the continuation
                arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]

+            # TODO: we should raise a warning telling users this will at most ~2x runtime.
+            if "acc_mutual_info" in self._metric_fn_list.keys():
+                # if we are calculating multiple choice accuracy
+                # using mutual information instead of raw loglikelihood as metric, need unconditional lls.
+
+                # here mutual info refers to calculating
+                # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
+                # in other words normalizing by subtracting the unconditional logprob of each choice.
+                aux_arguments = [("", f"{choice}") for choice in choices]
+
+                arguments.extend(aux_arguments)
+
+        elif self.OUTPUT_TYPE == "generate_until":
+            arguments = (ctx, deepcopy(self.config.generation_kwargs))
+
+        multimodal_arg = {}
+        if (
+            self.config.doc_to_image
+        ):  # TODO: ensure that non-multimodal tasks aren't getting visual args
+            multimodal_arg = {
+                **multimodal_arg,
+                **{"visual": self.doc_to_image(doc)},
+            }
+
+        if bool(multimodal_arg):
+            if isinstance(arguments, list):
+                arguments = [arg + (multimodal_arg,) for arg in arguments]
+            else:
+                arguments = arguments + (multimodal_arg,)
+
+        if self.OUTPUT_TYPE == "multiple_choice":
            request_list = [
                Instance(
                    request_type="loglikelihood",
@@ -1291,33 +1361,15 @@ class ConfigurableTask(Task):
                )
                for i, arg in enumerate(arguments)
            ]
-            # TODO: we should raise a warning telling users this will at most ~2x runtime.
-            if "acc_mutual_info" in self._metric_fn_list.keys():
-                # if we are calculating multiple choice accuracy
-                # using mutual information instead of raw loglikelihood as metric, need unconditional lls.

-                # here mutual info refers to calculating
-                # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
-                # in other words normalizing by subtracting the unconditional logprob of each choice.
-                request_list.extend(
-                    [
-                        Instance(
-                            request_type="loglikelihood",
-                            doc=doc,
-                            arguments=("", "{}".format(choice)),
-                            idx=i,
-                            **kwargs,
-                        )
-                        for i, choice in enumerate(choices)
-                    ]
-                )
            return request_list

-        elif self.OUTPUT_TYPE == "generate_until":
-            arguments = (ctx, deepcopy(self.config.generation_kwargs))
-
        return Instance(
-            request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs
+            request_type=self.OUTPUT_TYPE,
+            doc=doc,
+            arguments=arguments,
+            idx=0,
+            **kwargs,
        )

    def process_results(self, doc, results):
@@ -1446,7 +1498,7 @@ class ConfigurableTask(Task):
            # we expect multiple_targets to be a list.
            elif self.multiple_target:
                gold = list(gold)
-            elif type(gold) != type(result):
+            elif type(gold) is not type(result):
                # cast gold to the same type as result
                gold = type(result)(gold)

@@ -1520,10 +1572,13 @@ class ConfigurableTask(Task):
    def get_config(self, key: str) -> Any:
        return getattr(self._config, key, None)

+    @property
+    def task_name(self) -> Any:
+        return getattr(self.config, "task", None)
+
    def __repr__(self):
        return (
            f"ConfigurableTask(task_name={getattr(self.config, 'task', None)},"
-            f"group_name={getattr(self.config, 'group', None)},"
            f"output_type={self.OUTPUT_TYPE},"
            f"num_fewshot={getattr(self.config, 'num_fewshot', None)},"
            f"num_samples={len(self.eval_docs)})"

--- a/lm_eval/caching/__init__.py
+++ b/lm_eval/caching/__init__.py
--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -11,11 +11,14 @@ import torch

 import lm_eval.api.metrics
 import lm_eval.api.registry
+import lm_eval.api.task
 import lm_eval.models
 from lm_eval.caching.cache import delete_cache
 from lm_eval.evaluator_utils import (
+    consolidate_group_results,
    consolidate_results,
    get_sample_size,
+    get_subtask_list,
    get_task_list,
    prepare_print_tasks,
    print_writeout,
@@ -23,7 +26,10 @@ from lm_eval.evaluator_utils import (
 )
 from lm_eval.loggers import EvaluationTracker
 from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash
-from lm_eval.tasks import TaskManager, get_task_dict
+from lm_eval.tasks import (
+    TaskManager,
+    get_task_dict,
+)
 from lm_eval.utils import (
    eval_logger,
    handle_non_serializable,
@@ -35,7 +41,7 @@ from lm_eval.utils import (

 if TYPE_CHECKING:
    from lm_eval.api.model import LM
-    from lm_eval.tasks import Task
+    from lm_eval.api.task import Task


 @positional_deprecated
@@ -44,7 +50,7 @@ def simple_evaluate(
    model_args: Optional[Union[str, dict]] = None,
    tasks: Optional[List[Union[str, dict, object]]] = None,
    num_fewshot: Optional[int] = None,
-    batch_size: Optional[int] = None,
+    batch_size: Optional[Union[int, str]] = None,
    max_batch_size: Optional[int] = None,
    device: Optional[str] = None,
    use_cache: Optional[str] = None,
@@ -58,7 +64,7 @@ def simple_evaluate(
    log_samples: bool = True,
    evaluation_tracker: Optional[EvaluationTracker] = None,
    system_instruction: Optional[str] = None,
-    apply_chat_template: bool = False,
+    apply_chat_template: Union[bool, str] = False,
    fewshot_as_multiturn: bool = False,
    gen_kwargs: Optional[str] = None,
    task_manager: Optional[TaskManager] = None,
@@ -106,8 +112,11 @@ def simple_evaluate(
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :param system_instruction: str
        System instruction to be applied to the prompt
-    :param apply_chat_template: bool
-        If True, apply chat template to the prompt
+    :param apply_chat_template: Union[bool, str]
+        Specifies whether to apply a chat template to the prompt.
+        - If set to True, the default chat template is applied.
+        - If set to a string, applies the specified chat template by name.
+        Defaults to False (no chat template applied).
    :param fewshot_as_multiturn: bool
        Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
    :param gen_kwargs: str
@@ -148,6 +157,9 @@ def simple_evaluate(
        seed_message.append(f"Setting torch manual seed to {torch_random_seed}")
        torch.manual_seed(torch_random_seed)

+    if fewshot_random_seed is not None:
+        seed_message.append(f"Setting fewshot manual seed to {fewshot_random_seed}")
+
    if seed_message:
        eval_logger.info(" | ".join(seed_message))

@@ -199,7 +211,9 @@ def simple_evaluate(
            )
    else:
        if not isinstance(model, lm_eval.api.model.LM):
-            raise TypeError
+            raise TypeError(
+                f"The value of `model` passed to simple_evaluate() was of type {type(model)}, but is required to be a subclass of lm_eval.api.model.LM . This may be because you are passing an initialized Hugging Face PreTrainedModel without having wrapped it in `lm_eval.models.huggingface.HFLM(pretrained=my_model)` first."
+            )
        eval_logger.info("Using pre-initialized model")
        lm = model

@@ -219,48 +233,58 @@ def simple_evaluate(
        task_manager = TaskManager(verbosity)

    task_dict = get_task_dict(tasks, task_manager)
-    for task_name in task_dict.keys():
-        task_obj = task_dict[task_name]
-        if isinstance(task_obj, tuple):
-            _, task_obj = task_obj
-            if task_obj is None:
-                continue
-
-        if task_obj.get_config("output_type") == "generate_until":
-            if gen_kwargs is not None:
-                task_obj.set_config(
-                    key="generation_kwargs", value=gen_kwargs, update=True
-                )

-        if predict_only:
-            log_samples = True
-            eval_logger.info(
-                f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
-            )
-            # we have to change the class properties post-hoc. This is pretty hacky.
-            task_obj.override_metric(metric_name="bypass")
-
-        # override tasks' fewshot values to the provided num_fewshot arg value
-        # except if tasks have it set to 0 manually in their configs--then we should never overwrite that
-        if num_fewshot is not None:
-            if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
-                eval_logger.info(
-                    f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
-                )
+    # helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups.
+    # (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed)
+    def _adjust_config(task_dict):
+        adjusted_task_dict = {}
+        for task_name, task_obj in task_dict.items():
+            if isinstance(task_obj, dict):
+                adjusted_task_dict = {
+                    **adjusted_task_dict,
+                    **{task_name: _adjust_config(task_obj)},
+                }
+
            else:
-                eval_logger.warning(
-                    f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
-                )
-                task_obj.set_config(key="num_fewshot", value=num_fewshot)
-        else:
-            # if num_fewshot not provided, and the task does not define a default one, default to 0
-            if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None:
-                task_obj.set_config(key="num_fewshot", value=0)
-        # fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file)
-        task_obj.set_fewshot_seed(seed=fewshot_random_seed)
-        eval_logger.info(
-            f"Setting fewshot random generator seed to {fewshot_random_seed}"
-        )
+                if task_obj.get_config("output_type") == "generate_until":
+                    if gen_kwargs is not None:
+                        task_obj.set_config(
+                            key="generation_kwargs", value=gen_kwargs, update=True
+                        )
+
+                if predict_only:
+                    eval_logger.info(
+                        f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
+                    )
+                    # we have to change the class properties post-hoc. This is pretty hacky.
+                    task_obj.override_metric(metric_name="bypass")
+
+                # override tasks' fewshot values to the provided num_fewshot arg value
+                # except if tasks have it set to 0 manually in their configs--then we should never overwrite that
+                if num_fewshot is not None:
+                    if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
+                        eval_logger.info(
+                            f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
+                        )
+                    else:
+                        eval_logger.warning(
+                            f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
+                        )
+                        task_obj.set_config(key="num_fewshot", value=num_fewshot)
+                else:
+                    # if num_fewshot not provided, and the task does not define a default one, default to 0
+                    if (
+                        default_num_fewshot := task_obj.get_config("num_fewshot")
+                    ) is None:
+                        task_obj.set_config(key="num_fewshot", value=0)
+                # fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file)
+                task_obj.set_fewshot_seed(seed=fewshot_random_seed)
+
+                adjusted_task_dict[task_name] = task_obj
+
+        return adjusted_task_dict
+
+    task_dict = _adjust_config(task_dict)

    if check_integrity:
        run_task_tests(task_list=tasks)
@@ -270,7 +294,7 @@ def simple_evaluate(
            model_source=model,
            model_args=model_args,
            system_instruction=system_instruction,
-            chat_template=lm.chat_template if apply_chat_template else None,
+            chat_template=lm.chat_template(apply_chat_template),
            fewshot_as_multiturn=fewshot_as_multiturn,
        )

@@ -282,7 +306,7 @@ def simple_evaluate(
        rewrite_requests_cache=rewrite_requests_cache,
        bootstrap_iters=bootstrap_iters,
        write_out=write_out,
-        log_samples=log_samples,
+        log_samples=True if predict_only else log_samples,
        system_instruction=system_instruction,
        apply_chat_template=apply_chat_template,
        fewshot_as_multiturn=fewshot_as_multiturn,
@@ -343,7 +367,7 @@ def evaluate(
    write_out: bool = False,
    log_samples: bool = True,
    system_instruction: Optional[str] = None,
-    apply_chat_template: bool = False,
+    apply_chat_template: Union[bool, str] = False,
    fewshot_as_multiturn: bool = False,
    verbosity: str = "INFO",
 ):
@@ -363,8 +387,11 @@ def evaluate(
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :param system_instruction: str
        System instruction to be applied to the prompt
-    :param apply_chat_template: bool
-        If True, apply chat template to the prompt
+    :param apply_chat_template: Union[bool, str]
+        Specifies whether to apply a chat template to the prompt.
+        - If set to True, the default chat template is applied.
+        - If set to a string, applies the specified chat template by name.
+        Defaults to False (no chat template applied).
    :param fewshot_as_multiturn: bool
        Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
    :return
@@ -380,16 +407,40 @@ def evaluate(
    padding_requests = defaultdict(int)

    # get lists of group hierarchy and each type of request
-    task_hierarchy, eval_tasks = get_task_list(task_dict)
+    eval_tasks = get_task_list(task_dict)
    if not log_samples:
        if not all(
            "bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys()
            for task_output in eval_tasks
        ):
            raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
+
+    # validation check: are we running multimodal task <-> non-multimodal model class, or vice-versa.
+    incompatible_tasks = []
    for task_output in eval_tasks:
        task: Task = task_output.task
-        limit = get_sample_size(task, limit)
+
+        if getattr(lm, "MULTIMODAL", False) != getattr(task, "MULTIMODAL", False):
+            incompatible_tasks.append(task_output.task_name)
+    if len(incompatible_tasks) > 0:
+        if not getattr(lm, "MULTIMODAL", False):
+            raise ValueError(
+                f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type."
+            )
+        else:
+            raise ValueError(
+                f"Attempted to run tasks: {incompatible_tasks} which are text-only, but used a model type which only currently supports multimodal tasks."
+            )
+    # end multimodality validation check
+
+    # Cache the limit arg.
+    limit_arg = limit
+    limits = []
+    for task_output in eval_tasks:
+        task: Task = task_output.task
+
+        limit = get_sample_size(task, limit_arg)
+        limits.append(limit)
        task.build_all_requests(
            limit=limit,
            rank=lm.rank,
@@ -397,7 +448,7 @@ def evaluate(
            cache_requests=cache_requests,
            rewrite_requests_cache=rewrite_requests_cache,
            system_instruction=system_instruction,
-            apply_chat_template=apply_chat_template,
+            apply_chat_template=bool(apply_chat_template),
            fewshot_as_multiturn=fewshot_as_multiturn,
            chat_template=getattr(lm, "apply_chat_template")
            if apply_chat_template
@@ -459,7 +510,7 @@ def evaluate(
    WORLD_SIZE = lm.world_size
    ### Postprocess outputs ###
    # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
-    for task_output in eval_tasks:
+    for task_output, limit in zip(eval_tasks, limits):
        task = task_output.task
        task.apply_filters()

@@ -557,106 +608,45 @@ def evaluate(

        ### Calculate group metrics ###
        if bool(results):
-            for group, task_list in reversed(task_hierarchy.items()):
-                if len(task_list) == 0:
-                    # task_hierarchy entries are either
-                    # `group_name: [subtask1, subtask2, ...]`
-                    # or `task_name: []`.
-                    # we only want to operate on groups here.
-                    continue
-
-                # collect all higher_is_better values for metrics
-                # in the group's subtasks.
-                # TODO: clean this up ; unify with the below metric_list loop?
-                _higher_is_better = {}
+            results, versions, show_group_table, *_ = consolidate_group_results(
+                results, versions, task_dict
+            )
+
+        results_agg, group_agg = prepare_print_tasks(task_dict, results)
+        subtask_list = get_subtask_list(task_dict)
+
+        # collect all higher_is_better values for metrics
+        # in the group's subtasks.
+        # TODO: clean this up ; unify with the below metric_list loop?
+        _higher_is_better = {}
+        for group, task_list in subtask_list.items():
+            if (
+                len(task_list) != 0
+            ):  # subtask list will list "task_name": [] for solo tasks
                for task in task_list:
                    for m, h in higher_is_better[task].items():
                        if m not in _higher_is_better.keys():
                            _higher_is_better[m] = h
-                    if (
-                        m in _higher_is_better
-                        and _higher_is_better[m] is not None
-                        and _higher_is_better[m] != h
-                    ):
-                        eval_logger.warning(
-                            f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
-                        )
-                        _higher_is_better[m] = None
-                higher_is_better[group] = _higher_is_better
-
-                # collect all metric keys used by a subtask in the group.
-                metric_list = list(
-                    {
-                        key
-                        for task in task_list
-                        for key in results[task].keys()
-                        if "_stderr" not in key and key not in ["alias", "samples"]
-                    }
-                )
-                for metric in metric_list:
-                    stderr = "_stderr,".join(metric.split(","))
-
-                    # gather metrics, sizes, and stderrs from subtasks
-                    metrics = [
-                        results[task][metric]
-                        for task in task_list
-                        if metric in results[task]
-                    ]  # TODO: copy?
-                    stderrs = [
-                        results[task][stderr]
-                        for task in task_list
-                        if stderr in results[task]
-                    ]
-                    sizes = [
-                        results[task]["samples"]
-                        for task in task_list
-                        if metric in results[task]
-                    ]
-
-                    # compute group's pooled metric and stderr
-                    results[group][metric] = (
-                        lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
-                    )
-                    # TODO: calculate grouped metric using aggregation fn
-                    if "N/A" in stderrs:
-                        results[group][stderr] = "N/A"
-                    else:
-                        results[group][stderr] = (
-                            lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
-                        )
-                        # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
-                        # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
-                        # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
-
-                    results[group]["samples"] = sum(sizes)
-
-        results_agg = defaultdict(dict)
-        groups_agg = defaultdict(dict)
-        all_tasks_list = list(task_hierarchy.keys())
-        while True:
-            add_tasks_list = list(k for k in results_agg.keys())
-            left_tasks_list = sorted(list(set(all_tasks_list) - set(add_tasks_list)))
-            if len(left_tasks_list) == 0:
-                break
-
-            _task_hierarchy = {
-                k: v for k, v in task_hierarchy.items() if k in left_tasks_list
-            }
-            _results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results)

-            results_agg = {**results_agg, **_results_agg}
-            groups_agg = {**groups_agg, **_groups_agg}
-
-        for group_name, task_list in task_hierarchy.items():
-            if task_list:
-                num_fewshot[group_name] = num_fewshot[
-                    task_list[0]
-                ]  # TODO: validate this
+                        if (
+                            m in _higher_is_better
+                            and _higher_is_better[m] is not None
+                            and _higher_is_better[m] != h
+                        ):
+                            eval_logger.warning(
+                                f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
+                            )
+                            _higher_is_better[m] = None
+                higher_is_better[group] = _higher_is_better

        results_dict = {
            "results": dict(results_agg.items()),
-            **({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
-            "group_subtasks": dict(reversed(task_hierarchy.items())),
+            **(
+                {"groups": dict(group_agg.items())}
+                if (bool(group_agg) & show_group_table)
+                else {}
+            ),
+            "group_subtasks": dict(reversed(subtask_list.items())),
            "configs": dict(sorted(configs.items())),
            "versions": dict(sorted(versions.items())),
            "n-shot": dict(sorted(num_fewshot.items())),
@@ -669,7 +659,7 @@ def evaluate(
                        len(task_output.task.eval_docs),
                    ),
                }
-                for task_output in eval_tasks
+                for task_output, limit in zip(eval_tasks, limits)
            },
        }
        if log_samples:

--- a/lm_eval/evaluator_utils.py
+++ b/lm_eval/evaluator_utils.py
@@ -2,9 +2,15 @@ import collections
 import math
 import pathlib
 import sys
-from typing import Dict, List, Optional, Tuple, Union
-
-from lm_eval.api import metrics
+from typing import List, Optional, Tuple, Union
+
+from lm_eval.api.group import ConfigurableGroup
+from lm_eval.api.metrics import (
+    aggregate_subtask_metrics,
+    pooled_sample_stderr,
+    stderr_for_metric,
+)
+from lm_eval.api.task import Task
 from lm_eval.utils import eval_logger, positional_deprecated


@@ -98,7 +104,7 @@ class TaskOutput:
            self.agg_metrics[metric_key] = agg_fn(items)
            self.sample_len = len(items)  # TODO: same sample size for each metric?
            if isinstance(bootstrap_iters, int):
-                stderr_fn = metrics.stderr_for_metric(
+                stderr_fn = stderr_for_metric(
                    metric=agg_fn,
                    bootstrap_iters=min(bootstrap_iters, 100)
                    if metric in ["bleu", "chrf", "ter"]
@@ -116,23 +122,71 @@ class TaskOutput:
        return (
            f"TaskOutput(task_name={self.task_name}, "
            f"group_name={self.group_name}, "
-            f"version={self.version},"
-            f"n_shot={self.n_shot}"
-            f"task_alias={self.task_alias}, group_alias={self.group_alias})"
+            f"version={self.version}, "
+            f"n_shot={self.n_shot}, "
+            f"task_alias={self.task_alias}, "
+            f"group_alias={self.group_alias})"
        )


-def get_task_list(task_dict: dict) -> Tuple[Dict[str, list], List[TaskOutput]]:
-    task_hierarchy = collections.defaultdict(list)
-    outputs = list(TaskOutput.from_taskdict(x, y) for x, y in task_dict.items())
-    for task_output in outputs:
-        if group_name := task_output.group_name:
-            task_hierarchy[group_name].append(task_output.task_name)
+def get_task_list(task_dict: dict) -> List[TaskOutput]:
+    outputs = []
+    for task_name, task_obj in task_dict.items():
+        if isinstance(task_obj, dict):
+            _outputs = get_task_list(task_obj)
+            outputs.extend(_outputs)
        else:
-            task_hierarchy[task_output.task_name] = []
-    # returns task_hierarchy tracking which groups contain which subtasks,
-    # and a list of TaskOutput classes for each non-group subtask
-    return task_hierarchy, [x for x in outputs if x.task]
+            task_output = TaskOutput.from_taskdict(task_name, task_obj)
+            outputs.append(task_output)
+
+    return outputs
+
+
+def get_subtask_list(task_dict, task_root=None, depth=0):
+    subtask_list = {}
+    for group_obj, task_obj in task_dict.items():
+        if isinstance(group_obj, ConfigurableGroup):
+            # group_name = group_obj.group_name
+            group_name = group_obj.group_name
+        else:
+            group_name = group_obj
+        if isinstance(task_obj, dict):
+            _subtask_list = get_subtask_list(
+                task_obj, task_root=group_name, depth=depth + 1
+            )
+            if task_root:
+                subtask_list.setdefault((task_root, depth), []).extend(
+                    [
+                        _task
+                        for (_task, _depth) in _subtask_list.keys()
+                        if (_depth - 1) == depth
+                    ]
+                )
+
+            subtask_list = {**subtask_list, **_subtask_list}
+        else:
+            if isinstance(task_obj, ConfigurableGroup):
+                # group_or_task_name = task_obj.group_name
+                group_or_task_name = task_obj.group_name
+            elif isinstance(task_obj, Task):
+                # group_or_task_name = task_obj.task_name
+                group_or_task_name = task_obj.task_name
+
+            if task_root is None:
+                subtask_list.setdefault((group_or_task_name, depth), [])
+            else:
+                subtask_list.setdefault((task_root, depth), []).append(
+                    group_or_task_name
+                )
+
+    if depth == 0:
+        _subtask_list = {}
+        for group_key, task_list in subtask_list.items():
+            group_name, depth = group_key
+            _subtask_list[group_name] = task_list
+        subtask_list = _subtask_list
+
+    return subtask_list


 def print_writeout(task) -> None:
@@ -155,70 +209,95 @@ def get_sample_size(task, limit: Optional[int]) -> Union[int, None]:


 def prepare_print_tasks(
-    task_hierarchy: dict, results: dict, tab=0
+    task_dict: dict,
+    results: dict,
+    task_depth=0,
+    group_depth=0,
 ) -> Tuple[dict, dict]:
    """
-    @param task_hierarchy: Dictionary representing the group hierarchy of tasks. Each key is a group name and its
+    @param task_dict: Dictionary representing the group hierarchy of tasks. Each key is a group name and its
    value is a list of task names.
    @param results: Dictionary containing the results of each task. Each key is a
    group name and its value is a dictionary of task results.
-    @param tab: The indentation level for printing the task
+    @param task_depth: The indentation level for printing the task
+    hierarchy. Default is 0.
+    @param group_depth: The indentation level for printing the group
    hierarchy. Default is 0.
    @return: A tuple of two dictionaries: results_agg and groups_agg. results_agg contains
    aggregated results for each task, and groups_agg contains aggregated results for each group.

    Prepares the task hierarchy and aggregates the results for each task and group recursively for printing.
    """
-    results_agg = collections.defaultdict(dict)
-    groups_agg = collections.defaultdict(dict)
-
-    (group_name, task_list), *_ = task_hierarchy.items()
-    task_list = sorted(task_list)
-
-    results_agg[group_name] = results[group_name].copy()
-    # results_agg[group_name]["tab"] = tab
-    if "samples" in results_agg[group_name]:
-        results_agg[group_name].pop("samples")
-
-    tab_string = " " * tab + "- " if tab > 0 else ""

-    if "alias" in results_agg[group_name]:
-        results_agg[group_name]["alias"] = tab_string + results_agg[group_name]["alias"]
-    else:
-        results_agg[group_name]["alias"] = tab_string + group_name
-
-    if len(task_list) > 0:
-        groups_agg[group_name] = results[group_name].copy()
-        # groups_agg[group_name]["tab"] = tab
-        if "samples" in groups_agg[group_name]:
-            groups_agg[group_name].pop("samples")
-
-        if "alias" in groups_agg[group_name]:
-            groups_agg[group_name]["alias"] = (
-                tab_string + groups_agg[group_name]["alias"]
+    def _sort_task_dict(task_dict):
+        """
+        Helper utility. Sorts the task dict at the current level of the hierarchy based on alphabetized task name.
+        Required so that we end up sorting within each sub-header correctly.
+        """
+
+        return dict(
+            sorted(
+                task_dict.items(),
+                key=lambda item: item[0].group_name
+                if isinstance(item[0], ConfigurableGroup)
+                else item[0],
            )
-        else:
-            groups_agg[group_name]["alias"] = tab_string + group_name
+        )

-        for task_name in task_list:
-            if task_name in task_hierarchy:
-                _task_hierarchy = {
-                    **{task_name: task_hierarchy[task_name]},
-                    **task_hierarchy,
-                }
+    task_agg = collections.defaultdict(dict)
+    group_agg = collections.defaultdict(dict)
+    task_dict = _sort_task_dict(task_dict)
+    for task_or_group_name, task_or_group_obj in task_dict.items():
+        tab_string = " " * task_depth + "- " if task_depth > 0 else ""
+        if isinstance(task_or_group_name, ConfigurableGroup):
+            # string_name = task_or_group_name.group_name
+            name = task_or_group_name.group_name
+            from_configurable_group = True
+            task_or_group_obj = _sort_task_dict(task_or_group_obj)
+        elif isinstance(task_or_group_name, str):
+            name = task_or_group_name
+            if isinstance(task_or_group_obj, Task):
+                # string_name = task_or_group_obj.task_name
+                name = task_or_group_obj.task_name
+            from_configurable_group = False
+
+        task_agg[name] = results[name].copy()
+        if from_configurable_group:
+            if task_or_group_name.group_alias is not None:
+                alias = task_or_group_name.group_alias
            else:
-                _task_hierarchy = {
-                    **{task_name: []},
-                    **task_hierarchy,
-                }
-
-            _results_agg, _groups_agg = prepare_print_tasks(
-                _task_hierarchy, results, tab + 1
+                alias = task_or_group_name.group
+        else:
+            if "alias" in task_agg[name]:
+                alias = task_agg[name]["alias"]
+            else:
+                alias = name
+
+        task_agg[name]["alias"] = tab_string + alias
+        if "samples" in task_agg[name]:
+            task_agg[name].pop("samples")
+
+        if from_configurable_group and (" " not in results[name]):
+            group_tab_string = " " * group_depth + "- " if group_depth > 0 else ""
+            group_agg[name] = results[name].copy()
+            group_agg[name]["alias"] = group_tab_string + alias
+            if "samples" in group_agg[name]:
+                group_agg[name].pop("samples")
+
+        if isinstance(task_or_group_obj, dict):
+            task_depth += 1
+            group_depth += 1
+            _task_agg, _group_agg = prepare_print_tasks(
+                task_or_group_obj, results, task_depth, group_depth
            )
-            results_agg = {**results_agg, **_results_agg}
-            groups_agg = {**groups_agg, **_groups_agg}
-
-    return results_agg, groups_agg
+            task_agg = {
+                **task_agg,
+                **_task_agg,
+            }
+            group_agg = {**group_agg, **_group_agg}
+            task_depth -= 1
+            group_depth -= 1
+    return task_agg, group_agg


 def consolidate_results(
@@ -261,6 +340,8 @@ def consolidate_results(
    for task_output in eval_tasks:
        if "task_alias" in (task_config := task_output.task_config):
            results[task_output.task_name]["alias"] = task_config["task_alias"]
+        else:
+            results[task_output.task_name]["alias"] = task_output.task_name
        if group_alias := task_output.group_alias:
            if group_alias not in results and (group_name := task_output.group_name):
                results[group_name]["alias"] = group_alias
@@ -281,6 +362,147 @@ def consolidate_results(
    return results, samples, configs, versions, num_fewshot, higher_is_better


+def consolidate_group_results(
+    results,
+    versions,
+    task_dict,
+    task_root=None,
+    show_group_table=False,
+    task_aggregation_list=None,
+) -> Tuple[dict, dict, bool, Union[None,]]:
+    """
+    (Recursively) calculates groups' aggregated metrics and updates the results and versions dictionaries with this info.
+
+    @return: a tuple [results, versions, show_group_table, task_aggregation_list] with formats described below:
+
+    - results: A defaultdict with task names (and, after this function is called, group names of
+    groups that perform aggregation) as keys, and dictionaries with "alias" and metric,filter_name pairs as keys.
+    - versions: A defaultdict with task names (and, after this function is called, group names of
+    groups that perform aggregation) as keys, and float values representing the task or group's version if a version is specified. (defaulting to None).
+    - show_group_table: a boolean which is true if there exists a group that requires printing of its aggregated scores in a group table.
+    - task_aggregation_list: a defaultdict listing the subtasks to average over to produce a given group's end metric.
+
+    The method then returns the updated results, versions, show_group_table, and task_aggregation_list as a tuple.
+    In the top-level invocation of this function, task_aggregation_list is ignored.
+    """
+    if task_root is None:
+        task_root = {}
+
+    if task_aggregation_list is None:
+        task_aggregation_list = {}
+
+    for group_or_task, group_or_task_info in task_dict.items():
+        # Convert to string
+        if isinstance(group_or_task, ConfigurableGroup):
+            group_config = group_or_task.config
+            group_or_task = group_or_task.group_name
+        else:
+            group_config = None
+
+        if isinstance(group_or_task_info, Task):
+            if task_root:
+                task_aggregation_list.setdefault(task_root, []).append(
+                    group_or_task_info.task_name
+                )
+        else:
+            (
+                results,
+                versions,
+                show_group_table,
+                _task_aggregation_list,
+            ) = consolidate_group_results(
+                results,
+                versions,
+                group_or_task_info,
+                group_or_task,
+                show_group_table,
+                task_aggregation_list,
+            )
+            if task_root:
+                task_aggregation_list.setdefault(task_root, []).extend(
+                    task_aggregation_list.get(group_or_task, [])
+                )
+
+            if (group_config is None) or (
+                group_config["aggregate_metric_list"] is None
+            ):
+                results[group_or_task][" "] = " "
+                continue
+
+            if "aggregate_metric_list" in group_config:
+                agg_metric_list = group_config["aggregate_metric_list"]
+
+            show_group_table = show_group_table | bool(
+                group_config["aggregate_metric_list"]
+            )
+
+            task_list = _task_aggregation_list[group_or_task]
+
+            metric_list = list(
+                {
+                    key
+                    for task in task_list
+                    for key in results[task].keys()
+                    if "_stderr" not in key and key not in ["task", "alias", "samples"]
+                }
+            )
+            for metric in metric_list:
+                stderr = "_stderr,".join(metric.split(","))
+
+                # gather metrics, sizes, and stderrs from subtasks
+                metrics = [
+                    results[task][metric]
+                    for task in task_list
+                    if metric in results[task]
+                ]  # TODO: copy?
+                stderrs = [
+                    results[task][stderr]
+                    for task in task_list
+                    if stderr in results[task]
+                ]
+                sizes = [
+                    results[task]["samples"]
+                    for task in task_list
+                    if metric in results[task]
+                ]
+
+                for metric_config in agg_metric_list:
+                    for filter_name in metric_config["filter_list"]:
+                        if metric != ",".join([metric_config["metric"], filter_name]):
+                            continue
+
+                        # compute group's pooled metric and stderr
+                        if metric_config["aggregation"] == "mean":
+                            aggregate_fn = aggregate_subtask_metrics
+                        elif callable(metric_config["aggregation"]):
+                            aggregate_fn = metric_config["aggregation"]
+                        else:
+                            raise ValueError(
+                                f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{metric_config['aggregation']}' for group '{group_or_task}'"
+                            )
+
+                        results[group_or_task][metric] = aggregate_fn(
+                            metrics,
+                            sizes,
+                            metric_config["weight_by_size"],
+                        )
+                        # TODO: calculate groups' metrics using arbitrary agg fns
+                        if "N/A" in stderrs:
+                            results[group_or_task][stderr] = "N/A"
+                        else:
+                            # NOTE: this assumes we are using the mean to aggregate. There are warnings about this elsewhere
+                            results[group_or_task][stderr] = pooled_sample_stderr(
+                                stderrs, sizes
+                            )
+
+                results[group_or_task]["samples"] = sum(sizes)
+                group_metadata = group_config.get("metadata", None)
+                if group_metadata is not None:
+                    versions[group_or_task] = group_metadata.get("version", None)
+    # print(results)
+    return results, versions, show_group_table, task_aggregation_list
+
+
 @positional_deprecated
 def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
    """

--- a/lm_eval/filters/extraction.py
+++ b/lm_eval/filters/extraction.py
@@ -62,11 +62,8 @@ class WhitespaceFilter(Filter):
        def filter_set(inst):
            filtered_resp = []
            for resp in inst:
-                if resp.startswith(" "):
-                    resp = resp[1:]
-
+                resp = resp.lstrip()
                filtered_resp.append(resp)
-
            return filtered_resp

        filtered_resps = [filter_set(resp) for resp in resps]

--- a/lm_eval/loggers/evaluation_tracker.py
+++ b/lm_eval/loggers/evaluation_tracker.py
 import json
+import os
 import re
 import time
 from collections import defaultdict
@@ -14,6 +15,7 @@ from huggingface_hub import (
    HfApi,
    hf_hub_url,
 )
+from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status

 from lm_eval.utils import (
    eval_logger,
@@ -112,12 +114,15 @@ class EvaluationTracker:
        output_path: str = None,
        hub_results_org: str = "",
        hub_repo_name: str = "",
+        details_repo_name: str = "",
+        results_repo_name: str = "",
        push_results_to_hub: bool = False,
        push_samples_to_hub: bool = False,
        public_repo: bool = False,
        token: str = "",
        leaderboard_url: str = "",
        point_of_contact: str = "",
+        gated: bool = False,
    ) -> None:
        """
        Creates all the necessary loggers for evaluation tracking.
@@ -126,12 +131,15 @@ class EvaluationTracker:
            output_path (str): Path to save the results. If not provided, the results won't be saved.
            hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token.
            hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
+            details_repo_name (str): The name of the Hugging Face repository to push the details to. If not provided, the results will be pushed to `lm-eval-results`.
+            result_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will not be pushed and will be found in the details_hub_repo.
            push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
            push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
            public_repo (bool): Whether to push the results to a public or private repository.
            token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
            leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card.
            point_of_contact (str): Contact information on the Hugging Face hub dataset card.
+            gated (bool): Whether to gate the repository.
        """
        self.general_config_tracker = GeneralConfigTracker()

@@ -142,6 +150,7 @@ class EvaluationTracker:
        self.leaderboard_url = leaderboard_url
        self.point_of_contact = point_of_contact
        self.api = HfApi(token=token) if token else None
+        self.gated_repo = gated

        if not self.api and (push_results_to_hub or push_samples_to_hub):
            raise ValueError(
@@ -159,9 +168,24 @@ class EvaluationTracker:
                f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'."
            )

-        hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results"
-        self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}"
-        self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private"
+        if hub_repo_name == "":
+            details_repo_name = (
+                details_repo_name if details_repo_name != "" else "lm-eval-results"
+            )
+            results_repo_name = (
+                results_repo_name if results_repo_name != "" else details_repo_name
+            )
+        else:
+            details_repo_name = hub_repo_name
+            results_repo_name = hub_repo_name
+            eval_logger.warning(
+                "hub_repo_name was specified. Both details and results will be pushed to the same repository. Using hub_repo_name is no longer recommended, details_repo_name and results_repo_name should be used instead."
+            )
+
+        self.details_repo = f"{hub_results_org}/{details_repo_name}"
+        self.details_repo_private = f"{hub_results_org}/{details_repo_name}-private"
+        self.results_repo = f"{hub_results_org}/{results_repo_name}"
+        self.results_repo_private = f"{hub_results_org}/{results_repo_name}-private"

    def save_results_aggregated(
        self,
@@ -211,9 +235,9 @@ class EvaluationTracker:

                if self.api and self.push_results_to_hub:
                    repo_id = (
-                        self.hub_results_repo
+                        self.results_repo
                        if self.public_repo
-                        else self.hub_results_repo_private
+                        else self.results_repo_private
                    )
                    self.api.create_repo(
                        repo_id=repo_id,
@@ -221,10 +245,15 @@ class EvaluationTracker:
                        private=not self.public_repo,
                        exist_ok=True,
                    )
-                    self.api.upload_folder(
+                    self.api.upload_file(
                        repo_id=repo_id,
-                        folder_path=str(path),
-                        path_in_repo=self.general_config_tracker.model_name_sanitized,
+                        path_or_fileobj=str(
+                            path.joinpath(f"results_{self.date_id}.json")
+                        ),
+                        path_in_repo=os.path.join(
+                            self.general_config_tracker.model_name,
+                            f"results_{self.date_id}.json",
+                        ),
                        repo_type="dataset",
                        commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
                    )
@@ -278,6 +307,7 @@ class EvaluationTracker:
                    sample["resps"] = sanitize_list(sample["resps"])
                    sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
                    sample["arguments"] = arguments
+                    sample["target"] = str(sample["target"])

                    sample_dump = (
                        json.dumps(
@@ -288,14 +318,14 @@ class EvaluationTracker:
                        + "\n"
                    )

-                    with open(file_results_samples, "a") as f:
+                    with open(file_results_samples, "a", encoding="utf-8") as f:
                        f.write(sample_dump)

                if self.api and self.push_samples_to_hub:
                    repo_id = (
-                        self.hub_results_repo
+                        self.details_repo
                        if self.public_repo
-                        else self.hub_results_repo_private
+                        else self.details_repo_private
                    )
                    self.api.create_repo(
                        repo_id=repo_id,
@@ -303,6 +333,18 @@ class EvaluationTracker:
                        private=not self.public_repo,
                        exist_ok=True,
                    )
+                    try:
+                        if self.gated_repo:
+                            headers = build_hf_headers()
+                            r = get_session().put(
+                                url=f"https://huggingface.co/api/datasets/{repo_id}/settings",
+                                headers=headers,
+                                json={"gated": "auto"},
+                            )
+                            hf_raise_for_status(r)
+                    except Exception as e:
+                        eval_logger.warning("Could not gate the repository")
+                        eval_logger.info(repr(e))
                    self.api.upload_folder(
                        repo_id=repo_id,
                        folder_path=str(path),
@@ -327,9 +369,7 @@ class EvaluationTracker:
        """

        eval_logger.info("Recreating metadata card")
-        repo_id = (
-            self.hub_results_repo if self.public_repo else self.hub_results_repo_private
-        )
+        repo_id = self.details_repo if self.public_repo else self.details_repo_private

        files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        results_files = get_results_filenames(files_in_repo)
@@ -360,7 +400,10 @@ class EvaluationTracker:
                results_datetime,
            )
            latest_task_results_datetime[samples_key] = latest_datetime
-            latest_task_results_datetime[results_key] = latest_datetime
+            latest_task_results_datetime[results_key] = max(
+                latest_task_results_datetime[results_key],
+                latest_datetime,
+            )

        # Create metadata card
        card_metadata = MetadataConfigs()
@@ -377,14 +420,15 @@ class EvaluationTracker:
            sanitized_last_eval_date_results = re.sub(
                r"[^\w\.]", "_", latest_task_results_datetime[config_name]
            )
-            # Ensure that all results files are listed in the metadata card
-            current_results = card_metadata.get(config_name, {"data_files": []})
-            current_results["data_files"].append(
-                {"split": eval_date_sanitized, "path": [str(results_filename)]}
-            )
-            card_metadata[config_name] = current_results
-            # If the results file is the newest, update the "latest" field in the metadata card
+
            if eval_date_sanitized == sanitized_last_eval_date_results:
+                # Ensure that all results files are listed in the metadata card
+                current_results = card_metadata.get(config_name, {"data_files": []})
+                current_results["data_files"].append(
+                    {"split": eval_date_sanitized, "path": [str(results_filename)]}
+                )
+                card_metadata[config_name] = current_results
+                # If the results file is the newest, update the "latest" field in the metadata card
                card_metadata[config_name]["data_files"].append(
                    {"split": "latest", "path": [str(results_filename)]}
                )
@@ -403,65 +447,20 @@ class EvaluationTracker:
            sanitized_last_eval_date_results = re.sub(
                r"[^\w\.]", "_", latest_task_results_datetime[config_name]
            )
-            # Ensure that all sample results files are listed in the metadata card
-            current_details_for_task = card_metadata.get(
-                config_name, {"data_files": []}
-            )
-            current_details_for_task["data_files"].append(
-                {"split": eval_date_sanitized, "path": [str(results_filename)]}
-            )
-            card_metadata[config_name] = current_details_for_task
-            # If the samples results file is the newest, update the "latest" field in the metadata card
            if eval_date_sanitized == sanitized_last_eval_date_results:
+                # Ensure that all sample results files are listed in the metadata card
+                current_details_for_task = card_metadata.get(
+                    config_name, {"data_files": []}
+                )
+                current_details_for_task["data_files"].append(
+                    {"split": eval_date_sanitized, "path": [str(results_filename)]}
+                )
+                card_metadata[config_name] = current_details_for_task
+                # If the samples results file is the newest, update the "latest" field in the metadata card
                card_metadata[config_name]["data_files"].append(
                    {"split": "latest", "path": [str(results_filename)]}
                )

-            # Special case for MMLU with a single split covering it all
-            # We add another config with all MMLU splits results together for easy inspection
-            SPECIAL_TASKS = ["mmlu", "gpqa", "minerva_math"]
-            for special_task in SPECIAL_TASKS:
-                if special_task in config_name:
-                    special_task = f"{model_name}__{special_task}"
-                    former_entry = card_metadata.get(special_task, {"data_files": []})
-
-                    former_split = [
-                        (i, entry)
-                        for i, entry in enumerate(former_entry["data_files"])
-                        if entry.get("split", None) == eval_date_sanitized
-                    ]
-
-                    if len(former_split) == 0:
-                        former_entry["data_files"].append(
-                            {
-                                "split": eval_date_sanitized,
-                                "path": [str(results_filename)],
-                            }
-                        )
-                    else:
-                        split_index, _ = former_split[0]
-                        former_entry["data_files"][split_index]["path"].append(
-                            str(results_filename)
-                        )
-
-                    if eval_date_sanitized == sanitized_last_eval_date_results:
-                        latest_split = [
-                            (i, entry)
-                            for i, entry in enumerate(former_entry["data_files"])
-                            if entry.get("split", None) == "latest"
-                        ]
-                        if len(latest_split) == 0:
-                            former_entry["data_files"].append(
-                                {"split": "latest", "path": [str(results_filename)]}
-                            )
-                        else:
-                            latest_index, _ = latest_split[0]
-                            former_entry["data_files"][latest_index]["path"].append(
-                                str(results_filename)
-                            )
-
-                    card_metadata[special_task] = former_entry
-
        # Get latest results and extract info to update metadata card examples
        latest_datetime = max(latest_task_results_datetime.values())
        latest_model_name = max(

--- a/lm_eval/loggers/utils.py
+++ b/lm_eval/loggers/utils.py
@@ -114,15 +114,29 @@ def add_env_info(storage: Dict[str, Any]):

 def add_tokenizer_info(storage: Dict[str, Any], lm):
    if getattr(lm, "tokenizer", False):
-        tokenizer_info = {
-            "tokenizer_pad_token": [lm.tokenizer.pad_token, lm.tokenizer.pad_token_id],
-            "tokenizer_eos_token": [lm.tokenizer.eos_token, lm.tokenizer.eos_token_id],
-            "tokenizer_bos_token": [lm.tokenizer.bos_token, lm.tokenizer.bos_token_id],
-            "eot_token_id": getattr(lm, "eot_token_id", None),
-            "max_length": getattr(lm, "max_length", None),
-        }
-        storage.update(tokenizer_info)
-    # seems gguf and textsynth do not have tokenizer
+        try:
+            tokenizer_info = {
+                "tokenizer_pad_token": [
+                    lm.tokenizer.pad_token,
+                    str(lm.tokenizer.pad_token_id),
+                ],
+                "tokenizer_eos_token": [
+                    lm.tokenizer.eos_token,
+                    str(lm.tokenizer.eos_token_id),
+                ],
+                "tokenizer_bos_token": [
+                    lm.tokenizer.bos_token,
+                    str(lm.tokenizer.bos_token_id),
+                ],
+                "eot_token_id": getattr(lm, "eot_token_id", None),
+                "max_length": getattr(lm, "max_length", None),
+            }
+            storage.update(tokenizer_info)
+        except Exception as err:
+            logger.debug(
+                f"Logging detailed tokenizer info failed with {err}, skipping..."
+            )
+        # seems gguf and textsynth do not have tokenizer
    else:
        logger.debug(
            "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."

--- a/lm_eval/models/__init__.py
+++ b/lm_eval/models/__init__.py
 from . import (
    anthropic_llms,
+    api_models,
    dummy,
    gguf,
+    hf_vlms,
    huggingface,
+    ibm_watsonx_ai,
    mamba_lm,
    nemo_lm,
    neuralmagic,
@@ -11,6 +14,7 @@ from . import (
    optimum_lm,
    textsynth,
    vllm_causallms,
+    vllm_vlms,
 )



--- a/lm_eval/models/anthropic_llms.py
+++ b/lm_eval/models/anthropic_llms.py
-from typing import Any, List, Tuple
+import os
+from functools import cached_property
+from typing import Any, Dict, List, Tuple, Union

 from tqdm import tqdm

 from lm_eval import utils
 from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
+from lm_eval.models.openai_completions import LocalCompletionsAPI
 from lm_eval.models.utils import retry_on_specific_exceptions


@@ -42,8 +45,8 @@ def anthropic_completion(

    try:
        import anthropic
-    except ModuleNotFoundError:
-        raise Exception(
+    except ModuleNotFoundError as exception:
+        raise type(exception)(
            "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
 please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
        )
@@ -105,8 +108,8 @@ def anthropic_chat(

    try:
        import anthropic
-    except ModuleNotFoundError:
-        raise Exception(
+    except ModuleNotFoundError as exception:
+        raise type(exception)(
            "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
 please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
        )
@@ -138,7 +141,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
    return messages()


-@register_model("anthropic")
+@register_model("anthropic-completions")
 class AnthropicLM(LM):
    REQ_CHUNK_SIZE = 20  # TODO: not used

@@ -165,8 +168,8 @@ class AnthropicLM(LM):

        try:
            import anthropic
-        except ModuleNotFoundError:
-            raise Exception(
+        except ModuleNotFoundError as exception:
+            raise type(exception)(
                "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
 please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
            )
@@ -214,8 +217,8 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
        try:
            import anthropic
-        except ModuleNotFoundError:
-            raise Exception(
+        except ModuleNotFoundError as exception:
+            raise type(exception)(
                "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
 please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
            )
@@ -271,90 +274,89 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install


 @register_model("anthropic-chat", "anthropic-chat-completions")
-class AnthropicChatLM(AnthropicLM):
-    REQ_CHUNK_SIZE = 20  # TODO: not used
-
+class AnthropicChat(LocalCompletionsAPI):
    def __init__(
        self,
-        model: str,
-        batch_size: int = 1,
-        max_tokens: int = 256,
-        temperature: float = 0,  # defaults to 1
-        **kwargs,  # top_p, top_k, etc.
-    ) -> None:
-        """Anthropic API wrapper.
-
-        :param model: str
-            Anthropic model e.g. 'claude-3-opus-20240229', 'claude-3-sonnet-20240229'
-        :param max_tokens: int
-            Maximum number of tokens to sample from the model
-        :param temperature: float
-            Sampling temperature
-        :param kwargs: Any
-            Additional model_args to pass to the API client
-        """
-        super().__init__()
+        base_url="https://api.anthropic.com/v1/messages",
+        tokenizer_backend=None,
+        **kwargs,
+    ):
+        super().__init__(
+            base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
+        )
+        eval_logger.warning(
+            "Chat completions does not support batching. Defaulting to batch size 1."
+        )
+        self._batch_size = 1
+        self.anthropic_version = "2023-06-01"
+        eval_logger.warning(
+            f"Using Anthropic Version: {self.anthropic_version}. Confirm the current version here: https://docs.anthropic.com/en/api/versioning"
+        )

-        try:
-            import anthropic
-        except ModuleNotFoundError:
-            raise Exception(
-                "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
-please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
+    @cached_property
+    def api_key(self):
+        """Override this property to return the API key for the API request."""
+        key = os.environ.get("ANTHROPIC_API_KEY", None)
+        if key is None:
+            raise ValueError(
+                "API key not found. Please set the ANTHROPIC_API_KEY environment variable."
            )
-
-        self.model = model
-        # defaults to os.environ.get("ANTHROPIC_API_KEY")
-        self.client = anthropic.Anthropic()
-        self.temperature = temperature
-        self.max_tokens = max_tokens
-        self.tokenizer = self.client.get_tokenizer()
-        self.kwargs = kwargs
-
-    @property
-    def max_gen_toks(self) -> int:
-        return self.max_tokens
-
-    def generate_until(self, requests) -> List[str]:
-        try:
-            import anthropic
-        except ModuleNotFoundError:
-            raise Exception(
-                "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
-please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
-            )
-
-        if not requests:
-            return []
-
-        _requests: List[Tuple[str, dict]] = [req.args for req in requests]
-
+        return key
+
+    @cached_property
+    def header(self):
+        return {
+            "x-api-key": f"{self.api_key}",
+            "anthropic-version": self.anthropic_version,
+        }
+
+    def _create_payload(
+        self, messages: List[Dict], generate=True, gen_kwargs: dict = None, **kwargs
+    ) -> dict:
+        system = (
+            messages[0].get("content") if messages[0].get("role") == "system" else None
+        )
+        if system:
+            messages = messages[1:]
+        gen_kwargs.pop("do_sample", False)
+        max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
+        temperature = gen_kwargs.pop("temperature", 0)
+        stop = gen_kwargs.pop("until", ["\n\nHuman:"])
+        if not isinstance(stop, list):
+            stop = [stop]
+        out = {
+            "messages": messages,
+            "model": self.model,
+            "max_tokens": max_tokens,
+            "temperature": temperature,
+            "stop_sequences": stop,
+            **gen_kwargs,
+        }
+        if system:
+            out["system"] = system
+        return out
+
+    def parse_generations(
+        self, outputs: Union[Dict, List[Dict]], **kwargs
+    ) -> List[str]:
        res = []
-        for request in tqdm(_requests):
-            try:
-                inp = request[0]
-                request_args = request[1]
-                # generation_kwargs
-                until = request_args.get("until")
-                max_tokens = request_args.get("max_gen_toks", self.max_length)
-                temperature = request_args.get("temperature", self.temperature)
-                response = anthropic_chat(
-                    client=self.client,
-                    model=self.model,
-                    prompt=inp,
-                    max_tokens=max_tokens,
-                    temperature=temperature,  # TODO: implement non-greedy sampling for Anthropic
-                    stop=until,  # type: ignore
-                    **self.kwargs,
-                )
-                res.append(response)
-
-                self.cache_hook.add_partial("generate_until", request, response)
-            except anthropic.APIConnectionError as e:  # type: ignore # noqa: F821
-                eval_logger.critical(f"Server unreachable: {e.__cause__}")
-                break
-            except anthropic.APIStatusError as e:  # type: ignore # noqa: F821
-                eval_logger.critical(f"API error {e.status_code}: {e.message}")
-                break
-
+        if not isinstance(outputs, list):
+            outputs = [outputs]
+        for out in outputs:
+            for choices in out["content"]:
+                res.append(choices["text"])
        return res
+
+    def tok_encode(
+        self,
+        string: str,
+        left_truncate_len=None,
+        add_special_tokens=None,
+        **kwargs,
+    ) -> List[str]:
+        return [string]
+
+    def loglikelihood(self, requests, **kwargs):
+        raise NotImplementedError(
+            "Anthropic Chat Completions API does not support the return of loglikelihood"
+        )
--- a/lm_eval/models/api_models.py
+++ b/lm_eval/models/api_models.py
+import abc
+import asyncio
+import copy
+import itertools
+import json
+from functools import cached_property
+from typing import (
+    Any,
+    Awaitable,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    NamedTuple,
+    Optional,
+    Tuple,
+    Union,
+)
+
+
+try:
+    import requests
+    from aiohttp import ClientSession, TCPConnector
+    from tenacity import RetryError, retry, stop_after_attempt, wait_exponential
+    from tqdm import tqdm
+    from tqdm.asyncio import tqdm_asyncio
+except ModuleNotFoundError:
+    pass
+
+
+from importlib.util import find_spec
+
+from lm_eval import utils
+from lm_eval.api.instance import Instance
+from lm_eval.api.model import TemplateLM
+from lm_eval.models.utils import Collator, chunks, configure_pad_token
+
+
+LogLikelihoodInputs = Tuple[Tuple[str, str], List[int], List[int]]
+
+
+# utility class to keep track of json encoded chats
+class JsonChatStr(NamedTuple):
+    prompt: str
+
+    def encode(self, encoding):
+        return self.prompt.encode(encoding)
+
+
+eval_logger = utils.eval_logger
+
+
+class TemplateAPI(TemplateLM):
+    def __init__(
+        self,
+        model: str = None,
+        pretrained: str = None,  # `model` takes precedence over `pretrained` when passed.
+        base_url: str = None,
+        tokenizer: Optional[str] = None,
+        # Loglikelihood tasks require a tokenizer to calculate context lengths,
+        # however the requests can be sent as a string if the API doesn't support token inputs.
+        # use tokenized_requests=False
+        tokenizer_backend: Optional[
+            Literal["tiktoken", "huggingface", None]
+        ] = "huggingface",
+        truncate: bool = False,
+        # number of concurrent requests. More useful if not batching
+        num_concurrent: int = 1,
+        max_retries: int = 3,
+        max_gen_toks: int = 256,
+        batch_size: Union[str, int] = 1,
+        seed: int = 1234,
+        max_length: Optional[int] = 2048,
+        add_bos_token: bool = False,
+        custom_prefix_token_id: int = None,
+        # send the requests as tokens or strings
+        tokenized_requests: bool = True,
+        trust_remote_code: bool = False,
+        revision: Optional[str] = "main",
+        use_fast_tokenizer: bool = True,
+        verify_certificate: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__()
+        missing_packages = [
+            pkg
+            for pkg in ["aiohttp", "tqdm", "tenacity", "requests"]
+            if find_spec(pkg) is None
+        ]
+        if missing_packages:
+            raise ModuleNotFoundError(
+                f"Attempted to use an API model, but the required packages {missing_packages} are not installed. "
+                'Please install these via `pip install lm-eval[api]` or `pip install -e ."[api]"`'
+            )
+        self.model = model or pretrained
+        self.base_url = base_url
+        self.tokenizer = tokenizer
+        if not isinstance(batch_size, int) and "auto" in batch_size:
+            eval_logger.warning(
+                "Automatic batch size is not supported for API models. Defaulting to batch size 1."
+            )
+        elif int(batch_size) > 1:
+            eval_logger.warning(
+                "Batch size > 1 detected. Ensure your API supports batched requests with varying total sequence lengths."
+            )
+        self._batch_size = int(batch_size) if batch_size != "auto" else 1
+        self._truncate = truncate
+        self._max_gen_toks = int(max_gen_toks)
+        self._seed = int(seed)
+        # max_length - 1 as we always have 1 token for generation
+        eval_logger.info(f"Using max length {max_length} - 1")
+        self.max_length = max_length - 1
+        if int(num_concurrent) <= 1:
+            eval_logger.info(
+                "Concurrent requests are disabled. To enable concurrent requests, set `num_concurrent` > 1."
+            )
+        self._concurrent = int(num_concurrent)
+        self.tokenizer_backend = tokenizer_backend
+        self.add_bos_token = add_bos_token
+        self.custom_prefix_token_id = custom_prefix_token_id
+        self.tokenized_requests = tokenized_requests
+        self.max_retries = int(max_retries)
+        self.verify_certificate = verify_certificate
+
+        eval_logger.info(f"Using tokenizer {self.tokenizer_backend}")
+        if self.tokenizer_backend is None:
+            self.tokenizer = None
+            self.tokenized_requests = False
+        else:
+            if self.tokenizer is None:
+                if self.tokenizer_backend == "huggingface":
+                    import transformers
+
+                    self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                        self.tokenizer if self.tokenizer else self.model,
+                        trust_remote_code=trust_remote_code,
+                        revision=revision,
+                        use_fast=use_fast_tokenizer,
+                    )
+                    # Not used as the API will handle padding but to mirror the behavior of the HFLM
+                    self.tokenizer = configure_pad_token(self.tokenizer)
+                elif self.tokenizer_backend == "tiktoken":
+                    try:
+                        import tiktoken
+
+                        self.tokenizer = tiktoken.encoding_for_model(self.model)
+                    except ModuleNotFoundError as e:
+                        raise ModuleNotFoundError(
+                            "Attempted to use 'openai' LM type, but the package `tiktoken` is not installed. "
+                            "Please install it via `pip install lm-eval[api]` or `pip install -e .[api]`."
+                        ) from e
+                    if "openai" not in self.base_url:
+                        eval_logger.warning(
+                            f"Passed `base_url={self.base_url}` but using (OpenAI) Tiktoken tokenizer backend. "
+                            "Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken."
+                        )
+            else:
+                import transformers
+
+                assert isinstance(tokenizer, str), "tokenizer must be a string"
+                self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                    tokenizer,
+                    trust_remote_code=trust_remote_code,
+                    revision=revision,
+                    use_fast=use_fast_tokenizer,
+                )
+
+    @abc.abstractmethod
+    def _create_payload(
+        self,
+        messages: Union[List[List[int]], List[dict], List[str], str],
+        *,
+        generate: bool = True,
+        gen_kwargs: Optional[dict] = None,
+        seed: int = 1234,
+        **kwargs,
+    ) -> dict:
+        """This method is responsible for creating the json payload that will be sent to the API."""
+        raise NotImplementedError
+
+    def create_message(
+        self,
+        messages: Union[List[List[int]], List[str], List[JsonChatStr]],
+        generate=False,
+    ) -> Union[List[List[int]], List[dict], List[str], str]:
+        """Helper method to transform the prompt into the expected API input format. messages consist of batched requests"""
+        if isinstance(messages[0], JsonChatStr):
+            # for chat completions we need to decode the json string to list[dict,...]
+            assert (
+                self._batch_size == 1
+            ), "non-tokenized chat requests are only supported with batch_size=1"
+            # list[dict["role":..., "content":...],...]
+            return json.loads(messages[0].prompt)
+
+        if not self.tokenized_requests:
+            # if messages are tokenized:
+            if isinstance(messages[0][0], int):
+                # assuming decoding is lossless. However, this is only for loglikelihood requests
+                # as we need to compute the context length. For generations, we don't need to tokenize.
+                messages = self.decode_batch(messages)
+            if self._batch_size <= 1:
+                # if batch is 1 return str
+                return messages[0]
+            else:
+                # list[str,...]
+                return messages
+
+        # list[list[int], ...]
+        return messages
+
+    @staticmethod
+    @abc.abstractmethod
+    def parse_logprobs(
+        outputs: Union[Any, List[Any]],
+        tokens: List[List[int]] = None,
+        ctxlen: List[int] = None,
+        **kwargs,
+    ) -> List[Tuple[float, bool]]:
+        """Method used to parse the logprobs from the (batched) API response. This method should return a list of tuples"""
+        raise NotImplementedError
+
+    @staticmethod
+    @abc.abstractmethod
+    def parse_generations(outputs: Union[Any, List[Any]], **kwargs) -> List[str]:
+        """Method used to parse the generations from the (batched) API response. This method should return a list of str"""
+        raise NotImplementedError
+
+    @cached_property
+    def api_key(self) -> str:
+        """Override this property to return the API key for the API request."""
+        return ""
+
+    @cached_property
+    def header(self) -> dict:
+        """Override this property to return the headers for the API request."""
+        return {"Authorization": f"Bearer {self.api_key}"}
+
+    @property
+    def tokenizer_name(self) -> str:
+        """Must be defined for LM subclasses which implement Chat Templating.
+        Should return the name of the tokenizer or chat template used.
+        Used only to properly fingerprint caches when requests are being cached with `--cache_requests`, otherwise not used.
+        """
+        return ""
+
+    def apply_chat_template(
+        self, chat_history: List[Dict[str, str]]
+    ) -> Union[str, JsonChatStr]:
+        """Applies a chat template to a list of chat history between user and model."""
+        if self.tokenizer_backend == "huggingface" and self.tokenized_requests:
+            return self.tokenizer.apply_chat_template(
+                chat_history, tokenize=False, add_generation_prompt=True
+            )
+        else:
+            # bit of a hack. We'll load back before sending to the API
+            return JsonChatStr(json.dumps(chat_history))
+
+    @cached_property
+    def eot_token_id(self) -> Optional[int]:
+        if self.tokenizer is None:
+            return None
+        else:
+            if self.tokenizer_backend == "huggingface":
+                return self.tokenizer.eos_token_id
+            elif self.tokenizer_backend == "tiktoken":
+                return self.tokenizer.eot_token
+
+    @cached_property
+    def prefix_token_id(self) -> Optional[int]:
+        if self.tokenizer is None:
+            return None
+        else:
+            if self.custom_prefix_token_id is not None:
+                return self.custom_prefix_token_id
+            if self.tokenizer_backend == "huggingface":
+                if self.tokenizer.bos_token_id is not None:
+                    return self.tokenizer.bos_token_id
+                return self.tokenizer.eos_token_id
+            else:
+                return self.tokenizer.eot_token
+
+    def tok_encode(
+        self,
+        string: str,
+        left_truncate_len: int = None,
+        add_special_tokens: bool = False,
+        truncation: bool = False,
+        **kwargs,
+    ) -> Union[List[List[int]], List[int], List[str]]:
+        if self.tokenizer_backend is None:
+            return [string]
+        elif self.tokenizer_backend == "huggingface":
+            # by default for CausalLM - false or self.add_bos_token is set
+            if not add_special_tokens:
+                add_special_tokens = False or self.add_bos_token
+            encoding: Union[List[List[int]], List[int]] = self.tokenizer(
+                string,
+                add_special_tokens=add_special_tokens,
+                truncation=truncation,
+                return_attention_mask=False,
+            ).input_ids
+
+            # left-truncate the encoded context to be at most `left_truncate_len` tokens long
+            if left_truncate_len:
+                if not isinstance(string, str):
+                    encoding = [enc[-left_truncate_len:] for enc in encoding]
+                else:
+                    encoding = encoding[-left_truncate_len:]
+
+            return encoding
+
+        else:
+            try:
+                encoding = self.tokenizer.encode(string)
+            except Exception:
+                encoding = self.tokenizer.encode_batch(string)
+            return encoding
+
+    def decode_batch(self, tokens: List[List[int]]) -> List[str]:
+        if self.tokenizer_backend == "huggingface":
+            return self.tokenizer.batch_decode(tokens)
+        elif self.tokenizer_backend == "tiktoken":
+            return self.tokenizer.decode_batch(tokens)
+
+    def model_call(
+        self,
+        messages: Union[List[List[int]], List[str], List[JsonChatStr]],
+        *,
+        generate: bool = True,
+        gen_kwargs: Optional[Dict] = None,
+        **kwargs,
+    ) -> Optional[dict]:
+        # !!! Copy: shared dict for each request, need new object !!!
+        gen_kwargs = copy.deepcopy(gen_kwargs)
+        try:
+            response = requests.post(
+                self.base_url,
+                json=self._create_payload(
+                    self.create_message(messages),
+                    generate=generate,
+                    gen_kwargs=gen_kwargs,
+                    seed=self._seed,
+                    **kwargs,
+                ),
+                headers=self.header,
+                verify=self.verify_certificate,
+            )
+            if not response.ok:
+                eval_logger.warning(
+                    f"API request failed with error message: {response.text}. Retrying..."
+                )
+            response.raise_for_status()
+            return response.json()
+        except RetryError:
+            eval_logger.error(
+                "API request failed after multiple retries. Please check the API status."
+            )
+            return None
+
+    async def amodel_call(
+        self,
+        session: ClientSession,
+        messages: Union[List[List[int]], List[str], List[JsonChatStr]],
+        *,
+        generate: bool = True,
+        cache_keys: list = None,
+        ctxlens: Optional[List[int]] = None,
+        gen_kwargs: Optional[Dict] = None,
+        **kwargs,
+    ) -> Union[List[str], List[Tuple[float, bool]], None]:
+        # !!! Copy: shared dict for each request, need new object !!!
+        gen_kwargs = copy.deepcopy(gen_kwargs)
+        payload = self._create_payload(
+            self.create_message(messages),
+            generate=generate,
+            gen_kwargs=gen_kwargs,
+            seed=self._seed,
+            **kwargs,
+        )
+        cache_method = "generate_until" if generate else "loglikelihood"
+        try:
+            async with session.post(
+                self.base_url,
+                json=payload,
+                headers=self.header,
+            ) as response:
+                if not response.ok:
+                    error_text = await response.text()
+                    eval_logger.warning(
+                        f"API request failed with error message: {error_text}. Retrying..."
+                    )
+                # raising exception will retry the request
+                response.raise_for_status()
+                outputs = await response.json()
+            answers = (
+                self.parse_generations(
+                    outputs=outputs,
+                )
+                if generate
+                else self.parse_logprobs(
+                    outputs=outputs,
+                    tokens=messages,
+                    ctxlens=ctxlens,
+                )
+            )
+            if cache_keys:
+                for res, cache in zip(answers, cache_keys):
+                    self.cache_hook.add_partial(cache_method, cache, res)
+            return answers
+        # If the retries also fail
+        except RetryError:
+            eval_logger.error(
+                "API request failed after multiple retries. Please check the API status."
+            )
+            return None
+
+    def batch_loglikelihood_requests(
+        self, chunks: Iterable[List[LogLikelihoodInputs]]
+    ) -> Tuple[List[List[int]], List[int], List[Tuple[str, str]]]:
+        inputs = []
+        ctxlens = []
+        cache_keys = []
+        for chunk in chunks:
+            for cache_key, context_enc, continuation_enc in chunk:
+                # max_length - 1 as we always have 1 token for generation
+                inp = (context_enc + continuation_enc)[-(self.max_length) :]
+                ctxlen = len(context_enc) - max(
+                    0, len(context_enc) + len(continuation_enc) - (self.max_length)
+                )
+
+                inputs.append(inp)
+                ctxlens.append(ctxlen)
+                cache_keys.append(cache_key)
+        return inputs, ctxlens, cache_keys
+
+    async def get_batched_requests(
+        self,
+        requests: list,
+        cache_keys: list,
+        *,
+        generate: bool = True,
+        ctxlens: List[int] = None,
+        **kwargs,
+    ) -> Union[List[List[str]], List[List[Tuple[float, bool]]]]:
+        ctxlens = ctxlens if ctxlens else [None] * len(requests)
+        conn = TCPConnector(limit=self._concurrent)
+        async with ClientSession(connector=conn) as session:
+            retry_: Callable[..., Awaitable[Any]] = retry(
+                stop=stop_after_attempt(self.max_retries),
+                wait=wait_exponential(multiplier=0.5, min=1, max=10),
+                reraise=True,
+            )(self.amodel_call)
+            # Create tasks for each batch of request
+            tasks = [
+                asyncio.create_task(
+                    retry_(
+                        session=session,
+                        messages=message,
+                        cache_keys=cache_key,
+                        generate=generate,
+                        ctxlens=ctxlen,
+                        **kwargs,
+                    )
+                )
+                for message, cache_key, ctxlen in zip(
+                    chunks(requests, n=self._batch_size),
+                    chunks(cache_keys, n=self._batch_size),
+                    chunks(ctxlens, n=self._batch_size),
+                )
+            ]
+
+            return await tqdm_asyncio.gather(*tasks, desc="Requesting API")
+
+    def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
+        assert (
+            self.tokenizer is not None
+        ), "Tokenizer is required for loglikelihood tasks to compute context lengths."
+        res = []
+
+        def _collate(req: LogLikelihoodInputs):
+            """Defines the key for the sorted method"""
+            # the negative sign on len(toks) sorts descending - this has a few advantages:
+            # - time estimates will always be over not underestimates, which is more useful for planning
+            # - to know the size of a batch when going through the list, you know the first one is always the batch
+            #   padded context length. this is useful to simplify the batching logic and more importantly to make
+            #   automatic adaptive batches much much easier to implement
+            # - any OOMs will happen right away rather than near the end
+
+            toks = req[1] + req[2]
+            return -len(toks), tuple(toks)
+
+        re_ord = Collator(
+            requests,
+            sort_fn=_collate,
+            group_by=None,
+        )
+        # if concurrent then we'll batch in the async context
+        chunked = re_ord.get_batched(n=self._batch_size if self._concurrent <= 1 else 0)
+        if self._concurrent <= 1:
+            pbar = tqdm(desc="Requesting API", total=len(requests))
+            for chunk in chunked:
+                inputs, ctxlens, cache_keys = self.batch_loglikelihood_requests([chunk])
+
+                outputs = retry(
+                    stop=stop_after_attempt(self.max_retries),
+                    wait=wait_exponential(multiplier=0.5, min=1, max=10),
+                    reraise=True,
+                )(self.model_call)(messages=inputs, generate=False)
+                if isinstance(outputs, dict):
+                    outputs = [outputs]
+                for answer_, cache_key in zip(
+                    self.parse_logprobs(
+                        outputs=outputs, tokens=inputs, ctxlens=ctxlens
+                    ),
+                    cache_keys,
+                ):
+                    if answer_ is not None:
+                        res.append(answer_)
+                        # cache requests that aren't from a loglikelihood_rolling request
+                        if cache_key is not None:
+                            self.cache_hook.add_partial(
+                                "loglikelihood", cache_key, answer_
+                            )
+                        pbar.update(1)
+        else:
+            inputs, ctxlens, cache_keys = self.batch_loglikelihood_requests(chunked)
+            res = itertools.chain.from_iterable(
+                asyncio.run(
+                    self.get_batched_requests(
+                        inputs, cache_keys, generate=False, ctxlens=ctxlens
+                    )
+                )
+            )
+
+        return re_ord.get_original(res)
+
+    def generate_until(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[str]:
+        res = []
+
+        def _collate_gen(_requests):
+            # sort by the length of the non-tokenized contexts
+            return -len(_requests[0])
+
+        # Let the API deal with tokenization
+        requests, all_gen_kwargs = zip(*(req.args for req in requests))
+        if self.tokenized_requests:
+            encodings_list = self.tok_encode(
+                requests, add_special_tokens=self.add_bos_token
+            )
+        else:
+            encodings_list = [None] * len(requests)
+        requests = [
+            (a, b, c) for a, b, c in zip(requests, all_gen_kwargs, encodings_list)
+        ]
+
+        re_ord = Collator(
+            requests,
+            sort_fn=_collate_gen,
+            group_by="gen_kwargs",
+        )
+        chunked = re_ord.get_batched(
+            n=self._batch_size if self._concurrent <= 1 else 0, batch_fn=None
+        )
+        if self._concurrent <= 1:
+            pbar = tqdm(desc="Requesting API", total=len(requests))
+            for chunk in chunked:
+                contexts, all_gen_kwargs, encodings_list = zip(*chunk)
+                req = encodings_list if self.tokenized_requests else contexts
+                outputs = retry(
+                    stop=stop_after_attempt(self.max_retries),
+                    wait=wait_exponential(multiplier=0.5, min=1, max=10),
+                    reraise=True,
+                )(self.model_call)(
+                    messages=req,
+                    generate=True,
+                    gen_kwargs=copy.deepcopy(all_gen_kwargs[0]),
+                )
+                for generated_text, context in zip(
+                    self.parse_generations(
+                        outputs=outputs,
+                        contexts=contexts,
+                    ),
+                    contexts,
+                ):
+                    if generated_text is not None:
+                        res.append(generated_text)
+
+                        # partial caching
+                        if context is not None:
+                            self.cache_hook.add_partial(
+                                "generate_until",
+                                (context, all_gen_kwargs[0]),
+                                generated_text,
+                            )
+                            pbar.update(1)
+        else:
+            for chunk in chunked:
+                contexts, all_gen_kwargs, encodings_list = zip(*chunk)
+                req = encodings_list if self.tokenized_requests else contexts
+                results = itertools.chain.from_iterable(
+                    asyncio.run(
+                        self.get_batched_requests(
+                            req,
+                            cache_keys=[(ctx, all_gen_kwargs[0]) for ctx in contexts],
+                            generate=True,
+                            gen_kwargs=copy.deepcopy(all_gen_kwargs[0]),
+                        )
+                    )
+                )
+                res.extend(results)
+
+        return re_ord.get_original(res)
+
+    def loglikelihood_rolling(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[float]:
+        loglikelihoods = []
+
+        for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
+            rolling_token_windows = list(
+                map(
+                    utils.make_disjoint_window,
+                    utils.get_rolling_token_windows(
+                        token_list=self.tok_encode(string),
+                        prefix_token=self.prefix_token_id,
+                        # max_seq_len - (1 for context)
+                        max_seq_len=self.max_length - 1,
+                        context_len=1,
+                    ),
+                )
+            )
+
+            # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
+            rolling_token_windows = [(None,) + x for x in rolling_token_windows]
+
+            string_nll = self._loglikelihood_tokens(
+                rolling_token_windows,
+                disable_tqdm=True,
+            )
+
+            # discard is_greedy
+            string_nll = [x[0] for x in string_nll]
+
+            string_nll = sum(string_nll)
+            loglikelihoods.append(string_nll)
+
+            # cache this loglikelihood_rolling request
+            self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
+        return loglikelihoods
--- a/lm_eval/models/dummy.py
+++ b/lm_eval/models/dummy.py
@@ -26,9 +26,9 @@ class DummyLM(LM):
    def generate_until(self, requests, disable_tqdm: bool = False):
        res = []

-        for ctx, _ in tqdm(requests, disable=disable_tqdm):
+        for request in tqdm(requests, disable=disable_tqdm):
            res.append("lol")
-            assert ctx.strip() != ""
+            assert request.arguments[0].strip() != ""

        return res


--- a/lm_eval/models/gguf.py
+++ b/lm_eval/models/gguf.py
@@ -68,7 +68,9 @@ class GGUFLM(LM):
                logger.error(f"RequestException: {e}")
                time.sleep(delay)  # wait before retrying
        else:
-            raise Exception(f"Failed to get a valid response after {retries} retries.")
+            raise RuntimeError(
+                f"Failed to get a valid response after {retries} retries."
+            )

    def loglikelihood(self, requests, disable_tqdm: bool = False):
        if not requests:

--- a/lm_eval/models/hf_vlms.py
+++ b/lm_eval/models/hf_vlms.py
+import copy
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import transformers
+from tqdm import tqdm
+from transformers import BatchEncoding
+
+from lm_eval import utils
+from lm_eval.api.instance import Instance
+from lm_eval.api.registry import register_model
+from lm_eval.models.huggingface import HFLM
+from lm_eval.models.utils import (
+    Collator,
+    flatten_image_list,
+    pad_and_concat,
+    replace_placeholders,
+    stop_sequences_criteria,
+)
+
+
+DEFAULT_IMAGE_PLACEHOLDER = "<image>"
+
+
+eval_logger = utils.eval_logger
+
+
+@register_model("hf-multimodal")
+class HFMultimodalLM(HFLM):
+    """
+    An abstracted Hugging Face model class for multimodal LMs like Llava and Idefics.
+    """
+
+    AUTO_MODEL_CLASS = transformers.AutoModelForVision2Seq
+    MULTIMODAL = True  # flag to indicate, for now, that this model type can run multimodal requests
+
+    def __init__(
+        self,
+        pretrained: Union[str, transformers.PreTrainedModel],
+        image_token_id: Optional[int] = None,
+        image_string: Optional[str] = None,
+        interleave: bool = True,
+        # TODO: handle whitespace in image placeholder (replacement)
+        max_images: Optional[int] = 999,
+        convert_img_format=False,
+        **kwargs,
+    ):
+        # We initialize using HFLM's init. Sub-methods like _create_model and _create_tokenizer
+        # modify init behavior.
+        super().__init__(pretrained, **kwargs)
+
+        assert (
+            self.batch_size != "auto"
+        ), "Batch size 'auto' is not yet supported for hf-multimodal models."
+        self.chat_applied: bool = False
+        # TODO: phi-3.5 "image placeholders" are <image_1>, <image_2>, ... in order. how to handle this case
+
+        # HF AutoModelForVision2Seq models have an `image_token_id` value in their configs
+        # denoting the token which indicates a location where an image will be substituted in.
+        # This can take different string values across models, e.g. <image> for Idefics2 and <|image_pad|> for Qwen2-VL
+        self.interleave = interleave
+        self.max_images = max_images
+        self.rgb = convert_img_format
+        # WARNING: improperly set image_token_id can lead to ignored image input or other (potentially silent) errors!
+        if not image_string:
+            self.image_token_id = (
+                int(image_token_id)
+                if image_token_id
+                else (
+                    getattr(self.config, "image_token_id", None)
+                    or getattr(self.config, "image_token_index", None)
+                )
+            )
+            assert (
+                self.image_token_id is not None
+            ), "Must have a non-None image_token_id to evaluate a Hugging Face AutoModelForVision2Seq model. Please pass `image_token_id` in `--model_args` if model's config does not already specify one."
+            # get the string this token ID corresponds to
+            self.image_token = self.tok_decode(
+                [self.image_token_id], skip_special_tokens=False
+            )
+            if image_token_id is not None:
+                eval_logger.info(
+                    f"A non-default image_token_id with image_token_id={self.image_token_id} and string value '{self.image_token}' was specified manually. Note that using an improper image_token placeholder may lead to ignored image input or errors!"
+                )
+        else:
+            eval_logger.info(
+                f"A non-default image_token string with string value image_string='{image_string}' was specified manually. Note that using an improper image_token placeholder may lead to ignored image input or errors!"
+            )
+            self.image_token = image_string
+
+    def _create_tokenizer(
+        self,
+        pretrained: Union[str, transformers.PreTrainedModel],
+        tokenizer: Optional[
+            Union[
+                str,
+                transformers.ProcessorMixin,
+            ]
+        ],
+        revision: Optional[str] = "main",
+        trust_remote_code: Optional[bool] = False,
+        **kwargs,
+    ) -> None:
+        """
+        Helper method during initialization.
+
+        For the multimodal variant, we initialize not just
+        `self.tokenizer` but also `self.processor`.
+        """
+
+        if tokenizer:
+            if isinstance(tokenizer, str):
+                return transformers.AutoProcessor.from_pretrained(
+                    tokenizer,
+                    revision=revision,
+                    trust_remote_code=trust_remote_code,
+                    # use_fast=use_fast_tokenizer,
+                )
+            else:
+                assert isinstance(
+                    tokenizer, transformers.ProcessorMixin
+                )  # TODO: check this condition
+                return tokenizer
+
+        # Get tokenizer based on 'pretrained'
+        if isinstance(pretrained, str):
+            model_name = pretrained
+        else:
+            # get the HF hub name via accessor on model
+            model_name = self.model.name_or_path
+
+        self.processor = transformers.AutoProcessor.from_pretrained(
+            model_name,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            # use_fast=use_fast_tokenizer,
+        )
+
+        self.tokenizer = self.processor.tokenizer
+
+    def tok_multimodal_encode(
+        self, string, images, left_truncate_len=None, add_special_tokens=None
+    ):
+        """Helper function which encodes an image + string combo using AutoProcessor"""
+        # We inherit special token kwarg setup from HFLM.tok_encode
+        # special_tokens_kwargs = {}
+
+        # by default for CausalLM - false or self.add_bos_token is set
+        # if add_special_tokens is None:
+        #     special_tokens_kwargs = {"add_special_tokens": False or self.add_bos_token}
+        # otherwise the method explicitly defines the value
+        # else:
+        #     special_tokens_kwargs = {"add_special_tokens": add_special_tokens}
+
+        # encode text+images
+        # TODO: why does (Qwen2-VL) processor error when attempting to add special tokens to text?
+        encoding = self.processor(
+            text=string, images=images, return_tensors=None
+        )  # , **special_tokens_kwargs)
+
+        # remove (and store) our tokenized text
+        text_encoding = encoding.pop("input_ids")
+        encoding.pop("attention_mask")
+
+        # left-truncate the encoded context to be at most `left_truncate_len` tokens long
+        if left_truncate_len:
+            text_encoding = text_encoding[-left_truncate_len:]
+
+        return text_encoding, encoding  # image_encoding is a dict
+
+    def _encode_multimodal_pair(self, context, continuation, images):
+        """Helper function to perform the role of TemplateLM._encode_pair
+        Except allowing for image input to also be processed alongside `context`.
+
+        This method is a bit messy due to the need to defer conversion of image and text token input
+        into PyTorch tensors until the main inference loop.
+        """
+
+        n_spaces = len(context) - len(context.rstrip())
+        if n_spaces > 0:
+            continuation = context[-n_spaces:] + continuation
+            context = context[:-n_spaces]
+
+        # TODO: replace default <image> placeholder with self.image_token, for contexts
+
+        whole_enc, image_enc = self.tok_multimodal_encode(
+            context + continuation, images
+        )
+        context_enc, _ = self.tok_multimodal_encode(context, images)
+
+        # tok_multimodal_encode returns List[List[int]] for tokenized text. Get rid of the batch dim
+        # since we only are encoding a single string.
+        # TODO: this is a bit hacky, it'd be nice to make this generally cleaner
+        whole_enc, context_enc = whole_enc[0], context_enc[0]
+
+        context_enc_len = len(context_enc)
+        continuation_enc = whole_enc[context_enc_len:]
+
+        return context_enc, continuation_enc, image_enc
+
+    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
+        self.chat_applied = True
+        if not self.interleave:
+            for content in chat_history:
+                c = []
+                text = content["content"]
+
+                # Count and remove image placeholders
+                image_count = min(
+                    self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER)
+                )
+                text = text.replace(DEFAULT_IMAGE_PLACEHOLDER, "")
+
+                # Add image entries
+                for _ in range(image_count):
+                    c.append({"type": "image", "image": None})
+
+                # Add single text entry at the end
+                c.append({"type": "text", "text": text})
+
+                content["content"] = c
+        else:
+            for content in chat_history:
+                c = []
+                text = content["content"]
+                expected_image_count = min(
+                    self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER)
+                )
+                actual_image_count = 0
+
+                text_parts = text.split(DEFAULT_IMAGE_PLACEHOLDER)
+
+                for i, part in enumerate(text_parts):
+                    # TODO: concatenate text parts (esp. if skipping images)?
+                    if part:  # Add non-empty text parts
+                        c.append({"type": "text", "text": part})
+                    if (
+                        (i < len(text_parts) - 1) and i < self.max_images
+                    ):  # Add image placeholder after each split except the last
+                        c.append({"type": "image"})
+                        actual_image_count += 1
+
+                content["content"] = c
+
+                if actual_image_count != expected_image_count:
+                    raise ValueError(
+                        f"Mismatch in image placeholder count. Expected: {expected_image_count}, Actual: {actual_image_count}"
+                    )
+
+        return self.processor.apply_chat_template(
+            chat_history, add_generation_prompt=True
+        )
+
+    def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
+        if hasattr(self.processor, "apply_chat_template"):
+            _tokenizer = self.tokenizer
+            self.tokenizer = self.processor
+
+            selected_template = super().chat_template(chat_template)
+
+            self.tokenizer = _tokenizer
+            return selected_template
+        else:
+            return super().chat_template(chat_template)
+
+    def tok_batch_multimodal_encode(
+        self,
+        strings: List[str],  # note that input signature of this fn is different
+        images: List[List],  # TODO: images are pil.Image at the moment, update typehint
+        padding_side: str = "left",
+        left_truncate_len: int = None,
+        truncation: bool = False,
+    ) -> Union[
+        BatchEncoding, Dict[str, torch.Tensor]
+    ]:  # note that this return signature differs from HFLM tok_batch_encode.
+        # NOTE: here, we replace <image> tags with our model's corresponding image_token string value.
+        if not self.chat_applied:
+            # TODO<baber>: This still keeps the whitespace in the image placeholder, which is not ideal.
+            strings = [
+                replace_placeholders(
+                    string, DEFAULT_IMAGE_PLACEHOLDER, self.image_token, self.max_images
+                )
+                for string in strings
+            ]
+
+        # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
+        old_padding_side = self.tokenizer.padding_side
+        self.tokenizer.padding_side = padding_side
+
+        # add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
+
+        images = [img[: self.max_images] for img in images]
+        if self.rgb:
+            images = [[img.convert("RGB") for img in sublist] for sublist in images]
+
+        # certain models like llava expect a single-level image list even for bs>1, multi-image. TODO: port this over to loglikelihoods
+        if getattr(self.config, "model_type", "") == "llava":
+            images = flatten_image_list(images)
+
+        encoding = self.processor(
+            images=images,
+            text=strings,
+            truncation=truncation,
+            padding="longest",
+            return_tensors="pt",
+            # **add_special_tokens, # TODO: at least some Processors error out when passing this. How do we control whether text gets BOS added?
+        )
+
+        encoding.to(  # TODO: our other tokenization methods in HFLM don't typically move to device. this breaks convention
+            self.device, self.model.dtype
+        )  # TODO: This only casts the pixel values. Should they always be float16?
+        if left_truncate_len:
+            encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
+            encoding["attention_mask"] = encoding["attention_mask"][
+                :, -left_truncate_len:
+            ]
+        self.tokenizer.padding_side = old_padding_side
+
+        return encoding
+
+    def _model_multimodal_call(self, inps, imgs, attn_mask=None, labels=None):
+        """
+        TODO: update docstring
+        """
+        # note: imgs is a dict.
+        with torch.no_grad():
+            return self.model(inps, **imgs).logits
+
+    def _model_multimodal_generate(self, inputs, max_length, stop, **generation_kwargs):
+        generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
+        do_sample = generation_kwargs.get("do_sample", None)
+
+        # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
+        if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
+            generation_kwargs["do_sample"] = do_sample = False
+
+        if do_sample is False and generation_kwargs.get("temperature") == 0.0:
+            generation_kwargs.pop("temperature")
+
+        stopping_criteria = stop_sequences_criteria(
+            self.tokenizer,
+            stop,
+            inputs["input_ids"].shape[1],
+            inputs["input_ids"].shape[0],
+        )
+        return self.model.generate(
+            **inputs,
+            max_length=max_length,
+            stopping_criteria=stopping_criteria,
+            pad_token_id=self.tokenizer.pad_token_id,
+            use_cache=True,
+            **generation_kwargs,
+        )
+
+    def _batch_images(self, image_encs):
+        """
+        Helper function: batch together image encodings across examples in a batch.
+        # TODO: for variable-sized images, this may break down.
+        """
+        batched_imgs = {}
+        for key in image_encs[0].keys():
+            batched_imgs[key] = torch.cat(
+                [
+                    torch.tensor(
+                        image_enc[key], device=self.device, dtype=self.model.dtype
+                    )
+                    for image_enc in image_encs
+                ],
+                dim=0,
+            )
+        return batched_imgs
+
+    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
+        raise NotImplementedError(
+            "model type `hf-multimodal` does not support loglikelihood_rolling. Use 'hf' model type for text-only loglikelihood_rolling tasks ",
+            "this is because we do not support measuring the loglikelihood a model assigns to an image.",
+        )
+
+    def loglikelihood(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[Tuple[float, bool]]:
+        raise NotImplementedError(
+            "'loglikelihood' requests for model type `hf-multimodal` are not yet tested. This feature will be enabled when a loglikelihood-based multiple-choice VQA dataset is added!"
+        )
+
+        new_reqs = []
+        for context, continuation, aux_arguments in [req.args for req in requests]:
+            if context == "":
+                raise ValueError(
+                    "Must get non-empty context for multimodal requests! You might be trying to run 'loglikelihood_rolling', which is not supported in the multimodal case."
+                )
+            else:
+                visuals = aux_arguments["visual"]
+
+                context_enc, continuation_enc, image_enc = self._encode_multimodal_pair(
+                    context, continuation, visuals
+                )
+            # TODO: key to pick for caching images
+            new_reqs.append(
+                (
+                    (context, continuation, visuals),
+                    context_enc,
+                    continuation_enc,
+                    image_enc,
+                )
+            )
+
+        return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm)
+
+    def _loglikelihood_tokens(
+        self,
+        requests: List[
+            Tuple[Tuple[None, str, str], List[int], List[int], List[int]]
+        ],  # TODO: update typehint to be correct
+        disable_tqdm: bool = False,
+        override_bs: int = None,
+    ) -> List[Tuple[float, bool]]:
+        res = []
+
+        # TODO: **improve multimodal collation.** We currently ignore image size when ordering docs. ideally we'd take them into account
+        def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]):
+            """Defines the key for the sorted method"""
+            # the negative sign on len(toks) sorts descending - this has a few advantages:
+            # - time estimates will always be over not underestimates, which is more useful for planning
+            # - to know the size of a batch when going through the list, you know the first one is always the batch
+            #   padded context length. this is useful to simplify the batching logic and more importantly to make
+            #   automatic adaptive batches much much easier to implement
+            # - any OOMs will happen right away rather than near the end
+            toks = req[1] + req[2]
+            return -len(toks), tuple(toks)
+
+        def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]):
+            """Defines the key to group and lookup one-token continuations"""
+            # Use with group_by="contexts" (optional)"
+            # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
+            # speeds up some multiple-choice tasks proportionally to the number of choices.
+            # groups requests by context+continuation[:-1] and infer on one request/group.
+            return req[-1] + req[-3] + req[-2][:-1]
+
+        re_ord = Collator(
+            requests,
+            sort_fn=_collate,
+            group_by="contexts"  # TODO: can't group-by just "contexts" any more, need to incorporate imgs
+            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
+            and self.logits_cache
+            else None,
+            group_fn=_lookup_one_token_cont,
+        )
+
+        # automatic (variable) batch size detection for vectorization
+        # pull longest context sample from request
+        n_reordered_requests = len(re_ord)
+        batch_size = (
+            self.batch_size
+            if self.batch_size != "auto"
+            else override_bs
+            if override_bs is not None
+            else 0
+        )
+        batch_fn = (
+            self._batch_scheduler
+            if self.batch_size == "auto"
+            and n_reordered_requests > 0
+            and not override_bs
+            else None
+        )
+
+        chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn)
+        pbar = tqdm(
+            total=len(requests),
+            disable=(disable_tqdm or (self.rank != 0)),
+            desc="Running loglikelihood requests with text+image input",
+        )
+        for chunk in chunks:
+            imgs = []
+            inps = []
+            cont_toks_list = []
+            inplens = []
+
+            padding_len_inp = None
+            # because vectorizing is annoying, we first convert each (context, continuation) pair to padded
+            # tensors, then we pack them together into a batch, call the model, and then pick it all apart
+            # again because vectorizing is annoying
+
+            for _, context_enc, continuation_enc, image_enc in chunk:
+                # sanity check
+                assert len(image_enc) > 0
+                assert len(context_enc) > 0
+                assert len(continuation_enc) > 0
+                assert len(continuation_enc) <= self.max_length
+
+                # how this all works (illustrated on a causal decoder-only setup):
+                #          CTX      CONT
+                # inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by inp[:, :-1]
+                # model  \               \
+                # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
+                # cont_toks      4 5 6 7 8 9      [:, -len(continuation_enc):, :self.vocab_size] slice
+
+                # when too long to fit in context, truncate from the left
+                # TODO: assuming that we won't handle enc-dec Vision2Seq models. Is that a safe assumption?
+                inp = torch.tensor(
+                    (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
+                    dtype=torch.long,
+                    device=self.device,
+                )
+                (inplen,) = inp.shape
+
+                padding_len_inp = (
+                    max(padding_len_inp, inplen)
+                    if padding_len_inp is not None
+                    else inplen
+                )
+
+                inps.append(inp)  # [1, inp_length]
+                cont_toks_list.append(continuation_enc)
+                inplens.append(inplen)
+
+                imgs.append(image_enc)
+
+            # create encoder attn mask and batched conts, if seq2seq
+            call_kwargs = {}
+            batched_inps = pad_and_concat(
+                padding_len_inp, inps, padding_side="right"
+            )  # [batch, padding_len_inp]
+            # batch our examples' image inputs together
+            batched_imgs = self._batch_images(
+                imgs
+            )  # TODO: fix/test for bs>1 case with differently-sized imgs!
+
+            multi_logits = F.log_softmax(
+                self._model_multimodal_call(batched_inps, batched_imgs, **call_kwargs),
+                dim=-1,
+            )  # [batch, padding_length (inp or cont), vocab]
+
+            for (
+                request_str,
+                ctx_tokens,
+                _,
+                image_encs,
+            ), logits, inplen, cont_toks in zip(
+                chunk, multi_logits, inplens, cont_toks_list
+            ):
+                # Slice to original seq length
+                contlen = len(cont_toks)
+                # take only logits in the continuation
+                # (discard context toks if decoder-only ; discard right-padding)
+                # also discards + checks for "virtual tokens" in the causal LM's input window
+                # from prompt/prefix tuning tokens, if applicable
+                ctx_len = (
+                    inplen + (logits.shape[0] - padding_len_inp)
+                    if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
+                    else None
+                )
+                logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
+                logits = logits.unsqueeze(0)  # [1, seq, vocab]
+
+                # Check if per-token argmax is exactly equal to continuation
+                greedy_tokens = logits.argmax(dim=-1)
+
+                # check for one-token continuation cache hits.
+                # noop in case group_by != "contexts" or no cache hit and returns the
+                # original args. Otherwise, expands the logits batch dimension and yields each
+                # batch along with matching continuation tokens and prompt strings.
+                # logits -> [1, seq, vocab]
+                for request_str, cont_toks, logits in re_ord.get_cache(
+                    req_str=request_str,
+                    cxt_toks=ctx_tokens,
+                    cont_toks=cont_toks,
+                    logits=logits,
+                ):
+                    cont_toks = torch.tensor(
+                        cont_toks, dtype=torch.long, device=self.device
+                    ).unsqueeze(0)  # [1, seq]
+                    max_equal = (greedy_tokens == cont_toks).all()
+
+                    # Obtain log-probs at the corresponding continuation token indices
+                    # last_token_slice = logits[:, -1, :].squeeze(0).tolist()
+                    logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
+                        -1
+                    )  # [1, seq]
+
+                    # Answer: (log prob, is-exact-match)
+                    answer = (float(logits.sum()), bool(max_equal))
+
+                    res.append(answer)
+
+                    self.cache_hook.add_partial(
+                        "loglikelihood", request_str, answer
+                    )  # TODO: choose convention for adding images into the cache key
+                    pbar.update(1)
+
+        pbar.close()
+
+        return re_ord.get_original(res)
+
+    def generate_until(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[str]:
+        # TODO: back out to HFLM.generate_until() for all requests without aux_arguments (text-only reqs)
+        res = []
+
+        def _collate(x):
+            # the negative sign on len(toks) sorts descending - this has a few advantages:
+            # - time estimates will always be over not underestimates, which is more useful for planning
+            # - to know the size of a batch when going through the list, you know the first one is always the batch
+            #   padded context length. this is useful to simplify the batching logic and more importantly to make
+            #   automatic adaptive batches much much easier to implement
+            # - any OOMs will happen right away rather than near the end
+            toks = self.tok_encode(x[0])
+            return -len(toks), x[0]
+
+        pbar = tqdm(
+            total=len(requests),
+            disable=(disable_tqdm or (self.rank != 0)),
+            desc="Running generate_until requests with text+image input",
+        )
+        # TODO: port auto-batch sizing into this.
+
+        # we group requests by their generation_kwargs,
+        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
+        # in the same batch.
+        re_ords = Collator(
+            [reg.args for reg in requests],
+            _collate,
+            group_by="gen_kwargs",
+            group_fn=lambda x: x[1],
+        )
+        chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
+
+        ### Up to here: was identical to non-multimodal HFLM generate_until ###
+
+        for chunk in chunks:
+            contexts, all_gen_kwargs, aux_arguments = zip(*chunk)
+
+            visuals = [arg["visual"] for arg in aux_arguments]
+
+            if not isinstance(contexts, list):
+                contexts = list(
+                    contexts
+                )  # for Qwen2-VL, processor is unhappy accepting a tuple of strings instead of a list.
+                # TODO: could we upstream this workaround to HF?
+            ### this part onward: same as HFLM ###
+
+            # we assume all gen kwargs in the batch are the same
+            # this is safe to assume because the `grouper` object ensures it.
+            gen_kwargs = all_gen_kwargs[0]
+            # unpack our keyword arguments.
+            until = None
+            if isinstance(gen_kwargs, dict):
+                kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
+                if "until" in kwargs.keys():
+                    until = kwargs.pop("until")
+                    if isinstance(until, str):
+                        until = [until]
+                    elif not isinstance(until, list):
+                        raise ValueError(
+                            f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
+                        )
+            else:
+                raise ValueError(
+                    f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
+                )
+            # add EOS token to stop sequences
+            eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
+            if not until:
+                until = [eos]
+            else:
+                until.append(eos)
+            if "max_gen_toks" in kwargs.keys():
+                max_gen_toks = kwargs.pop("max_gen_toks")
+            else:
+                max_gen_toks = self.max_gen_toks
+
+            ### end stuff that's entirely copied verbatim from HFLM ###
+
+            max_ctx_len = self.max_length - max_gen_toks
+
+            inputs = self.tok_batch_multimodal_encode(
+                contexts,
+                visuals,
+                left_truncate_len=max_ctx_len,
+                truncation=self.truncation,
+            )
+
+            context_enc = inputs["input_ids"]
+
+            if "max_length" not in kwargs:
+                kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
+
+            cont = self._model_multimodal_generate(inputs, stop=until, **kwargs)
+
+            del inputs
+            torch.cuda.empty_cache()
+            import gc
+
+            gc.collect()
+
+            ### essentially same as HFLM beyond this line!
+
+            cont_toks_list = cont.tolist()
+            for cont_toks, context in zip(cont_toks_list, contexts):
+                # discard context + left-padding toks if using causal decoder-only VLM
+                cont_toks = cont_toks[context_enc.shape[1] :]
+
+                s = self.tok_decode(cont_toks)
+
+                # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
+                for term in until:
+                    if len(term) > 0:
+                        # ignore '' separator,
+                        # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
+                        s = s.split(term)[0]
+
+                res.append(s)
+                self.cache_hook.add_partial(
+                    "generate_until", (context, gen_kwargs), s
+                )  # TODO: cache key for multimodal input should be what?
+                pbar.update(1)
+        # reorder this group of results back to original unsorted form
+        res = re_ords.get_original(res)
+
+        pbar.close()
+        return res
--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -9,10 +9,10 @@ import torch.nn.functional as F
 import transformers
 from accelerate import (
    Accelerator,
-    DistributedType,
    InitProcessGroupKwargs,
    find_executable_batch_size,
 )
+from accelerate.utils import get_max_memory
 from huggingface_hub import HfApi
 from packaging import version
 from peft import PeftModel
@@ -30,6 +30,7 @@ from lm_eval.api.registry import register_model
 from lm_eval.models.utils import (
    Collator,
    clear_torch_cache,
+    configure_pad_token,
    get_dtype,
    pad_and_concat,
    stop_sequences_criteria,
@@ -39,31 +40,6 @@ from lm_eval.models.utils import (
 eval_logger = utils.eval_logger


-def _get_accelerate_args(
-    device_map_option: Optional[str] = "auto",
-    max_memory_per_gpu: Optional[Union[int, str]] = None,
-    max_cpu_memory: Optional[Union[int, str]] = None,
-    offload_folder: Optional[str] = "./offload",
-    gpus: Optional[int] = None,
-) -> dict:
-    """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
-    max_memory = {}
-    if max_memory_per_gpu is not None:
-        max_memory_per_gpu_map = {
-            device_idx: max_memory_per_gpu for device_idx in range(gpus)
-        }
-        max_memory.update(max_memory_per_gpu_map)
-    if max_cpu_memory is not None:
-        max_memory["cpu"] = max_cpu_memory
-
-    args = {}
-    if max_memory:
-        args["max_memory"] = max_memory
-    args["device_map"] = device_map_option
-    args["offload_folder"] = offload_folder
-    return args
-
-
 @register_model("hf-auto", "hf", "huggingface")
 class HFLM(TemplateLM):
    """
@@ -79,7 +55,7 @@ class HFLM(TemplateLM):
    def __init__(
        self,
        pretrained: Union[str, transformers.PreTrainedModel],
-        backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
+        backend: Literal["default", "causal", "seq2seq"] = "default",
        # override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
        revision: Optional[str] = "main",
        subfolder: Optional[str] = None,
@@ -104,7 +80,6 @@ class HFLM(TemplateLM):
        # arguments used for splitting a model across GPUs naively.
        # only used if `parallelize=True`.
        parallelize: Optional[bool] = False,
-        device_map_option: Optional[str] = "auto",
        max_memory_per_gpu: Optional[Union[int, str]] = None,
        max_cpu_memory: Optional[Union[int, str]] = None,
        offload_folder: Optional[Union[str, os.PathLike]] = "./offload",
@@ -112,10 +87,10 @@ class HFLM(TemplateLM):
        peft: Optional[str] = None,
        delta: Optional[str] = None,
        autogptq: Optional[Union[bool, str]] = False,
+        gptqmodel: Optional[bool] = False,
        **kwargs,
    ) -> None:
        super().__init__()
-
        # optionally: take in an already-initialized transformers.PreTrainedModel
        if not isinstance(pretrained, str):
            eval_logger.warning(
@@ -127,21 +102,6 @@ class HFLM(TemplateLM):
            self._config = self._model.config
            gpus = 0

-            if tokenizer:
-                assert isinstance(
-                    tokenizer, transformers.PreTrainedTokenizer
-                ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
-                self.tokenizer = tokenizer
-            else:
-                # Get tokenizer
-                model_name = self._model.name_or_path
-                self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-                    model_name,
-                    revision=revision,
-                    trust_remote_code=trust_remote_code,
-                    use_fast=use_fast_tokenizer,
-                )
-
        else:
            assert isinstance(device, str)
            assert isinstance(pretrained, str)
@@ -156,6 +116,7 @@ class HFLM(TemplateLM):
            if "npu" in accelerator.device.type:
                gpus = torch.npu.device_count()

+            # using one process with no model parallelism
            if not (parallelize or accelerator.num_processes > 1):
                # use user-passed device
                device_list = set(
@@ -181,14 +142,19 @@ class HFLM(TemplateLM):
                        if torch.cuda.is_available()
                        else torch.device("cpu")
                    )
-            else:
+            else:  # Parallelism managed by accelerate
                if device != "cuda":
                    eval_logger.info(
                        f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
                    )
                # TODO: include in warning that `load_in_8bit` etc. affect this too
-                self._device = torch.device(device)
+                self._device = (
+                    self.accelerator.device
+                    if hasattr(self, "accelerator")
+                    else torch.device(device)
+                )

+            revision = str(revision)  # cast to string if not already one
            # TODO: update this to be less of a hack once subfolder is fixed in HF
            revision = revision + ("/" + subfolder if subfolder is not None else "")

@@ -198,7 +164,7 @@ class HFLM(TemplateLM):
                trust_remote_code=trust_remote_code,
            )

-        # determine which of 'causal' and 'seq2seq' backends to use
+            # determine which of 'causal' and 'seq2seq' backends to use for HF models
        self._get_backend(
            config=self.config, backend=backend, trust_remote_code=trust_remote_code
        )
@@ -221,13 +187,13 @@ class HFLM(TemplateLM):
                trust_remote_code=trust_remote_code,
                parallelize=parallelize,
                gpus=gpus,
-                device_map_option=device_map_option,
                max_memory_per_gpu=max_memory_per_gpu,
                max_cpu_memory=max_cpu_memory,
                offload_folder=offload_folder,
                peft=peft,
                delta=delta,
                autogptq=autogptq,
+                gptqmodel=gptqmodel,
                **kwargs,
            )

@@ -236,52 +202,17 @@ class HFLM(TemplateLM):
            self.model.eval()
            self.model.tie_weights()

-        if isinstance(pretrained, str) and (gpus >= 1 or str(self.device) == "mps"):
-            # TODO: can remove this whole snippet except in the mps case, perhaps?
-            if not (parallelize or autogptq or hasattr(self, "accelerator")):
-                # place model onto device requested manually,
-                # if not using HF Accelerate or device_map
-                # or any other option that preloads model onto device
-                try:
-                    self.model.to(self.device)
-                except ValueError:
-                    eval_logger.debug(
-                        "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
-                    )
-
        self.truncation = truncation
        self.logits_cache = logits_cache
        self.vocab_size = self.tokenizer.vocab_size
        # select (or create) a pad token to use
-        if self.tokenizer.pad_token:
-            pass
-        elif self.tokenizer.unk_token:
-            self.tokenizer.pad_token_id = self.tokenizer.unk_token_id
-        elif self.tokenizer.eos_token:
-            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
-        else:
-            if getattr(self.config, "model_type", None) == "qwen":
-                # Qwen's trust_remote_code tokenizer does not allow for adding special tokens
-                self.tokenizer.pad_token = "<|endoftext|>"
-            elif (
-                self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
-                or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
-            ):
-                # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0)
-                # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer
-                # ---
-                # Note that the world tokenizer class name, might change in the future for the final huggingface merge
-                # https://github.com/huggingface/transformers/pull/26963
-                assert self.tokenizer.pad_token_id == 0
-            else:
-                self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
+        self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config)

-        # TODO: override this for Gemma
        self.add_bos_token = add_bos_token
-        if getattr(self.config, "model_type", None) == "gemma":
+        if "gemma" in getattr(self.config, "model_type", ""):
            self.add_bos_token = True
            eval_logger.info(
-                f"Model type is '{self.config.model_type}', a BOS token will be used as Gemma underperforms without it."
+                f"Model type is '{self.config.model_type}', part of the Gemma family--a BOS token will be used as Gemma underperforms without it."
            )

        self._max_length = max_length
@@ -302,49 +233,46 @@ class HFLM(TemplateLM):
            self.batch_size_per_gpu = int(batch_size)

        if isinstance(pretrained, str):
+            if gpus >= 1 or str(self.device) == "mps":
+                # TODO: can remove this whole snippet except in the mps case, perhaps?
+                if not (parallelize or autogptq or hasattr(self, "accelerator")):
+                    # place model onto device requested manually,
+                    # if not using HF Accelerate or device_map
+                    # or any other option that preloads model onto device
+                    try:
+                        self.model.to(self.device)
+                    except ValueError:
+                        eval_logger.debug(
+                            "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
+                        )
            # multigpu data-parallel support when launched with accelerate
            if gpus > 1:
-                if parallelize:
-                    if accelerator.num_processes > 1:
-                        raise RuntimeError(
-                            "Attempted to use both a HF Accelerate `device_map` and to launch via `accelerate launch`. If this is the case, please either remove `parallelize=True` from --model_args or launch outside of the Accelerate launcher."
+                if accelerator.num_processes > 1:
+                    if parallelize:
+                        eval_logger.warning(
+                            "You are both using a HF Accelerate `device_map` (`--model_args parallelize=True`) and launching via `accelerate launch`. This will attempt to do model and data parallelism depending on the resources available."
                        )
-                    else:
-                        pass
-                elif accelerator.num_processes == 1:
-                    # if we aren't launching via accelerate, ditch
-                    self._rank = 0
-                    self._world_size = 1
-                else:
-                    if gpus > accelerator.num_processes:
+                    elif gpus > accelerator.num_processes:
                        eval_logger.warning(
                            "WARNING: The number of total system GPUs does not match the number of spawned processes. "
                            "If you would like to use data parallelism, please launch the script "
                            "with 'accelerate launch *script*'. "
                            f"Current run will proceed with {accelerator.num_processes} devices."
                        )
-                    assert (
-                        accelerator.distributed_type
-                        in [
-                            DistributedType.FSDP,
-                            DistributedType.MULTI_GPU,
-                            DistributedType.MULTI_NPU,
-                        ]
-                    ), "Unsupported distributed type provided. Only DDP and FSDP are supported."
-                    if accelerator.distributed_type == DistributedType.FSDP:
-                        self._model = accelerator.prepare(self.model)
-                    else:
-                        self._model = accelerator.prepare_model(
-                            self.model, evaluation_mode=True
-                        )
+                        if self.accelerator.is_local_main_process:
+                            eval_logger.info(
+                                f"Using {gpus} devices with data parallelism"
+                            )
+
                    self._device = torch.device(f"{accelerator.device}")
                    self.accelerator = accelerator

-                    if self.accelerator.is_local_main_process:
-                        eval_logger.info(f"Using {gpus} devices with data parallelism")
-
                    self._rank = self.accelerator.local_process_index
                    self._world_size = self.accelerator.num_processes
+                else:
+                    # if we aren't launching via accelerate, ditch
+                    self._rank = 0
+                    self._world_size = 1
        else:
            # if a PreTrainedModel was passed into HFLM, we forgo distributed setup.
            eval_logger.warning(
@@ -359,6 +287,94 @@ class HFLM(TemplateLM):
                f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
            )

+    def _get_accelerate_args(
+        self,
+        parallelize: Optional[bool] = None,
+        device_map: Optional[str] = "auto",
+        max_memory_per_gpu: Optional[Union[int, str]] = None,
+        max_cpu_memory: Optional[Union[int, str]] = None,
+        offload_folder: Optional[str] = "./offload",
+        gpus: Optional[int] = None,
+    ) -> dict:
+        """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
+        num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
+        num_machines = int(os.environ.get("WORLD_SIZE", 0)) // num_local_processes
+        if (
+            num_machines == 0
+            and hasattr(self, "accelerator")
+            and self.accelerator is not None
+        ):
+            eval_logger.info(
+                "We are not in a distributed setting for accelerate. Setting model_parallel to False."
+            )
+            parallelize = False
+
+        if parallelize is None:
+            # If parallelism is unset by the user, we automatically assign model parallelism
+            # if enough extra GPUs are available
+            max_memory_all_gpus = get_max_memory()
+            # We just want gpu, not cpu, max memory
+            if "cpu" in max_memory_all_gpus:
+                del max_memory_all_gpus["cpu"]
+            parallelize = bool(num_local_processes < len(max_memory_all_gpus))
+            eval_logger.info(
+                f"Setting model parallel to {parallelize} since "
+                f"the number of local processes is {num_local_processes} "
+                f"and the number of GPUs is {len(max_memory_all_gpus)}"
+            )
+
+        args = {}
+        if parallelize:  # Model parallelism will be used
+            max_memory = {}
+            if max_memory_per_gpu is not None:  # Using the provided memory requirements
+                max_memory_per_gpu_map = {
+                    device_idx: max_memory_per_gpu for device_idx in range(gpus)
+                }
+            else:  # Estimating the possible memory requirements
+                max_memory_all_gpus = get_max_memory()
+                if "cpu" in max_memory_all_gpus:
+                    del max_memory_all_gpus["cpu"]
+                if not hasattr(self, "accelerator"):
+                    max_memory_per_gpu_map = {
+                        k: v for k, v in max_memory_all_gpus.items()
+                    }
+                else:
+                    # use only 1 / num_processes of the GPUs if we are running under accelerate launch
+                    max_memory_per_gpu_map = {
+                        k: v
+                        for k, v in max_memory_all_gpus.items()
+                        if k % num_local_processes
+                        == (self.accelerator.process_index % num_local_processes)
+                    }
+            args["max_memory"] = max_memory_per_gpu_map
+            args["device_map"] = "auto" if device_map is None else device_map
+            eval_logger.info(
+                f"Model parallel was set to True, setting max memory per GPU to {max_memory_per_gpu_map} and device map to {args.get('device_map')}"
+            )
+
+            if max_cpu_memory is not None:
+                max_memory["cpu"] = max_cpu_memory
+
+            args["offload_folder"] = offload_folder
+        elif (
+            device_map is None
+        ):  # No model parallelism, we use the default provided device for our model
+            if hasattr(self, "accelerator"):
+                device_map = {"": f"{self.accelerator.device}"}
+            else:
+                device_map = {"": str(self.device)}
+            args["max_memory"] = None
+            args["device_map"] = device_map
+            eval_logger.info(
+                f"Model parallel was set to False, max memory was not set, and device map was set to {device_map}"
+            )
+        else:
+            args["max_memory"] = None
+            args["device_map"] = None
+            eval_logger.info("Model parallel was set to False.")
+
+        return args
+
    @property
    def config(self):
        # return the associated transformers.AutoConfig for the given pretrained model.
@@ -424,31 +440,29 @@ class HFLM(TemplateLM):
    def tokenizer_name(self) -> str:
        return self.tokenizer.name_or_path.replace("/", "__")

-    @property
-    def chat_template(self) -> str:
-        if self.tokenizer.chat_template is not None:
-            return self.tokenizer.chat_template
-        return self.tokenizer.default_chat_template
-
    def _get_backend(
        self,
        config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
-        backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
+        backend: Literal["default", "causal", "seq2seq"] = "default",
        trust_remote_code: Optional[bool] = False,
    ) -> None:
        """
        Helper method during initialization.
-        Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder))
-        model type to be used.
+        Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) model type to be used.
+        sets `self.AUTO_MODEL_CLASS` appropriately if not already set.
+
+        **If not calling HFLM.__init__() or HFLM._get_backend() within a subclass of HFLM,
+        user must set `self.backend` to be either "causal" or "seq2seq" manually!**
        """
+
        assert backend in ["default", "causal", "seq2seq"]

        if backend != "default":
            # if we've settled on non-default backend, use that manually
            if backend == "causal":
-                self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
+                self.backend = backend
            elif backend == "seq2seq":
-                self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
+                self.backend = backend
            eval_logger.info(
                f"Overrode HF model backend type, and using type '{backend}'"
            )
@@ -461,26 +475,32 @@ class HFLM(TemplateLM):
                # first check if model type is listed under seq2seq models, since some
                # models like MBart are listed in both seq2seq and causal mistakenly in HF transformers.
                # these special cases should be treated as seq2seq models.
-                self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
+                self.backend = "seq2seq"
+                eval_logger.info(f"Using model type '{backend}'")
            elif (
                getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
            ):
-                self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
+                self.backend = "causal"
+                eval_logger.info(f"Using model type '{backend}'")
            else:
                if not trust_remote_code:
                    eval_logger.warning(
                        "HF model type is neither marked as CausalLM or Seq2SeqLM. \
                    This is expected if your model requires `trust_remote_code=True` but may be an error otherwise."
+                        "Setting backend to causal"
                    )
                # if model type is neither in HF transformers causal or seq2seq model registries
-                # then we default to AutoModelForCausalLM
-                self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
+                # then we default to assuming AutoModelForCausalLM
+                self.backend = "causal"
+                eval_logger.info(
+                    f"Model type cannot be determined. Using default model type '{backend}'"
+                )

-        assert self.AUTO_MODEL_CLASS in [
-            transformers.AutoModelForCausalLM,
-            transformers.AutoModelForSeq2SeqLM,
-        ]
-        return None
+        if self.AUTO_MODEL_CLASS is None:
+            if self.backend == "causal":
+                self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
+            elif self.backend == "seq2seq":
+                self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM

    def _get_config(
        self,
@@ -488,6 +508,7 @@ class HFLM(TemplateLM):
        revision: str = "main",
        trust_remote_code: bool = False,
    ) -> None:
+        """Return the model config for HuggingFace models"""
        self._config = transformers.AutoConfig.from_pretrained(
            pretrained,
            revision=revision,
@@ -505,7 +526,6 @@ class HFLM(TemplateLM):
        # (accelerate naive PP (device_map) options)
        parallelize: Optional[bool] = False,
        gpus: Optional[int] = None,
-        device_map_option: Optional[str] = "auto",
        max_memory_per_gpu: Optional[Union[int, str]] = None,
        max_cpu_memory: Optional[Union[int, str]] = None,
        offload_folder: Optional[str] = "./offload",
@@ -513,6 +533,7 @@ class HFLM(TemplateLM):
        peft: Optional[str] = None,
        delta: Optional[str] = None,
        autogptq: Optional[Union[bool, str]] = False,
+        gptqmodel: Optional[bool] = False,
        **kwargs,
    ) -> None:
        """
@@ -529,27 +550,18 @@ class HFLM(TemplateLM):

        model_kwargs = kwargs if kwargs else {}

-        if parallelize:
-            model_kwargs.update(
-                _get_accelerate_args(
-                    device_map_option,  # TODO: phase out device_map_option?
-                    max_memory_per_gpu,
-                    max_cpu_memory,
-                    offload_folder,
-                    gpus,
-                )
+        model_kwargs.update(
+            self._get_accelerate_args(
+                parallelize=parallelize,
+                device_map=kwargs.get("device_map", None),
+                max_memory_per_gpu=max_memory_per_gpu,
+                max_cpu_memory=max_cpu_memory,
+                offload_folder=offload_folder,
+                gpus=gpus,
            )
-        elif "device_map" not in model_kwargs:
-            # set a device_map to initialize model on the right GPU.
-            # this is needed because it seems that the default behavior
-            # for quantized models now seems to be device_map="auto"
-            # which breaks data-parallel mode.
-            if hasattr(self, "accelerator"):
-                model_kwargs.update({"device_map": {"": f"{self.accelerator.device}"}})
-            else:
-                model_kwargs.update({"device_map": {"": str(self.device)}})
+        )

-        if not autogptq:
+        if not autogptq and not gptqmodel:
            if model_kwargs.get("load_in_4bit", None):
                assert (
                    transformers.__version__ >= "4.30.0"
@@ -560,6 +572,7 @@ class HFLM(TemplateLM):
                        model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(
                            model_kwargs["bnb_4bit_compute_dtype"]
                        )
+
            self._model = self.AUTO_MODEL_CLASS.from_pretrained(
                pretrained,
                revision=revision,
@@ -568,23 +581,42 @@ class HFLM(TemplateLM):
                **model_kwargs,
            )
        else:
-            try:
-                from auto_gptq import AutoGPTQForCausalLM
-            except ModuleNotFoundError:
-                raise Exception(
-                    "Tried to load auto_gptq, but auto-gptq is not installed ",
-                    "please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
+            if autogptq and gptqmodel:
+                raise ValueError(
+                    "Cannot use both 'autogptq' and 'gptqmodel' options at the same time."
                )

-            self._model = AutoGPTQForCausalLM.from_quantized(
-                pretrained,
-                trust_remote_code=trust_remote_code,
-                model_basename=None if autogptq is True else Path(autogptq).stem,
-                use_safetensors=True
-                if autogptq is True
-                else autogptq.endswith(".safetensors"),
-                **model_kwargs,
-            )
+            if autogptq:
+                try:
+                    from auto_gptq import AutoGPTQForCausalLM
+                except ModuleNotFoundError as exception:
+                    raise type(exception)(
+                        "Tried to load auto_gptq, but auto-gptq is not installed ",
+                        "please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
+                    )
+
+                self._model = AutoGPTQForCausalLM.from_quantized(
+                    pretrained,
+                    trust_remote_code=trust_remote_code,
+                    model_basename=None if autogptq is True else Path(autogptq).stem,
+                    use_safetensors=True
+                    if autogptq is True
+                    else autogptq.endswith(".safetensors"),
+                    **model_kwargs,
+                )
+
+            if gptqmodel:
+                try:
+                    from gptqmodel import GPTQModel
+                except ModuleNotFoundError as exception:
+                    raise type(exception)(
+                        "Tried to load gptqmodel, but gptqmodel is not installed ",
+                        "please install gptqmodel via `pip install gptqmodel --no-build-isolation` or `pip install lm-eval[gptqmodel] --no-build-isolation`",
+                    )
+
+                self._model = GPTQModel.from_quantized(
+                    pretrained, trust_remote_code=trust_remote_code, **model_kwargs
+                )

        if peft and delta:
            raise ValueError(
@@ -597,10 +629,10 @@ class HFLM(TemplateLM):
                    raise AssertionError("load_in_4bit requires peft >= 0.4.0")
            if self._model.config.vocab_size != len(self.tokenizer):
                # resize model for LoRAs with added tokens
-                self._model.resize_token_embeddings(len(self.tokenizer))
                eval_logger.info(
                    f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
                )
+                self._model.resize_token_embeddings(len(self.tokenizer))
            self._model = PeftModel.from_pretrained(
                self._model, peft, revision=revision
            )
@@ -718,7 +750,7 @@ class HFLM(TemplateLM):
        @find_executable_batch_size(starting_batch_size=self.max_batch_size)
        def forward_batch(batch_size):
            security_margin = int(0.05 * security_margin_factor * batch_size)
-            if self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+            if self.backend == "seq2seq":
                length = max(max_context_enc, max_cont_enc)
                batched_conts = torch.ones(
                    (batch_size + security_margin, length), device=self.device
@@ -773,7 +805,7 @@ class HFLM(TemplateLM):

        # by default for CausalLM - false or self.add_bos_token is set
        if add_special_tokens is None:
-            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+            if self.backend == "causal":
                special_tokens_kwargs = {
                    "add_special_tokens": False or self.add_bos_token
                }
@@ -801,7 +833,7 @@ class HFLM(TemplateLM):
        self.tokenizer.padding_side = padding_side

        add_special_tokens = {}
-        if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+        if self.backend == "causal":
            add_special_tokens = {"add_special_tokens": False or self.add_bos_token}

        encoding = self.tokenizer(
@@ -879,14 +911,14 @@ class HFLM(TemplateLM):
    def _select_cont_toks(
        self, logits: torch.Tensor, contlen: int = None, inplen: int = None
    ) -> torch.Tensor:
-        if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+        if self.backend == "causal":
            assert (
                contlen and inplen
            ), "Must pass input len and cont. len to select scored logits for causal LM"
            # discard right-padding.
            # also discard the input/context tokens. we'll only score continuations.
            logits = logits[inplen - contlen : inplen]
-        elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+        elif self.backend == "seq2seq":
            assert (
                contlen and not inplen
            ), "Selecting scored logits for Seq2SeqLM requires only cont. len"
@@ -944,6 +976,9 @@ class HFLM(TemplateLM):
            string_nll = sum(string_nll)
            loglikelihoods.append(string_nll)

+            # cache this loglikelihood_rolling request
+            self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
+
        return loglikelihoods

    def _batch_scheduler(self, pos, n_reordered_requests):
@@ -1000,8 +1035,7 @@ class HFLM(TemplateLM):
            requests,
            sort_fn=_collate,
            group_by="contexts"
-            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
-            and self.logits_cache
+            if self.backend == "causal" and self.logits_cache
            else None,
            group_fn=_lookup_one_token_cont,
        )
@@ -1058,14 +1092,14 @@ class HFLM(TemplateLM):
                # cont_toks      4 5 6 7 8 9      [:, -len(continuation_enc):, :self.vocab_size] slice

                # when too long to fit in context, truncate from the left
-                if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                if self.backend == "causal":
                    inp = torch.tensor(
                        (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
                        dtype=torch.long,
                        device=self.device,
                    )
                    (inplen,) = inp.shape
-                elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+                elif self.backend == "seq2seq":
                    inp = torch.tensor(
                        (context_enc)[-self.max_length :],
                        dtype=torch.long,
@@ -1105,11 +1139,11 @@ class HFLM(TemplateLM):

            # create encoder attn mask and batched conts, if seq2seq
            call_kwargs = {}
-            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+            if self.backend == "causal":
                batched_inps = pad_and_concat(
                    padding_len_inp, inps, padding_side="right"
                )  # [batch, padding_len_inp]
-            elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+            elif self.backend == "seq2seq":
                # TODO: left-pad encoder inps and mask?
                batched_inps = pad_and_concat(
                    padding_len_inp, inps
@@ -1142,7 +1176,7 @@ class HFLM(TemplateLM):
                # from prompt/prefix tuning tokens, if applicable
                ctx_len = (
                    inplen + (logits.shape[0] - padding_len_inp)
-                    if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
+                    if self.backend == "causal"
                    else None
                )
                logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
@@ -1178,7 +1212,13 @@ class HFLM(TemplateLM):

                    res.append(answer)

-                    self.cache_hook.add_partial("loglikelihood", request_str, answer)
+                    if request_str is not None:
+                        # special case: loglikelihood_rolling produces a number of loglikelihood requests
+                        # all with cache key None. instead do add_partial on the per-example level
+                        # in the loglikelihood_rolling() function for those.
+                        self.cache_hook.add_partial(
+                            "loglikelihood", request_str, answer
+                        )
                    pbar.update(1)

        pbar.close()
@@ -1260,7 +1300,7 @@ class HFLM(TemplateLM):
                max_gen_toks = self.max_gen_toks

            # set the max length in tokens of inputs ("context_enc")
-            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+            if self.backend == "causal":
                # max len for inputs = max length, minus room to generate the max new tokens
                # if the max new tokens is too large, halve it until it fits as we cannot change
                # the max model length
@@ -1268,7 +1308,7 @@ class HFLM(TemplateLM):
                while max_ctx_len <= 0:
                    max_gen_toks = max_gen_toks // 2
                    max_ctx_len = self.max_length - max_gen_toks
-            elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+            elif self.backend == "seq2seq":
                # max len for inputs = encoder's whole max_length
                max_ctx_len = self.max_length

@@ -1295,7 +1335,7 @@ class HFLM(TemplateLM):
            cont_toks_list = cont.tolist()
            for cont_toks, context in zip(cont_toks_list, contexts):
                # discard context + left-padding toks if using causal decoder-only LM
-                if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                if self.backend == "causal":
                    cont_toks = cont_toks[context_enc.shape[1] :]

                s = self.tok_decode(cont_toks)

--- a/lm_eval/models/ibm_watsonx_ai.py
+++ b/lm_eval/models/ibm_watsonx_ai.py
+import json
+import os
+from configparser import ConfigParser
+from functools import lru_cache
+from pathlib import Path
+from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast
+
+from tqdm import tqdm
+
+from lm_eval.api.instance import Instance
+from lm_eval.api.model import LM
+from lm_eval.api.registry import register_model
+from lm_eval.utils import eval_logger, simple_parse_args_string
+
+
+class LogLikelihoodResult(NamedTuple):
+    log_likelihood: float
+    is_greedy: bool
+
+
+@lru_cache(maxsize=None)
+def get_watsonx_credentials(
+    env_name: str = "YP_QA",
+    config_path: str = "config.ini",
+) -> Dict[str, str]:
+    """
+    Retrieves Watsonx API credentials from environmental variables or from a configuration file.
+    Args:
+        env_name (str, optional): The name of the environment from which to retrieve credentials. Defaults to "YP_QA".
+        config_path (str, optional): The file path to the `config.ini` configuration file. Defaults to "config.ini".
+    Returns:
+        dict[str, str]: A dictionary containing the credentials necessary for authentication, including
+                        keys such as `apikey`, `url`, and `project_id`.
+    Raises:
+        FileNotFoundError: If the specified configuration file does not exist.
+        AssertionError: If the credentials format is invalid.
+    """
+
+    def _verify_credentials(creds: Any) -> None:
+        assert isinstance(creds, dict) and all(
+            key in creds.keys() for key in ["apikey", "url", "project_id"]
+        ), "Wrong configuration for credentials."
+
+    credentials = {
+        "apikey": os.getenv("WATSONX_API_KEY", None),
+        "url": os.getenv("WATSONX_URL", None),
+        "project_id": os.getenv("WATSONX_PROJECT_ID", None),
+    }
+
+    if any(credentials.get(key) is None for key in ["apikey", "url", "project_id"]):
+        eval_logger.warning(
+            "One or more required environment variables are missing, trying to load config.ini file."
+        )
+
+        config_path = "config.ini" if not config_path else config_path
+
+        if not Path(config_path).is_absolute():
+            config_path = os.path.join(
+                Path(__file__).parent.parent.absolute(), config_path
+            )
+
+        if not os.path.exists(config_path):
+            raise FileNotFoundError(
+                f"Provided config file path {config_path} does not exist. "
+                "You need to specify credentials in config.ini file under specified location."
+            )
+
+        config = ConfigParser()
+        config.read(config_path)
+        credentials = json.loads(config.get(env_name))
+
+    _verify_credentials(credentials)
+    return credentials
+
+
+@register_model("watsonx_llm")
+class WatsonxLLM(LM):
+    """
+    Implementation of LM model interface for evaluating Watsonx model with the lm_eval framework.
+    See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md for reference.
+    """
+
+    @classmethod
+    def create_from_arg_string(
+        cls: Type["WatsonxLLM"],
+        arg_string: str,
+        config_path: Optional[str] = None,
+    ) -> "WatsonxLLM":
+        """
+        Allow the user to specify model parameters (TextGenerationParameters) in CLI arguments.
+        """
+        try:
+            from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
+        except ImportError:
+            raise ImportError(
+                "Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
+            )
+
+        args = simple_parse_args_string(arg_string)
+        model_id = args.pop("model_id", None)
+        if model_id is None:
+            raise ValueError("'model_id' is required, please pass it in 'model_args'")
+
+        if not args.get("do_sample", None):
+            args["temperature"] = None
+            args["top_p"] = None
+            args["top_k"] = None
+            args["seed"] = None
+
+        cls.generate_params = {
+            GenParams.DECODING_METHOD: (
+                "greedy" if not args.get("do_sample", None) else "sample"
+            ),
+            GenParams.LENGTH_PENALTY: args.get("length_penalty", None),
+            GenParams.TEMPERATURE: args.get("temperature", None),
+            GenParams.TOP_P: args.get("top_p", None),
+            GenParams.TOP_K: args.get("top_k", None),
+            GenParams.RANDOM_SEED: args.get("seed", None),
+            GenParams.REPETITION_PENALTY: args.get("repetition_penalty", None),
+            GenParams.MIN_NEW_TOKENS: args.get("min_new_tokens", None),
+            GenParams.MAX_NEW_TOKENS: args.get("max_new_tokens", 256),
+            GenParams.STOP_SEQUENCES: args.get("stop_sequences", None),
+            GenParams.TIME_LIMIT: args.get("time_limit", None),
+            GenParams.TRUNCATE_INPUT_TOKENS: args.get("truncate_input_tokens", None),
+            GenParams.RETURN_OPTIONS: {
+                "generated_tokens": True,
+                "input_tokens": True,
+                "token_logprobs": True,
+                "token_ranks": True,
+            },
+        }
+
+        generate_params = {
+            k: v for k, v in cls.generate_params.items() if v is not None
+        }
+
+        return cls(
+            watsonx_credentials=get_watsonx_credentials(config_path),
+            model_id=model_id,
+            generate_params=generate_params,
+        )
+
+    def __init__(
+        self,
+        watsonx_credentials: Dict,
+        model_id,
+        generate_params: Optional[Dict[Any, Any]] = None,
+    ) -> None:
+        try:
+            from ibm_watsonx_ai import APIClient
+            from ibm_watsonx_ai.foundation_models import ModelInference
+        except ImportError:
+            raise ImportError(
+                "Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
+            )
+        super().__init__()
+        client = APIClient(watsonx_credentials)
+        project_id = watsonx_credentials.get("project_id", None)
+        deployment_id = watsonx_credentials.get("deployment_id", None)
+        client.set.default_project(project_id)
+        self.generate_params = generate_params or {}
+        self.model = ModelInference(
+            model_id=model_id,
+            deployment_id=deployment_id,
+            api_client=client,
+            project_id=project_id,
+        )
+        self._model_id = model_id
+
+    @staticmethod
+    def _has_stop_token(response_tokens: List[str], context_tokens: List[str]) -> bool:
+        """
+        Determines whether a stop token has been generated in the `response_tokens` compared to the `context_tokens`.
+        If the tokens do not match as expected, the function raises a RuntimeError, indicating a possible
+        misalignment between the tokens generated by the tokenizer and the model.
+        Args:
+            response_tokens (List[str]): The List of tokens generated as a response by the model.
+            context_tokens (List[str]): The List of tokens representing the input context.
+        Returns:
+            bool: True if the `response_tokens` likely contain a stop token that terminates the sequence,
+                  otherwise raises an exception.
+        Raises:
+            RuntimeError: If there is an unexpected mismatch between the `response_tokens` and the `context_tokens`.
+        """
+        context_length = len(context_tokens)
+        if response_tokens[: context_length - 1] == context_tokens[:-1]:
+            return (
+                response_tokens[-1] != context_tokens[-1]
+            )  # only last token differs, probably stop sequence (</s>)
+        raise RuntimeError(
+            f"There is an unexpected difference between tokenizer and model tokens:\n"
+            f"context_tokens={context_tokens}\n"
+            f"response_tokens={response_tokens[:context_length]}"
+        )
+
+    def _check_model_logprobs_support(self):
+        """
+        Verifies if the model supports returning log probabilities for input tokens.
+        This function sends a prompt to the model and checks whether the model's response
+        includes log probabilities for the input tokens. If log probabilities are not present,
+        it raises a `RuntimeError`, indicating that the model is not supported.
+        Raises:
+            RuntimeError: If the model does not return log probabilities for input tokens.
+        """
+        tokens = self.model.generate_text(
+            prompt=["The best ice cream flavor is:"],
+            params=self.generate_params,
+            raw_response=True,
+        )[0]["results"][0]
+        if all(token.get("logprob", None) is None for token in tokens["input_tokens"]):
+            raise RuntimeError(
+                f"Model {self._model_id} is not supported: does not return logprobs for input tokens"
+            )
+
+    def _get_log_likelihood(
+        self,
+        input_tokens: List[Dict[str, float]],
+        context_tokens: List[Dict[str, float]],
+    ) -> LogLikelihoodResult:
+        """
+        Calculates the log likelihood of the generated tokens compared to the context tokens.
+        Args:
+            input_tokens (List[dict[str, float]]): A List of token dictionaries, each containing
+                token information like `text` and `logprob`.
+            context_tokens (List[dict[str, float]]): A List of token dictionaries representing
+                the input context.
+        Returns:
+            LogLikelihoodResult: An object containing the calculated log likelihood and a boolean
+            flag indicating if the tokens were generated greedily.
+        """
+
+        response_tokens = [token["text"] for token in input_tokens]
+        context_length = len(context_tokens)
+
+        if self._has_stop_token(response_tokens, context_tokens):
+            context_length -= 1
+
+        return LogLikelihoodResult(
+            log_likelihood=sum(
+                token.get("logprob", 0) for token in input_tokens[context_length:]
+            ),
+            is_greedy=all(
+                token["rank"] == 1 for token in input_tokens[context_length:]
+            ),
+        )
+
+    def generate_until(self, requests: List[Instance]) -> List[str]:
+        """
+        Generates text responses for a List of requests, with progress tracking and caching.
+        Args:
+            requests (List[Instance]): A List of instances, each containing a text input to be processed.
+        Returns:
+            List[str]: A List of generated responses.
+        """
+        requests = [request.args[0] for request in requests]
+        results = []
+        batch_size = 5
+
+        for i in tqdm(
+            range(0, len(requests), batch_size),
+            desc=f"Running generate_until function with batch size {batch_size}",
+        ):
+            batch = requests[i : i + batch_size]
+            try:
+                responses = self.model.generate_text(batch, self.generate_params)
+
+            except Exception as exp:
+                eval_logger.error(f"Error while generating text {exp}")
+                continue
+
+            for response, context in zip(responses, batch):
+                results.append(response)
+                self.cache_hook.add_partial("generated_text", context, response)
+
+            eval_logger.info("Cached responses")
+
+        return results
+
+    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
+        """
+        Args:
+            requests: Each request contains Instance.args : Tuple[str, str] containing:
+                1. an input string to the LM and
+                2. a target string on which the loglikelihood of the LM producing this target,
+                   conditioned on the input, will be returned.
+        Returns:
+            tuple (loglikelihood, is_greedy) for each request according to the input order:
+                loglikelihood: probability of generating the target string conditioned on the input
+                is_greedy: True if and only if the target string would be generated by greedy sampling from the LM
+        """
+        try:
+            from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
+        except ImportError:
+            raise ImportError(
+                "Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
+            )
+        self._check_model_logprobs_support()
+        self.generate_params[GenParams.MAX_NEW_TOKENS] = 1
+
+        requests = [request.args for request in requests]
+        results: List[LogLikelihoodResult] = []
+        batch_size = 5
+
+        for i in tqdm(
+            range(0, len(requests), batch_size),
+            desc=f"Running loglikelihood function with batch size {batch_size}",
+        ):
+            batch = requests[i : i + batch_size]
+            try:
+                tokenized_contexts = [
+                    self.model.tokenize(prompt=context, return_tokens=True)["result"][
+                        "tokens"
+                    ]
+                    for context, _ in batch
+                ]
+            except Exception as exp:
+                eval_logger.error(f"Error while model tokenize:\n {exp}")
+                continue
+
+            input_prompts = [context + continuation for context, continuation in batch]
+
+            try:
+                responses = self.model.generate_text(
+                    prompt=input_prompts, params=self.generate_params, raw_response=True
+                )
+            except Exception as exp:
+                eval_logger.error(f"Error while model generate text:\n {exp}")
+                continue
+
+            for response, tokenized_context, (context, continuation) in zip(
+                responses, tokenized_contexts, batch
+            ):
+                log_likelihood_response = self._get_log_likelihood(
+                    response["results"][0]["input_tokens"], tokenized_context
+                )
+                results.append(log_likelihood_response)
+                self.cache_hook.add_partial(
+                    "loglikelihood",
+                    (context, continuation),
+                    (
+                        log_likelihood_response.log_likelihood,
+                        log_likelihood_response.is_greedy,
+                    ),
+                )
+            eval_logger.info("Cached batch")
+
+        return cast(List[Tuple[float, bool]], results)
+
+    def loglikelihood_rolling(self, requests) -> List[Tuple[float, bool]]:
+        """
+        Used to evaluate perplexity on a data distribution.
+        Args:
+            requests: Each request contains Instance.args : tuple[str] containing an input string to the model whose
+                entire loglikelihood, conditioned on purely the EOT token, will be calculated.
+        Returns:
+            tuple (loglikelihood,) for each request according to the input order:
+                loglikelihood: solely the probability of producing each piece of text given no starting input.
+        """
+        try:
+            from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
+        except ImportError:
+            raise ImportError(
+                "Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
+            )
+        self._check_model_logprobs_support()
+        self.generate_params[GenParams.MAX_NEW_TOKENS] = 1
+
+        requests = [request.args[0] for request in requests]
+        results: List[LogLikelihoodResult] = []
+        batch_size = 5
+
+        for i in tqdm(
+            range(0, len(requests), batch_size),
+            desc=f"Running loglikelihood_rolling function with batch size {batch_size}",
+        ):
+            batch = requests[i : i + batch_size]
+
+            try:
+                responses = self.model.generate_text(
+                    prompt=batch, params=self.generate_params, raw_response=True
+                )
+            except Exception as exp:
+                eval_logger.error(f"Error while model generate text:\n {exp}")
+                continue
+
+            for response, context in zip(responses, batch):
+                try:
+                    log_likelihood_response = self._get_log_likelihood(
+                        response["results"][0]["input_tokens"], []
+                    )
+                    results.append(log_likelihood_response)
+
+                    self.cache_hook.add_partial(
+                        "loglikelihood_rolling",
+                        context,
+                        (
+                            log_likelihood_response.log_likelihood,
+                            log_likelihood_response.is_greedy,
+                        ),
+                    )
+                except Exception as exp:
+                    eval_logger.error(
+                        f"Error during log likelihood calculation:\n {exp}"
+                    )
+                    continue
+
+            eval_logger.info("Cached batch")
+
+        return cast(List[Tuple[float, bool]], results)
--- a/lm_eval/models/mamba_lm.py
+++ b/lm_eval/models/mamba_lm.py
@@ -69,8 +69,8 @@ class MambaLMWrapper(HFLM):
    ) -> None:
        try:
            from mamba_ssm.utils.hf import load_config_hf  # noqa: F811
-        except ModuleNotFoundError:
-            raise Exception(
+        except ModuleNotFoundError as exception:
+            raise type(exception)(
                "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
 please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
            )
@@ -88,8 +88,8 @@ please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba
    ) -> None:
        try:
            from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel  # noqa: F811
-        except ModuleNotFoundError:
-            raise Exception(
+        except ModuleNotFoundError as exception:
+            raise type(exception)(
                "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
 please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
            )

--- a/lm_eval/models/nemo_lm.py
+++ b/lm_eval/models/nemo_lm.py
@@ -39,8 +39,8 @@ def _patch_pretrained_cfg(
 ):
    try:
        import omegaconf
-    except ModuleNotFoundError:
-        raise Exception(
+    except ModuleNotFoundError as exception:
+        raise type(exception)(
            "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
            "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
            "or installing nemo following https://github.com/NVIDIA/NeMo.",
@@ -79,8 +79,8 @@ def load_model(
            MegatronGPTModel,
        )
        from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector
-    except ModuleNotFoundError:
-        raise Exception(
+    except ModuleNotFoundError as exception:
+        raise type(exception)(
            "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
            "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
            "or installing nemo following https://github.com/NVIDIA/NeMo.",
@@ -140,8 +140,8 @@ def load_model(
 def setup_distributed_environment(trainer):
    try:
        from nemo.utils.app_state import AppState
-    except ModuleNotFoundError:
-        raise Exception(
+    except ModuleNotFoundError as exception:
+        raise type(exception)(
            "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
            "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
            "or installing nemo following https://github.com/NVIDIA/NeMo.",
@@ -194,8 +194,8 @@ class NeMoLM(LM):
            from pytorch_lightning.trainer.trainer import Trainer

            self.generate = generate
-        except ModuleNotFoundError:
-            raise Exception(
+        except ModuleNotFoundError as exception:
+            raise type(exception)(
                "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
                "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
                "or installing nemo following https://github.com/NVIDIA/NeMo.",
@@ -386,6 +386,9 @@ class NeMoLM(LM):

            string_nll = sum(string_nll)
            loglikelihoods.append(string_nll)
+
+            # cache this loglikelihood_rolling request
+            self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
        return loglikelihoods

    def _loglikelihood_tokens(self, requests, disable_tqdm=False):
@@ -468,6 +471,9 @@ class NeMoLM(LM):
                answer = (logprob, is_greedy)

                if cache_key is not None:
+                    # special case: loglikelihood_rolling produces a number of loglikelihood requests
+                    # all with cache key None. instead do add_partial on the per-example level
+                    # in the loglikelihood_rolling() function for those.
                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)

                res.append(answer)

--- a/lm_eval/models/neuralmagic.py
+++ b/lm_eval/models/neuralmagic.py
@@ -38,8 +38,8 @@ class SparseMLLM(HFLM):
    ) -> None:
        try:
            from sparseml.transformers import SparseAutoModelForCausalLM
-        except ModuleNotFoundError:
-            raise Exception(
+        except ModuleNotFoundError as exception:
+            raise type(exception)(
                "Package `sparseml` is not installed. "
                "Please install it via `pip install sparseml[transformers]`"
            )
@@ -88,8 +88,8 @@ class SparseMLLM(HFLM):
    def _get_config(self, pretrained: str, **kwargs) -> None:
        try:
            from sparseml.transformers import SparseAutoConfig
-        except ModuleNotFoundError:
-            raise Exception(
+        except ModuleNotFoundError as exception:
+            raise type(exception)(
                "Package `sparseml` is not installed. "
                "Please install it via `pip install sparseml[transformers]`"
            )
@@ -112,8 +112,8 @@ class SparseMLLM(HFLM):
    ) -> None:
        try:
            from sparseml.transformers import SparseAutoTokenizer
-        except ModuleNotFoundError:
-            raise Exception(
+        except ModuleNotFoundError as exception:
+            raise type(exception)(
                "Package `sparseml` is not installed. "
                "Please install it via `pip install sparseml[transformers]`"
            )
@@ -171,8 +171,8 @@ class DeepSparseLM(LM):

        try:
            import deepsparse
-        except ModuleNotFoundError:
-            raise Exception(
+        except ModuleNotFoundError as exception:
+            raise type(exception)(
                "Package `deepsparse` is not installed. "
                "Please install it via `pip install deepsparse[transformers]`"
            )
@@ -321,6 +321,9 @@ class DeepSparseLM(LM):
                res.append(answer)

                if cache_key is not None:
+                    # special case: loglikelihood_rolling produces a number of loglikelihood requests
+                    # all with cache key None. instead do add_partial on the per-example level
+                    # in the loglikelihood_rolling() function for those.
                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)

        return re_ord.get_original(res)

--- a/lm_eval/models/neuron_optimum.py
+++ b/lm_eval/models/neuron_optimum.py
 import copy
-import json
 import logging
-import subprocess
 from collections import defaultdict
 from typing import List, Optional, Union

@@ -33,54 +31,6 @@ except ImportError:
 logger = logging.getLogger(__name__)


-def get_nc_count() -> Union[int, None]:
-    """Returns the number of neuron cores on the current instance."""
-    try:
-        cmd = "neuron-ls --json-output"
-        result = subprocess.run(cmd, shell=True, capture_output=True)
-        print(f"inferring nc_count from `neuron-ls` {result.stdout}")
-        json_output = json.loads(result.stdout)
-        count = sum([x["nc_count"] for x in json_output])
-        print(f"nc_count={count}")
-        return count
-    except Exception:
-        return None
-
-
-def wrap_constant_batch_size(func):
-    def _decorator(self, input_ids):
-        """input_ids a 2D array with batch_size on dim=0
-
-        makes sure the func runs with self.batch_size
-        """
-        # access a from TestSample
-        batch_size = input_ids.shape[0]
-
-        if batch_size < self.batch_size:
-            # handle the event of input_ids.shape[0] != batch_size
-            # Neuron cores expect constant batch_size
-            input_ids = torch.concat(
-                (
-                    input_ids,
-                    # add missing_batch_size dummy
-                    torch.zeros(
-                        [self.batch_size - batch_size, *input_ids.size()[1:]],
-                        dtype=input_ids.dtype,
-                        device=input_ids.device,
-                    ),
-                ),
-                dim=0,
-            )
-        elif batch_size > self.batch_size:
-            raise ValueError(
-                f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
-            )
-        # return the forward pass that requires constant batch size
-        return func(self, input_ids)[:batch_size]
-
-    return _decorator
-
-
 class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
    """NeuronModelForCausalLM with `stopping_criteria` in `generate`"""

@@ -146,7 +96,7 @@ class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
            raise ValueError(
                f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
            )
-        elif batch_size < self.batch_size:
+        elif batch_size < self.batch_size and not self.continuous_batching:
            logger.warning(
                "Inputs will be padded to match the model static batch size. This will increase latency."
            )
@@ -158,8 +108,6 @@ class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
            if attention_mask is not None:
                padding = torch.zeros(padding_shape, dtype=torch.int64)
                padded_attention_mask = torch.cat([attention_mask, padding])
-        # Drop the current generation context and clear the Key/Value cache
-        self.reset_generation()

        output_ids = self.generate_tokens(
            padded_input_ids,
@@ -179,8 +127,6 @@ class NEURON_HF(TemplateLM):
    Tested with neuron 2.17.0
    """

-    _DEFAULT_MAX_LENGTH = 2048
-
    def __init__(
        self,
        pretrained: Optional[str] = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
@@ -198,12 +144,12 @@ class NEURON_HF(TemplateLM):
        add_bos_token: Optional[bool] = False,
    ) -> None:
        if not NEURON_AVAILABLE:
-            raise Exception(
+            raise ImportError(
                "Tried to load neuron model, but neuron is not installed ",
                "please install neuron via pip install transformers-neuron ",
                "also make sure you are running on an AWS inf2 instance",
            )
-        if version.parse(optimum_neuron_version) != version.parse("0.0.17"):
+        if version.parse(optimum_neuron_version) != version.parse("0.0.24"):
            logger.warning(
                '`optimum-neuron` model requires `pip install "optimum[neuronx]>=0.0.17" '
                "preferably using the Hugging Face Neuron Deep Learning AMI (Ubuntu 22.04) "
@@ -217,34 +163,16 @@ class NEURON_HF(TemplateLM):

        self.batch_size_per_gpu = int(batch_size)
        batch_size = int(batch_size)
-        if tp_degree is None:
-            # execute `neuron-ls --json-output | jq '.[0].nc_count'``
-            # to get the number of neuron cores on your instance
-            tp_degree = get_nc_count()
-
-        assert isinstance(tp_degree, int), (
-            f"model_args must include tp_degree. tp_degree must be set to an integer,"
-            f" but is tp_degree=`{tp_degree}` with type=`{type(tp_degree)}`."
-            "Set it to number of neuron cores on your instance."
-            " For inf2.xlarge and inf2.8xlarge, set it to `2`."
-            " For inf2.24xlarge, set it to `12`."
-            " For inf2.48xlarge, set it to `24`."
-        )
-
-        # TODO: update this to be less of a hack once subfolder is fixed in HF
-        revision = revision + ("/" + subfolder if subfolder is not None else "")

        self._config = transformers.AutoConfig.from_pretrained(
            pretrained,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )
-        torch_dtype = lm_eval.models.utils.get_dtype(dtype)

-        assert torch_dtype in [
-            torch.float16,
-            torch.bfloat16,
-        ], "Only float16 and bfloat16 are supported"
+        revision = str(revision)  # cast to string if not already one
+        # TODO: update this to be less of a hack once subfolder is fixed in HF
+        revision = revision + ("/" + subfolder if subfolder is not None else "")

        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            pretrained if tokenizer is None else tokenizer,
@@ -253,36 +181,58 @@ class NEURON_HF(TemplateLM):
            use_fast=use_fast_tokenizer,
        )

-        # Neuron specific code
-        if torch_dtype == torch.float16:
-            self.amp_dtype = "f16"
-        elif torch_dtype == torch.bfloat16:
-            self.amp_dtype = "bf16"
-        elif torch_dtype == torch.float32:
-            self.amp_dtype = "f32"
-        else:
-            raise NotImplementedError("Only float16 and bfloat16 are implemented.")
-
-        compiler_args = {"num_cores": tp_degree, "auto_cast_type": self.amp_dtype}
-        input_shapes = {
-            "batch_size": batch_size,
-            "sequence_length": self._DEFAULT_MAX_LENGTH,
-        }
+        neuron_config = getattr(self._config, "neuron", None)
+        if neuron_config is None:
+            # Check export parameters
+            if tp_degree is not None:
+                assert isinstance(tp_degree, int), (
+                    f"tp_degree must be set to an integer,"
+                    f" but is tp_degree=`{tp_degree}` with type=`{type(tp_degree)}`."
+                    "Set it to a number lower than the number of neuron cores on your instance."
+                    " For inf2.xlarge and inf2.8xlarge, set it to `2`."
+                    " For inf2.24xlarge, set it <= `12`."
+                    " For inf2.48xlarge, set it <= `24`."
+                )
+            torch_dtype = lm_eval.models.utils.get_dtype(dtype)
+
+            if torch_dtype == torch.float16:
+                self.amp_dtype = "f16"
+            elif torch_dtype == torch.bfloat16:
+                self.amp_dtype = "bf16"
+            elif torch_dtype == torch.float32:
+                self.amp_dtype = "f32"
+            else:
+                raise NotImplementedError(
+                    "Only float16/bfloat16/float32 are supported."
+                )

-        print(
-            f"{'='*20} \n loading model to neuron with"
-            f" {compiler_args}, {input_shapes}..."
-        )
-        self.model = CustomNeuronModelForCausalLM.from_pretrained(
-            pretrained,
-            revision=revision,
-            trust_remote_code=trust_remote_code,
-            low_cpu_mem_usage=low_cpu_mem_usage,
-            export=True,
-            **compiler_args,
-            **input_shapes,
-        )
-        print(f"SUCCESS: neuron model compiled. \n {'='*20}")
+            print(f"{'='*20} \n exporting model to neuron")
+            self.model = CustomNeuronModelForCausalLM.from_pretrained(
+                pretrained,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+                low_cpu_mem_usage=low_cpu_mem_usage,
+                export=True,
+                batch_size=batch_size,
+                num_cores=tp_degree,
+                auto_cast_type=self.amp_dtype,
+                sequence_length=max_length,
+            )
+            neuron_config = self.model.config.neuron
+            print(
+                f"SUCCESS: neuron model exported with config {neuron_config}. \n {'='*20}"
+            )
+        else:
+            print(
+                f"{'='*20} \n loading neuron model with config" f" {neuron_config}..."
+            )
+            self.model = CustomNeuronModelForCausalLM.from_pretrained(
+                pretrained,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+                low_cpu_mem_usage=low_cpu_mem_usage,
+            )
+            print(f"SUCCESS: neuron model loaded. \n {'='*20}")

        self.truncation = truncation

@@ -290,8 +240,6 @@ class NEURON_HF(TemplateLM):
        self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
        self.add_bos_token = add_bos_token

-        self._max_length = max_length
-
        self.batch_schedule = 1
        self.batch_sizes = {}

@@ -312,17 +260,7 @@ class NEURON_HF(TemplateLM):

    @property
    def max_length(self):
-        if self._max_length:  # if max length manually set, return it
-            return self._max_length
-        seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
-        for attr in seqlen_config_attrs:
-            if hasattr(self.model.config, attr):
-                return getattr(self.model.config, attr)
-        if hasattr(self.tokenizer, "model_max_length"):
-            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
-                return self._DEFAULT_MAX_LENGTH
-            return self.tokenizer.model_max_length
-        return self._DEFAULT_MAX_LENGTH
+        return self.model.max_length

    @property
    def max_gen_toks(self) -> int:
@@ -390,34 +328,6 @@ class NEURON_HF(TemplateLM):
    def tok_decode(self, tokens):
        return self.tokenizer.decode(tokens)

-    @wrap_constant_batch_size
-    def _model_call(self, input_ids: torch.Tensor):
-        """
-        get logits for the entire sequence
-
-        :param input_ids: torch.Tensor
-            A torch tensor of shape [batch, sequence_cont]
-            the size of sequence may vary from call to call
-        :return
-            A torch tensor of shape [batch, sequence, vocab] with the
-            logits returned from the model's decoder-lm head
-        """
-        _, sequence_length = input_ids.shape
-
-        with torch.inference_mode():
-            cache_ids = torch.arange(0, sequence_length, dtype=torch.int32).split(1)
-            input_ids_split = input_ids.split(1, dim=1)
-
-            return torch.concat(
-                [
-                    self.model.forward(
-                        input_ids=input_id, cache_ids=cache_id, return_dict=False
-                    )[0]
-                    for input_id, cache_id in zip(input_ids_split, cache_ids)
-                ],
-                dim=1,
-            )
-
    def _model_generate(self, context, max_length, stop, **generation_kwargs):
        # we require users to pass do_sample=True explicitly
        # for non-greedy gen. This should be reevaluated when considering beam search.
@@ -501,7 +411,8 @@ class NEURON_HF(TemplateLM):

            string_nll = sum(string_nll)
            loglikelihoods.append(string_nll)
-
+            # cache this loglikelihood_rolling request
+            self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
        return loglikelihoods

    def _loglikelihood_tokens(
@@ -578,15 +489,41 @@ class NEURON_HF(TemplateLM):
                cont_toks_list.append(continuation_enc)
                inplens.append(inplen)

-            # create encoder attn mask and batched conts, if seq2seq
-            call_kwargs = {}
+            # Add dummy inputs up to the model static batch size
+            if len(inps) < self.batch_size:
+                inps = inps + [
+                    torch.zeros_like(inps[0]),
+                ] * (self.batch_size - len(inps))
+
+            masks = [torch.ones_like(inp) for inp in inps]
            batched_inps = lm_eval.models.utils.pad_and_concat(
                padding_len_inp, inps, padding_side="right"
            )  # [batch, padding_len_inp]

-            multi_logits = F.log_softmax(
-                self._model_call(batched_inps, **call_kwargs), dim=-1
-            )  # [batch, padding_length (inp or cont), vocab]
+            batched_masks = lm_eval.models.utils.pad_and_concat(
+                padding_len_inp, masks, padding_side="right"
+            )
+            if self.model.model.neuron_config.output_all_logits:
+                inputs = self.model.prepare_inputs_for_prefill(
+                    batched_inps, batched_masks
+                )
+                multi_logits = F.log_softmax(
+                    self.model.forward(**inputs).logits, dim=-1
+                )  # [batch, padding_length (inp or cont), vocab]
+            else:
+                # The model will only return the logits for the last input token, so we need
+                # to iterate over inputs to accumulate logits.
+                # To speed things up we use the KV cache as we would do when generating.
+                inputs = self.model.prepare_inputs_for_prefill(
+                    batched_inps[:, :1], batched_masks[:, :1]
+                )
+                outputs = [self.model.forward(**inputs).logits]
+                for i in range(1, padding_len_inp):
+                    inputs = self.model.prepare_inputs_for_decode(
+                        batched_inps[:, : i + 1], batched_masks[:, : i + 1]
+                    )
+                    outputs.append(self.model.forward(**inputs).logits)
+                multi_logits = F.log_softmax(torch.concat(outputs, dim=1), dim=-1)

            for (cache_key, _, _), logits, inplen, cont_toks in zip(
                chunk, multi_logits, inplens, cont_toks_list
@@ -619,7 +556,11 @@ class NEURON_HF(TemplateLM):

                res.append(answer)

-                self.cache_hook.add_partial("loglikelihood", cache_key, answer)
+                if cache_key is not None:
+                    # special case: loglikelihood_rolling produces a number of loglikelihood requests
+                    # all with cache key None. instead do add_partial on the per-example level
+                    # in the loglikelihood_rolling() function for those.
+                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)

        return re_ord.get_original(res)


--- a/lm_eval/models/openai_completions.py
+++ b/lm_eval/models/openai_completions.py
-import copy
 import os
-from collections import defaultdict
-from importlib.util import find_spec
-from typing import List, Literal, Optional, Tuple
+from functools import cached_property
+from typing import Any, Dict, List, Optional, Tuple, Union

-from tqdm import tqdm
-
-import lm_eval.models.utils
-from lm_eval import utils
-from lm_eval.api.model import LM, TemplateLM
 from lm_eval.api.registry import register_model
-from lm_eval.models.utils import retry_on_specific_exceptions
+from lm_eval.models.api_models import TemplateAPI
 from lm_eval.utils import eval_logger


-def get_result(response) -> Tuple[float, bool]:
-    """Process results from OpenAI API response.
-
-    :param response: dict
-        OpenAI API Response
-    :return:
-        continuation_logprobs: np.array
-            Log probabilities of continuation tokens
-        is_greedy: bool
-            whether argmax matches given continuation exactly
-    """
-    is_greedy = True
-    logprobs = response.logprobs.token_logprobs
-    continuation_logprobs = sum(logprobs)
-
-    for i in range(len(response.logprobs.token_logprobs)):
-        token = response.logprobs.token_logprobs[i]
-        top_tokens = response.logprobs.top_logprobs[i]
-        top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
-        if top_token != token:
-            is_greedy = False
-            break
-
-    return continuation_logprobs, is_greedy
-
-
-def oa_completion(client, chat: bool = False, **kwargs):
-    """Query OpenAI API for completion.
-
-    Retry with back-off until they respond
-    """
-    if not find_spec("openai") or not find_spec("tiktoken"):
-        raise Exception(
-            "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. "
-            "Please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`"
+@register_model("local-completions")
+class LocalCompletionsAPI(TemplateAPI):
+    def __init__(
+        self,
+        base_url=None,
+        tokenizer_backend="huggingface",
+        **kwargs,
+    ):
+        super().__init__(
+            base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
        )
-    else:
-        import openai
-
-    def _exception_callback(e: Exception, sleep_time: float) -> None:
-        import traceback
-
-        traceback.print_exc()
-
-    @retry_on_specific_exceptions(
-        on_exceptions=[openai.OpenAIError],
-        max_retries=None,  # retry forever, consider changing
-        on_exception_callback=_exception_callback,
-    )
-    def completion():
-        if chat:
-            return client.chat.completions.create(**kwargs)
-        else:
-            return client.completions.create(**kwargs)

-    return completion()
-
-
-@register_model("openai-completions", "local-completions")
-class OpenaiCompletionsLM(TemplateLM):
-    _DEFAULT_MAX_LENGTH = 2048
-
-    def __init__(
+    def _create_payload(
        self,
-        model: str,
-        base_url: str = None,
-        tokenizer: Optional[str] = None,
-        tokenizer_backend: Literal["tiktoken", "huggingface"] = "tiktoken",
-        truncate: bool = False,
-        max_gen_toks: int = 256,
-        batch_size: int = 1,
+        messages: Union[List[List[int]], List[dict], List[str], str],
+        generate=False,
+        gen_kwargs: Optional[dict] = None,
        seed: int = 1234,
-        max_length: Optional[int] = None,
-    ) -> None:
-        """
-
-        :param engine: str
-            OpenAI API engine (e.g. gpt-3.5-turbo-instruct)
-        :param truncate: bool
-            Truncate input if too long (if False and input is too long, throw error)
-        """
-        super().__init__()
-        self.seed = seed
-        try:
-            import openai  # noqa: E401
-            import tiktoken
-        except ModuleNotFoundError:
-            raise Exception(
-                "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
-    please install these via `pip install lm-eval[openai]` or `pip install -e .\"[openai]\"`",
-            )
-        self.model = model
-        self.base_url = base_url
-        self.tokenizer_backend = tokenizer_backend
-        self.truncate = truncate
-        self._batch_size = int(batch_size)
-        self._max_gen_toks = max_gen_toks
-        self._max_length = max_length
-
-        # if we have a local model, use HF tokenizer over tiktoken
-        if self.tokenizer_backend == "huggingface":
-            import transformers  # noqa: E401
-
-            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-                tokenizer if tokenizer else self.model
-            )
-            self.vocab_size = self.tokenizer.vocab
-            self.end_of_text_token_id = self.tokenizer.eos_token
-        elif self.tokenizer_backend == "tiktoken":
-            if self.base_url:
-                eval_logger.warning(
-                    f"Passed `base_url={self.base_url}` but using Tiktoken tokenizer backend. "
-                    "Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken."
-                )
-
-            self.tokenizer = tiktoken.encoding_for_model(self.model)
-            self.vocab_size = self.tokenizer.n_vocab
-            self.end_of_text_token_id = self.tokenizer.eot_token
-        else:
-            raise ValueError(
-                f"Expected tokenizer_backend to be one of ['tiktoken', 'huggingface'] but got {self.tokenizer_backend}"
-            )
-
-        # Read from environment variable OPENAI_API_KEY
-        # Set to EMPTY for local
-        openai.api_key = os.environ["OPENAI_API_KEY"]
-        if self.base_url:
-            self.client = openai.OpenAI(base_url=self.base_url)
-        else:
-            self.client = openai.OpenAI()
-
-    @property
-    def eot_token_id(self):
-        return self.end_of_text_token_id
-
-    @property
-    def max_length(self) -> int:
-        if self._max_length:
-            return self._max_length
+        **kwargs,
+    ) -> dict:
+        if generate:
+            gen_kwargs.pop("do_sample", False)
+            if "max_tokens" in gen_kwargs:
+                max_tokens = gen_kwargs.pop("max_tokens")
+            else:
+                max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
+            temperature = gen_kwargs.pop("temperature", 0)
+            stop = gen_kwargs.pop("until", ["<|endoftext|>"])
+            return {
+                "prompt": messages,
+                "model": self.model,
+                "max_tokens": max_tokens,
+                "temperature": temperature,
+                "stop": stop,
+                "seed": seed,
+                **gen_kwargs,
+            }
        else:
-            return self._DEFAULT_MAX_LENGTH
-
-    @property
-    def max_gen_toks(self) -> int:
-        return self._max_gen_toks
-
-    @property
-    def batch_size(self) -> int:
-        return self._batch_size
-
-    @property
-    def device(self):
-        # Isn't used because we override _loglikelihood_tokens
-        raise NotImplementedError()
-
-    def tok_encode(self, string: str, **kwargs) -> List[int]:
-        return self.tokenizer.encode(string)
-
-    def tok_decode(self, tokens: List[int]) -> str:
-        return self.tokenizer.decode(tokens)
-
-    def _loglikelihood_tokens(
-        self, requests, disable_tqdm: bool = False
+            return {
+                "model": self.model,
+                "prompt": messages,
+                "temperature": 0,
+                "max_tokens": 1,
+                "logprobs": 1,
+                "seed": seed,
+                "echo": True,
+            }
+
+    @staticmethod
+    def parse_logprobs(
+        outputs: Union[Dict, List[Dict]],
+        tokens: List[List[int]] = None,
+        ctxlens: List[int] = None,
+        **kwargs,
    ) -> List[Tuple[float, bool]]:
        res = []
-
-        def _collate(x):
-            # this doesn't efficiently handle last-token differences yet, but those are kinda annoying because
-            # it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations
-            # we care about, and so we need some kind of backup for when it isn't
-            toks = x[1] + x[2]
-            return -len(toks), tuple(toks)
-
-        re_ord = utils.Reorderer(requests, _collate)
-
-        for chunk in tqdm(
-            list(lm_eval.models.utils.chunks(re_ord.get_reordered(), self.batch_size)),
-            disable=disable_tqdm,
-        ):
-            inps = []
-            ctxlens = []
-            for cache_key, context_enc, continuation_enc in chunk:
-                # max_length+1 because the API takes up to 2049 tokens, including the first context token
-                inp = (context_enc + continuation_enc)[-(self.max_length + 1) :]
-                # TODO: the logic is much simpler if we just look at the length of continuation tokens
-                ctxlen = len(context_enc) - max(
-                    0, len(context_enc) + len(continuation_enc) - (self.max_length + 1)
-                )
-
-                inps.append(inp)
-                ctxlens.append(ctxlen)
-
-            response = oa_completion(
-                client=self.client,
-                model=self.model,
-                prompt=inps,
-                max_tokens=0,
-                temperature=0.0,
-                logprobs=10,
-                seed=self.seed,
-            )
-
-            for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(
-                response.choices, ctxlens, chunk
-            ):
-                answer = get_result(resp)
-
-                res.append(answer)
-
-                # partial caching
-                if cache_key is not None:
-                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)
-        return re_ord.get_original(res)
-
-    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
-        if not requests:
-            return []
+        if not isinstance(outputs, list):
+            outputs = [outputs]
+        for out in outputs:
+            for choice, ctxlen in zip(out["choices"], ctxlens):
+                assert ctxlen > 0, "Context length must be greater than 0"
+                logprobs = sum(choice["logprobs"]["token_logprobs"][ctxlen:-1])
+                tokens_logprobs = choice["logprobs"]["token_logprobs"][ctxlen:-1]
+                top_logprobs = choice["logprobs"]["top_logprobs"][ctxlen:-1]
+                is_greedy = True
+                for tok, top in zip(tokens_logprobs, top_logprobs):
+                    if tok != max(top.values()):
+                        is_greedy = False
+                        break
+                res.append((logprobs, is_greedy))
+        return res
+
+    @staticmethod
+    def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
        res = []
-        requests = [req.args for req in requests]
-
-        def _collate(x):
-            toks = self.tok_encode(x[0])
-            return len(toks), x[0]
+        if not isinstance(outputs, list):
+            outputs = [outputs]
+        for out in outputs:
+            for choices in out["choices"]:
+                res.append(choices["text"])
+        return res

-        re_ord = utils.Reorderer(requests, _collate)
-
-        def sameuntil_chunks(xs, size):
-            ret = []
-            lastuntil = xs[0][1]
-            for x in xs:
-                if len(ret) >= size or x[1] != lastuntil:
-                    yield ret, lastuntil
-                    ret = []
-                    lastuntil = x[1]
-                ret.append(x)
-
-            if ret:
-                yield ret, lastuntil
-
-        # todo: more intelligent batching for heterogeneous `until`
-        for chunk, request_args in tqdm(
-            list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)),
-            disable=disable_tqdm,
-        ):
-            inps = []
-            self._max_gen_toks = request_args.get("max_gen_toks", self.max_gen_toks)
-            for context, _ in chunk:
-                context_enc = self.tok_encode(context)
-                inp = context_enc[-(self.max_length - self.max_gen_toks) :]
-                inps.append(inp)
+    @property
+    def api_key(self):
+        return os.environ.get("OPENAI_API_KEY", "")

-            until = request_args.get("until", ["<|endoftext|>"])
-            request_args["temperature"] = request_args.get("temperature", 0)

-            response = oa_completion(
-                client=self.client,
-                model=self.model,
-                prompt=inps,
-                max_tokens=self.max_gen_toks,
-                stop=until,
-                seed=self.seed,
-                **{
-                    k: v
-                    for k, v in request_args.items()
-                    if k not in {"do_sample", "max_gen_toks", "until"}
-                },
+@register_model("local-chat-completions")
+class LocalChatCompletion(LocalCompletionsAPI):
+    def __init__(
+        self,
+        base_url=None,
+        tokenizer_backend=None,
+        tokenized_requests=False,
+        **kwargs,
+    ):
+        eval_logger.warning(
+            "chat-completions endpoint requires the `--apply_chat_template` flag."
+        )
+        super().__init__(
+            base_url=base_url,
+            tokenizer_backend=tokenizer_backend,
+            tokenized_requests=tokenized_requests,
+            **kwargs,
+        )
+        if self._batch_size > 1:
+            eval_logger.warning(
+                "Chat completions does not support batching. Defaulting to batch size 1."
            )
-            for resp, (context, args_) in zip(response.choices, chunk):
-                s = getattr(resp, "text")
-
-                until_ = until
-
-                for term in until_:
-                    if len(term) > 0:
-                        s = s.split(term)[0]
-
-                # partial caching
-                self.cache_hook.add_partial(
-                    "generate_until", (context, {"until": until_}), s
-                )
-
-                res.append(s)
-        return re_ord.get_original(res)
-
-    def _model_call(self, inps):
-        # Isn't used because we override _loglikelihood_tokens
-        raise NotImplementedError()
+            self._batch_size = 1

-    def _model_generate(self, context, max_length, eos_token_id):
-        # Isn't used because we override generate_until
-        raise NotImplementedError()
+    def _create_payload(
+        self,
+        messages: List[Dict],
+        generate=False,
+        gen_kwargs: dict = None,
+        seed=1234,
+        **kwargs,
+    ) -> dict:
+        assert (
+            type(messages) is not str
+        ), "chat-completions require the --apply_chat_template flag."
+        gen_kwargs.pop("do_sample", False)
+        if "max_tokens" in gen_kwargs:
+            max_tokens = gen_kwargs.pop("max_tokens")
+        else:
+            max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
+        temperature = gen_kwargs.pop("temperature", 0)
+        stop = gen_kwargs.pop("until", ["<|endoftext|>"])
+        if not isinstance(stop, (list, tuple)):
+            stop = [stop]
+        return {
+            "messages": messages,
+            "model": self.model,
+            "max_tokens": max_tokens,
+            "temperature": temperature,
+            "stop": stop[:4],
+            "seed": seed,
+            **gen_kwargs,
+        }
+
+    @staticmethod
+    def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
+        res = []
+        if not isinstance(outputs, list):
+            outputs = [outputs]
+        for out in outputs:
+            for choices in out["choices"]:
+                res.append(choices["message"]["content"])
+        return res
+
+    def tok_encode(
+        self,
+        string: Union[str, Any],
+        left_truncate_len=None,
+        add_special_tokens=None,
+        **kwargs,
+    ) -> Union[List[str], List[int], Any]:
+        return string

-    def loglikelihood_rolling(
-        self, requests, disable_tqdm: bool = False
-    ) -> List[float]:
-        loglikelihoods = []
+    def loglikelihood(self, requests, **kwargs):
+        raise NotImplementedError(
+            "Loglikelihood is not supported for chat completions. Consider using the completions API instead."
+        )

-        for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
-            rolling_token_windows = list(
-                map(
-                    utils.make_disjoint_window,
-                    utils.get_rolling_token_windows(
-                        token_list=self.tok_encode(string),
-                        prefix_token=self.eot_token_id,
-                        max_seq_len=self.max_length,
-                        context_len=1,
-                    ),
-                )
-            )

-            # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
-            rolling_token_windows = [(None,) + x for x in rolling_token_windows]
+@register_model(
+    "openai-completions",
+)
+class OpenAICompletionsAPI(LocalCompletionsAPI):
+    def __init__(
+        self,
+        base_url="https://api.openai.com/v1/completions",
+        tokenizer_backend="tiktoken",
+        **kwargs,
+    ):
+        super().__init__(
+            base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
+        )

-            string_nll = self._loglikelihood_tokens(
-                rolling_token_windows,
-                disable_tqdm=True,
+    @cached_property
+    def api_key(self):
+        """Override this property to return the API key for the API request."""
+        key = os.environ.get("OPENAI_API_KEY", None)
+        if key is None:
+            raise ValueError(
+                "API key not found. Please set the `OPENAI_API_KEY` environment variable."
            )
+        return key

-            # discard is_greedy
-            string_nll = [x[0] for x in string_nll]
+    def loglikelihood(self, requests, **kwargs):
+        assert (
+            self.model
+            in [
+                "babbage-002",
+                "davinci-002",
+            ]
+        ), f"Prompt loglikelihoods are only supported by OpenAI's API for {['babbage-002', 'davinci-002']}."
+        return super().loglikelihood(requests, **kwargs)

-            string_nll = sum(string_nll)
-            loglikelihoods.append(string_nll)
-        return loglikelihoods
+    def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
+        return ""


-@register_model("openai-chat-completions", "local-chat-completions")
-class OpenaiChatCompletionsLM(LM):
+@register_model("openai-chat-completions")
+class OpenAIChatCompletion(LocalChatCompletion):
    def __init__(
        self,
-        model: str = "gpt-3.5-turbo",  # GPT model or Local model using HuggingFace model paths
-        base_url: str = None,
-        truncate: bool = False,
+        base_url="https://api.openai.com/v1/chat/completions",
+        tokenizer_backend=None,
+        tokenized_requests=False,
        **kwargs,
-    ) -> None:
-        """
-
-        :param model: str
-            Implements an OpenAI-style chat completion API for
-            accessing both OpenAI OR locally-hosted models using
-            HuggingFace Tokenizer
-            OpenAI API model (e.g. gpt-3.5-turbo)
-            using the **gen_kwargs passed on init
-        :param truncate: bool
-            Truncate input if too long (if False and input is too long, throw error)
-        """
-        super().__init__()
-        try:
-            import openai  # noqa: E401
-        except ModuleNotFoundError:
-            raise Exception(
-                "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
-    please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
+    ):
+        if "o1" in kwargs.get("model", ""):
+            eval_logger.warning(
+                "o1 models do not support `stop` and only support temperature=1"
            )
-        self.model = model
-        self.base_url = base_url
-        self.truncate = truncate
-
-        # Read from environment variable OPENAI_API_KEY
-        # Set to EMPTY for local
-        if self.base_url:
-            self.client = openai.OpenAI(base_url=self.base_url)
-        else:
-            self.client = openai.OpenAI()  # openai.AsyncOpenAI()
-
-    @property
-    def max_length(self) -> int:
-        # Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
-        return 2048
-
-    @property
-    def max_gen_toks(self) -> int:
-        return 256
-
-    @property
-    def batch_size(self):
-        # Isn't used because we override _loglikelihood_tokens
-        raise NotImplementedError()
-
-    @property
-    def device(self):
-        # Isn't used because we override _loglikelihood_tokens
-        raise NotImplementedError()
-
-    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
-        res = defaultdict(list)
-        re_ords = {}
+        super().__init__(
+            base_url=base_url,
+            tokenizer_backend=tokenizer_backend,
+            tokenized_requests=tokenized_requests,
+            **kwargs,
+        )

-        # we group requests by their generation_kwargs,
-        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
-        # in the same batch.
-        grouper = lm_eval.models.utils.Grouper(requests, lambda x: str(x.args[1]))
-        for key, reqs in grouper.get_grouped().items():
-            # within each set of reqs for given kwargs, we reorder by token length, descending.
-            re_ords[key] = utils.Reorderer(
-                [req.args for req in reqs], lambda x: (-len(x[0]), x[0])
+    @cached_property
+    def api_key(self):
+        """Override this property to return the API key for the API request."""
+        key = os.environ.get("OPENAI_API_KEY", None)
+        if key is None:
+            raise ValueError(
+                "API key not found. Please set the `OPENAI_API_KEY` environment variable."
            )
+        return key

-        pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
-        for key, re_ord in re_ords.items():
-            # n needs to be 1 because messages in
-            # chat completion are not batch but
-            # is regarded as a single conversation.
-            chunks = lm_eval.models.utils.chunks(re_ord.get_reordered(), n=1)
-            for chunk in chunks:
-                contexts, all_gen_kwargs = zip(*chunk)
-                inps = [{"role": "user", "content": context} for context in contexts]
-
-                gen_kwargs = all_gen_kwargs[0]
-                until = None
-                if isinstance(kwargs := copy.deepcopy(gen_kwargs), dict):
-                    if "do_sample" in kwargs.keys():
-                        kwargs.pop("do_sample")
-                    if "until" in kwargs.keys():
-                        until = kwargs.pop("until")
-                        if isinstance(until, str):
-                            until = [until]
-                        elif not isinstance(until, list):
-                            raise ValueError(
-                                f"Expected repr(kwargs['until']) to be of type Union[str, list] but got {until}"
-                            )
-                        kwargs["stop"] = until
-                    kwargs["max_tokens"] = kwargs.pop("max_gen_toks", self.max_gen_toks)
-                else:
-                    raise ValueError(
-                        f"Expected repr(kwargs) to be of type repr(dict) but got {kwargs}"
-                    )
-
-                response = oa_completion(
-                    client=self.client,
-                    chat=True,
-                    messages=inps,
-                    model=self.model,
-                    **kwargs,
-                )
-
-                for resp, (context, args_) in zip(response.choices, chunk):
-                    s = resp.message.content
-
-                    if until is not None:
-                        for term in until:
-                            if len(term) > 0:
-                                s = s.split(term)[0]
-
-                    res[key].append(s)
-
-                    self.cache_hook.add_partial(
-                        "generate_until", (context, {"until": until}), s
-                    )
-                    pbar.update(1)
-            # reorder this group of results back to original unsorted form
-            res[key] = re_ord.get_original(res[key])
-
-        pbar.close()
-
-        return grouper.get_original(res)
-
-    def loglikelihood(self, requests, disable_tqdm: bool = False):
-        raise NotImplementedError("No support for logits.")
+    def loglikelihood(self, requests, **kwargs):
+        raise NotImplementedError(
+            "Loglikelihood (and therefore `multiple_choice`-type tasks) is not supported for chat completions as OpenAI does not provide prompt logprobs. See https://github.com/EleutherAI/lm-evaluation-harness/issues/942#issuecomment-1777836312 or https://github.com/EleutherAI/lm-evaluation-harness/issues/1196 for more background on this limitation."
+        )

-    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
-        raise NotImplementedError("No support for logits.")
+    def _create_payload(
+        self,
+        messages: List[Dict],
+        generate=False,
+        gen_kwargs: dict = None,
+        seed=1234,
+        **kwargs,
+    ) -> dict:
+        assert (
+            type(messages) is not str
+        ), "chat-completions require the --apply_chat_template flag."
+        gen_kwargs.pop("do_sample", False)
+        if "max_tokens" in gen_kwargs:
+            max_tokens = gen_kwargs.pop("max_tokens")
+        else:
+            max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
+        temperature = gen_kwargs.pop("temperature", 0)
+        stop = gen_kwargs.pop("until", ["<|endoftext|>"])
+        if not isinstance(stop, (list, tuple)):
+            stop = [stop]
+        output = {
+            "messages": messages,
+            "model": self.model,
+            "max_completion_tokens": max_tokens,
+            "temperature": temperature,
+            "stop": stop[:4],
+            "seed": seed,
+            **gen_kwargs,
+        }
+        if "o1" in self.model:
+            output.pop("stop")
+            output["temperature"] = 1
+        return output