Commit bf2517cc authored by lintangsutawika

update latest

parents 8bca751c 7397b965
......@@ -45,6 +45,7 @@ git clone https://github.com/EleutherAI/lm-evaluation-harness
cd lm-evaluation-harness
pip install -e .
```
We also provide a number of optional dependencies for extended functionality. A detailed table is available at the end of this document.
## Basic Usage
......@@ -174,6 +175,7 @@ Note that for externally hosted models, configs such as `--device` and `--batch_
| vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Huggingface Optimum (Causal LMs) | :heavy_check_mark: | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Neuron via AWS Inf2 (Causal LMs) | :heavy_check_mark: | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Your local inference server! | :heavy_check_mark: | `local-completions` or `local-chat-completions` (using `openai-chat-completions` model type) | Any server address that accepts GET requests using HF models and mirrors OpenAI's Completions or ChatCompletions interface | `generate_until` |
Models that do not supply logits or logprobs can only be used with tasks of type `generate_until`, while local models and APIs that supply logprobs/logits for their prompts can be run on all task types: `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
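For reference, a minimal sketch of driving the evaluator from Python (the model string and task name below are illustrative, not prescriptive):
```python
from lm_eval import evaluator

# A local HF causal LM returns logits, so it can run loglikelihood-style
# tasks as well as `generate_until` tasks.
results = evaluator.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["lambada_openai"],
    batch_size=8,
)
print(results["results"])
```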
......@@ -196,7 +198,7 @@ If you have a Metal compatible Mac, you can run the eval harness using the MPS b
> You can inspect what the LM inputs look like by running the following command:
> ```bash
> python write_out.py \
> --tasks all_tasks \
> --tasks <task1,task2,...> \
> --num_fewshot 5 \
> --num_examples 10 \
> --output_base_path /path/to/output/folder
......@@ -312,7 +314,9 @@ Extras dependencies can be installed via `pip install -e ".[NAME]"`
| anthropic | For using Anthropic's models |
| dev | For linting PRs and contributions |
| gptq | For loading models with GPTQ |
| hf_transfer | For speeding up HF Hub file downloads |
| ifeval | For running the IFEval task |
| neuronx | For running on AWS inf2 instances |
| mamba | For loading Mamba SSM models |
| math | For running math task answer checking |
| multilingual | For multilingual tokenizers |
......
......@@ -237,7 +237,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
results = evaluator.simple_evaluate(
model=args.model,
model_args=args.model_args,
tasks=all_tasks,
tasks=task_names,
num_fewshot=args.num_fewshot,
batch_size=args.batch_size,
max_batch_size=args.max_batch_size,
......
......@@ -3,6 +3,7 @@ import math
import random
from collections.abc import Iterable
from collections import defaultdict
from typing import List
import evaluate
import numpy as np
......@@ -459,3 +460,64 @@ def stderr_for_metric(metric, bootstrap_iters):
stderr = {mean: mean_stderr, acc_all: acc_all_stderr}
return stderr.get(metric, None)
def pooled_sample_stderr(stderrs: List[float], sizes: List[int]):
# Used to aggregate bootstrapped stderrs across subtasks in a group,
# when we are weighting by the size of each subtask.
#
assert len(stderrs) == len(sizes)
# formula source: https://en.wikipedia.org/wiki/Pooled_variance
# this empirically matches running `stderr_for_metric` on all instances
# from the subtasks concatenated with each other.
pooled_sample_var = (
sum([(size - 1) * stderr**2 for size, stderr in zip(sizes, stderrs)])
) / (sum(sizes) - len(sizes))
return np.sqrt(pooled_sample_var)
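# Illustrative check (made-up numbers, not part of this change): pooling two
# subtask stderrs of 0.04 (n=100) and 0.02 (n=300) gives
#   pooled_sample_stderr([0.04, 0.02], [100, 300]) ~= 0.026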
def combined_sample_stderr(stderrs: List[float], sizes: List[int], metrics=None):
assert (
metrics is not None
), "Need to pass a list of each subtask's metric for this stderr aggregation"
assert len(stderrs) == len(sizes) and len(sizes) == len(metrics)
# See https://github.com/EleutherAI/lm-evaluation-harness/pull/1390 for more documentation.
# This formula depends on sample means.
# removed because it seems to give erroneously huge stderrs for groupings of tasks
# and does not seem to match up with bootstrap-calculated stderrs for groups.
### don't use this unless a statistician has told you it's the right thing to do ###
# accumulators: we'll aggregate pairwise N - 1 times
variance = stderrs[0] ** 2
curr_size = sizes[0]
curr_score = metrics[0]
for stderr, size, score in zip(stderrs[1:], sizes[1:], metrics[1:]):
curr_score = ((curr_score * curr_size) + (score * size)) / (
curr_size + size
) # NOTE: this assumes our aggregation fn is "mean"
variance = ((curr_size - 1) * variance + (size - 1) * (stderr**2)) / (
curr_size + size - 1
) + curr_size * size / ((curr_size + size) * (curr_size + size - 1)) * (
curr_score - score
) ** 2
return np.sqrt(variance)
def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True):
# A helper function that is used to aggregate
# subtask scores cross-task.
# TODO: does not hold for non-mean aggregations
if not weight_by_size:
sizes = [1] * len(sizes)
assert len(metrics) == len(sizes)
return sum([metric * size for metric, size in zip(metrics, sizes)]) / sum(sizes)
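# Illustrative check (made-up numbers): a size-weighted mean over two subtasks,
#   aggregate_subtask_metrics([0.70, 0.60], [100, 300]) == 0.625
# i.e. (0.70 * 100 + 0.60 * 300) / 400; with weight_by_size=False each subtask
# counts equally and the result would be 0.65.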
import logging
from typing import Callable, Dict
import evaluate
......@@ -75,7 +76,7 @@ def register_group(name):
OUTPUT_TYPE_REGISTRY = {}
METRIC_REGISTRY = {}
METRIC_AGGREGATION_REGISTRY = {}
AGGREGATION_REGISTRY = {}
AGGREGATION_REGISTRY: Dict[str, Callable[[], Dict[str, Callable]]] = {}
HIGHER_IS_BETTER_REGISTRY = {}
DEFAULT_METRIC_REGISTRY = {
......@@ -118,7 +119,7 @@ def register_metric(**args):
return decorate
def get_metric(name, hf_evaluate_metric=False):
def get_metric(name: str, hf_evaluate_metric=False) -> Callable:
if not hf_evaluate_metric:
if name in METRIC_REGISTRY:
return METRIC_REGISTRY[name]
......@@ -136,7 +137,7 @@ def get_metric(name, hf_evaluate_metric=False):
)
def register_aggregation(name):
def register_aggregation(name: str):
def decorate(fn):
assert (
name not in AGGREGATION_REGISTRY
......@@ -148,21 +149,21 @@ def register_aggregation(name):
return decorate
def get_aggregation(name):
def get_aggregation(name: str) -> Callable[[], Dict[str, Callable]]:
try:
return AGGREGATION_REGISTRY[name]
except KeyError:
eval_logger.warning(f"{name} not a registered aggregation metric!")
def get_metric_aggregation(name):
def get_metric_aggregation(name: str) -> Callable[[], Dict[str, Callable]]:
try:
return METRIC_AGGREGATION_REGISTRY[name]
except KeyError:
eval_logger.warning(f"{name} metric is not assigned a default aggregation!")
def is_higher_better(metric_name):
def is_higher_better(metric_name) -> bool:
try:
return HIGHER_IS_BETTER_REGISTRY[metric_name]
except KeyError:
......
......@@ -373,7 +373,7 @@ class Task(abc.ABC):
else:
assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
eval_logger.info(f"Building contexts for task on rank {rank}...")
eval_logger.info(f"Building contexts for {self.config.task} on rank {rank}...")
instances = []
for doc_id, doc in utils.create_iterator(
......@@ -527,6 +527,7 @@ class Task(abc.ABC):
return description + labeled_examples + example
def apply_filters(self):
"""Iterates over FilterEnsembles and applies them to instances"""
if hasattr(self, "_filters"):
for f in self._filters:
f.apply(self._instances)
......@@ -535,15 +536,51 @@ class Task(abc.ABC):
return self._instances
def dump_config(self) -> dict:
"""Returns a dictionary representing the task's config.
:returns: str
The fewshot context.
"""
"""Returns the config as a dictionary."""
# TODO: this should only return the overrides applied to a non-YAML task's configuration.
# (num_fewshot)
return self.config.to_dict()
def set_config(self, key: str, value: Any, update: bool = False) -> None:
"""Set or update the configuration for a given key."""
if key is None:
raise ValueError("Key must be provided.")
if update:
current_value = getattr(self._config, key, {})
if not isinstance(current_value, dict):
raise TypeError(
f"Expected a dict for key '{key}', got {type(current_value).__name__} instead."
)
current_value.update(value)
else:
setattr(self._config, key, value)
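# Usage note (hedged): the evaluator calls this to merge CLI overrides into a
# task, e.g. task.set_config(key="generation_kwargs", value=gen_kwargs, update=True)
# or task.set_config(key="num_fewshot", value=5).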
def override_metric(self, metric_name: str) -> None:
"""
Override the default metrics used for evaluation with custom metrics.
Parameters:
- metric_name (str): The name of the custom metric to override. Should be registered in api.metrics.
"""
(
self._metric_fn_list,
self._aggregation_list,
self._metric_fn_kwargs,
self._higher_is_better,
) = ({}, {}, {}, {})
self._metric_fn_list[metric_name] = get_metric(metric_name)
self._aggregation_list[metric_name] = get_metric_aggregation(metric_name)
self._higher_is_better[metric_name] = is_higher_better(metric_name)
self._metric_fn_kwargs[metric_name] = {}
if not isinstance(self, ConfigurableTask):
self.process_results = lambda x, y: {metric_name: get_metric(metric_name)}
self.aggregation = lambda: {
metric_name: get_metric_aggregation(metric_name)
}
setattr(self._config, "metric_list", [{"metric": metric_name}])
setattr(self._config, "process_results", None)
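# Usage note (hedged): e.g. task.override_metric("acc") swaps the task's
# configured metrics for the single registered metric "acc" and its default
# aggregation; the name must already be registered in api.metrics.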
class ConfigurableTask(Task):
VERSION = "Yaml"
......@@ -849,6 +886,7 @@ class ConfigurableTask(Task):
return labeled_examples + str(example)
def apply_filters(self):
"""Iterates over FilterEnsembles and applies them to instances"""
if hasattr(self, "_filters"):
for f in self._filters:
f.apply(self._instances)
......@@ -1255,37 +1293,6 @@ class ConfigurableTask(Task):
def get_config(self, key: str) -> Any:
return getattr(self._config, key, None)
def override_metric(self, metric_name: str) -> None:
"""
Override the default metrics used for evaluation with custom metrics.
Parameters:
- metric_name (str): The name of the custom metric to override. Should be registered in api.metrics.
"""
(
self._metric_fn_list,
self._aggregation_list,
self._metric_fn_kwargs,
self._higher_is_better,
) = ({}, {}, {}, {})
self._metric_fn_list[metric_name] = get_metric(metric_name)
self._aggregation_list[metric_name] = get_metric_aggregation(metric_name)
self._higher_is_better[metric_name] = is_higher_better(metric_name)
self._metric_fn_kwargs[metric_name] = {}
setattr(self._config, "metric_list", [{"metric": metric_name}])
setattr(self._config, "process_results", None)
def override_config(
self, key: str = None, value: Any = None, update: bool = False
) -> None:
if update:
current_value = getattr(self._config, key)
assert isinstance(current_value, dict)
current_value.update(value)
setattr(self._config, key, current_value)
else:
setattr(self._config, key, value)
class MultipleChoiceTask(Task):
OUTPUT_TYPE: str = "loglikelihood"
......
import random
import itertools
import collections
import torch
import itertools
import logging
import random
from typing import Optional, Union
import numpy as np
import torch
import lm_eval.api
import lm_eval.models
import lm_eval.api.metrics
import lm_eval.api.registry
from lm_eval.tasks import (
get_task_dict,
TaskManager
)
import lm_eval.models
from lm_eval.tasks import TaskManager, get_task_dict
from lm_eval.utils import (
eval_logger,
get_git_commit_hash,
positional_deprecated,
run_task_tests,
get_git_commit_hash,
simple_parse_args_string,
eval_logger
)
@positional_deprecated
def simple_evaluate(
model,
model_args=None,
model_args: Optional[str] = None,
tasks=None,
num_fewshot=None,
batch_size=None,
max_batch_size=None,
device=None,
use_cache=None,
limit=None,
num_fewshot: Optional[int] = None,
batch_size: Optional[int] = None,
max_batch_size: Optional[int] = None,
device: Optional[str] = None,
use_cache: Optional[str] = None,
limit: Optional[Union[int, float]] = None,
bootstrap_iters: int = 100000,
check_integrity: bool = False,
decontamination_ngrams_path=None,
......@@ -138,8 +133,8 @@ def simple_evaluate(
eval_logger.info(
"get_task_dict has been updated to accept an optional argument, `task_manager`"
"Read more here: https://github.com/EleutherAI/lm-evaluation-harness/blob/recursive-groups/docs/interface.md#external-library-usage"
)
"Read more here:https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
)
task_dict = get_task_dict(tasks, task_manager)
for task_name in task_dict.keys():
task_obj = task_dict[task_name]
......@@ -150,7 +145,7 @@ def simple_evaluate(
if task_obj.get_config("output_type") == "generate_until":
if gen_kwargs is not None:
task_obj.override_config(
task_obj.set_config(
key="generation_kwargs", value=gen_kwargs, update=True
)
......@@ -171,7 +166,7 @@ def simple_evaluate(
eval_logger.warning(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
)
task_obj.override_config(key="num_fewshot", value=num_fewshot)
task_obj.set_config(key="num_fewshot", value=num_fewshot)
if check_integrity:
run_task_tests(task_list=tasks)
......@@ -222,8 +217,8 @@ decontaminate_suffix = "_decontaminate"
def evaluate(
lm,
task_dict,
limit=None,
bootstrap_iters: int = 100000,
limit: Optional[int] = None,
bootstrap_iters: Optional[int] = 100000,
decontamination_ngrams_path=None,
write_out: bool = False,
log_samples: bool = True,
......@@ -297,13 +292,9 @@ def evaluate(
versions[task_name] = task.VERSION
configs[task_name] = dict(task.dump_config())
if "num_fewshot" in configs[task_name]:
if configs[task_name]["metadata"]:
n_shot = configs[task_name]["metadata"].get("num_fewshot", None)
if not n_shot:
n_shot = configs[task_name]["num_fewshot"]
else:
n_shot = 0 # TODO: is this always right?
# Number of few-shots for printing.
if (n_shot := configs[task_name].get("num_fewshot")) == 0:
n_shot = configs[task_name].get("metadata", {}).get("num_fewshot", 0)
num_fewshot[task_name] = n_shot
if "task_alias" in configs[task_name]:
......@@ -483,97 +474,70 @@ def evaluate(
vals = vals_torch
if lm.rank == 0:
### Aggregate results over all datapoints ###
# aggregate results ; run bootstrap CIs
for (task_name, key, metric), items in vals.items():
task = task_dict[task_name]
metric_key = metric + "," + key
if isinstance(task, tuple):
group_name, task = task
else:
group_name = None
group_name, task = task if isinstance(task, tuple) else (None, task)
metric_key = f"{metric},{key}"
agg_fn = task.aggregation()[metric]
results[task_name][metric_key] = agg_fn(items)
results[task_name]["samples"] = len(items)
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them less iterations. still looking for a cleaner way to do this
if bootstrap_iters > 0:
stderr = lm_eval.api.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
stderr_fn = lm_eval.api.metrics.stderr_for_metric(
metric=agg_fn,
bootstrap_iters=min(bootstrap_iters, 100)
if metric in ["bleu", "chrf", "ter"]
else bootstrap_iters,
)
if stderr is not None and len(items) > 1:
results[task_name][metric + "_stderr" + "," + key] = stderr(items)
else:
results[task_name][metric + "_stderr" + "," + key] = "N/A"
results[task_name][f"{metric}_stderr,{key}"] = (
stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A"
)
if bool(results):
for group, task_list in reversed(task_hierarchy.items()):
if task_list == []:
# TODO: No samples when bypass
total_size = results[group].get("samples", 999)
else:
total_size = 0
for task in task_list:
metrics = results[task].copy()
if "alias" in metrics:
metrics.pop("alias")
current_size = metrics.pop("samples")
all_stderr = []
for metric in [
key for key in metrics.keys() if "_stderr" not in key
]:
stderr = "_stderr,".join(metric.split(","))
stderr_score = results[task][stderr]
if stderr_score == "N/A":
var_score = "N/A"
else:
var_score = stderr_score**2
all_stderr.append(stderr)
metric_score = results[task][metric]
if metric in results[group]:
results[group][metric] = (
results[group][metric] * total_size
+ metric_score * current_size
) / (total_size + current_size)
# $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$
if var_score == "N/A" or results[group][stderr] == "N/A":
results[group][stderr] = "N/A"
else:
results[group][stderr] = (
(total_size - 1) * results[group][stderr]
+ (current_size - 1) * var_score
) / (
total_size + current_size - 1
) + total_size * current_size / (
(total_size + current_size)
* (total_size + current_size - 1)
) * (
results[group][metric] - metric_score
) ** 2
else:
results[group][metric] = metric_score
results[group][stderr] = var_score
total_size += current_size
for stderr in all_stderr:
results[group][stderr] = np.sqrt(results[group][stderr])
results[group]["samples"] = total_size
if len(task_list) == 0:
# task_hierarchy entries are either
# `group_name: [subtask1, subtask2, ...]`
# or `task_name: []`.
# we only want to operate on groups here.
continue
for metric in [
key
for key in results[task_list[0]].keys()
if "_stderr" not in key and key not in ["alias", "samples"]
]: # TODO: what if tasks don't all share the same metrics
stderr = "_stderr,".join(metric.split(","))
# gather metrics, sizes, and stderrs from subtasks
metrics = [
results[task][metric] for task in task_list
] # TODO: copy?
stderrs = [results[task][stderr] for task in task_list]
sizes = [results[task]["samples"] for task in task_list]
# compute group's pooled metric and stderr
results[group][
metric
] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
# TODO: calculate grouped metric using aggregation fn
if "N/A" in stderrs:
results[group][stderr] = "N/A"
else:
results[group][
stderr
] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
# TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
# To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
# results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
results[group]["samples"] = sum(sizes)
def print_tasks(task_hierarchy, results, tab=0):
results_agg = collections.defaultdict(dict)
......@@ -648,8 +612,10 @@ def evaluate(
groups_agg = {**groups_agg, **_groups_agg}
for group_name, task_list in task_hierarchy.items():
if task_list != []:
num_fewshot[group_name] = num_fewshot[task_list[0]] # TODO: validate this
if task_list:
num_fewshot[group_name] = num_fewshot[
task_list[0]
] # TODO: validate this
results_dict = {
"results": dict(results_agg.items()),
......
......@@ -7,5 +7,16 @@ from . import gguf
from . import vllm_causallms
from . import mamba_lm
from . import optimum_lm
from . import neuron_optimum
# TODO: implement __all__
import os
try:
# enabling faster model download
import hf_transfer
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
except ImportError:
pass
import copy
import os
from datetime import timedelta
from pathlib import Path
from typing import List, Literal, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import transformers
from accelerate import Accelerator, DistributedType, find_executable_batch_size
from accelerate import (
Accelerator,
DistributedType,
InitProcessGroupKwargs,
find_executable_batch_size,
)
from packaging import version
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
......@@ -132,7 +138,8 @@ class HFLM(LM):
assert isinstance(batch_size, (int, str))
gpus = torch.cuda.device_count()
accelerator = Accelerator()
accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
if accelerator.num_processes > 1:
self.accelerator = accelerator
......@@ -617,7 +624,13 @@ class HFLM(LM):
return batch_size
batch_size = forward_batch()
try:
batch_size = forward_batch()
except RuntimeError as e:
if "No executable batch size found" in str(e):
batch_size = 1
else:
raise
if self.world_size > 1:
# if multi-GPU, always take minimum over all selected batch sizes
......@@ -721,6 +734,11 @@ class HFLM(LM):
# and we don't want a warning from HF
generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
do_sample = generation_kwargs.get("do_sample", None)
# The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
generation_kwargs["do_sample"] = do_sample = False
if do_sample is False and generation_kwargs.get("temperature") == 0.0:
generation_kwargs.pop("temperature")
# build stopping criteria
......
import copy
import json
import logging
import subprocess
from collections import defaultdict
from typing import List, Optional, Union
import torch
import torch.nn.functional as F
import transformers
from packaging import version
from tqdm import tqdm
from transformers import GenerationConfig
from transformers.generation import StoppingCriteriaList
from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.utils import stop_sequences_criteria
try:
NEURON_AVAILABLE = True
from optimum.neuron import NeuronModelForCausalLM
from optimum.neuron.generation import TokenSelector
from optimum.neuron.version import __version__ as optimum_neuron_version
except ImportError:
NeuronModelForCausalLM = object
NEURON_AVAILABLE = False
logger = logging.getLogger(__name__)
def get_nc_count() -> Union[int, None]:
"""Returns the number of neuron cores on the current instance."""
try:
cmd = "neuron-ls --json-output"
result = subprocess.run(cmd, shell=True, capture_output=True)
print(f"inferring nc_count from `neuron-ls` {result.stdout}")
json_output = json.loads(result.stdout)
count = sum([x["nc_count"] for x in json_output])
print(f"nc_count={count}")
return count
except Exception:
return None
def wrap_constant_batch_size(func):
def _decorator(self, input_ids):
"""input_ids a 2D array with batch_size on dim=0
makes sure the func runs with self.batch_size
"""
# Neuron-compiled models expect a fixed batch size; pad smaller batches up to it
batch_size = input_ids.shape[0]
if batch_size < self.batch_size:
# handle the event of input_ids.shape[0] != batch_size
# Neuron cores expect constant batch_size
input_ids = torch.concat(
(
input_ids,
# add missing_batch_size dummy
torch.zeros(
[self.batch_size - batch_size, *input_ids.size()[1:]],
dtype=input_ids.dtype,
device=input_ids.device,
),
),
dim=0,
)
elif batch_size > self.batch_size:
raise ValueError(
f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
)
# return the forward pass that requires constant batch size
return func(self, input_ids)[:batch_size]
return _decorator
class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
"""NeuronModelForCausalLM with `stopping_criteria` in `generate`"""
def generate(
self,
input_ids: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
stopping_criteria: Optional["StoppingCriteriaList"] = None,
generation_config: Optional["GenerationConfig"] = None,
**kwargs,
) -> torch.LongTensor:
r"""
A streamlined generate() method overriding the transformers.GenerationMixin.generate() method.
This method uses the same logits processors/warpers and stopping criteria as the transformers library
`generate()` method but restricts the generation to greedy search and sampling.
It does not support transformers `generate()` advanced options.
Please refer to https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate
for details on generation configuration.
Parameters:
input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices.
generation_config (`~transformers.generation.GenerationConfig`, *optional*):
The generation configuration to be used as base parametrization for the generation call. `**kwargs`
passed to generate matching the attributes of `generation_config` will override them. If
`generation_config` is not provided, default will be used, which had the following loading
priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
configuration. Please note that unspecified parameters will inherit [`~transformers.generation.GenerationConfig`]'s
default values, whose documentation should be checked to parameterize generation.
Returns:
`torch.Tensor`: A `torch.FloatTensor`.
"""
# The actual generation configuration is a combination of config and parameters
generation_config = copy.deepcopy(
self.generation_config if generation_config is None else generation_config
)
model_kwargs = generation_config.update(
**kwargs
) # All unused kwargs must be model kwargs
# Check model kwargs are actually used by either prepare_inputs_for_generation or forward
self._validate_model_kwargs(model_kwargs)
# Instantiate a TokenSelector for the specified configuration
selector = TokenSelector.create(
input_ids, generation_config, self, self.max_length
)
selector.stopping_criteria.append(stopping_criteria)
# Verify that the inputs are compatible with the model static input dimensions
batch_size, sequence_length = input_ids.shape
if sequence_length > self.max_length:
raise ValueError(
f"The input sequence length ({sequence_length}) exceeds the model static sequence length ({self.max_length})"
)
padded_input_ids = input_ids
padded_attention_mask = attention_mask
if batch_size > self.batch_size:
raise ValueError(
f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
)
elif batch_size < self.batch_size:
logger.warning(
"Inputs will be padded to match the model static batch size. This will increase latency."
)
padding_shape = [self.batch_size - batch_size, sequence_length]
padding = torch.full(
padding_shape, fill_value=self.config.eos_token_id, dtype=torch.int64
)
padded_input_ids = torch.cat([input_ids, padding])
if attention_mask is not None:
padding = torch.zeros(padding_shape, dtype=torch.int64)
padded_attention_mask = torch.cat([attention_mask, padding])
# Drop the current generation context and clear the Key/Value cache
self.reset_generation()
output_ids = self.generate_tokens(
padded_input_ids,
selector,
batch_size,
attention_mask=padded_attention_mask,
**model_kwargs,
)
return output_ids[:batch_size, :]
@register_model("neuronx")
class NEURON_HF(LM):
"""
Enables usage on AWS Neuron
using the HuggingFace Transformers + Transformers neuronx library.
Tested with neuron 2.17.0
"""
_DEFAULT_MAX_LENGTH = 2048
def __init__(
self,
pretrained: Optional[str] = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
revision: Optional[str] = "main",
tp_degree: Optional[int] = None,
subfolder: Optional[str] = None,
tokenizer: Optional[str] = None,
truncation: Optional[bool] = False,
max_length: Optional[int] = None,
dtype: Optional[Union[str, torch.dtype]] = "auto",
batch_size: Optional[int] = 1,
low_cpu_mem_usage: Optional[bool] = True,
trust_remote_code: Optional[bool] = False,
use_fast_tokenizer: Optional[bool] = True,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
) -> None:
if not NEURON_AVAILABLE:
raise Exception(
"Tried to load neuron model, but neuron is not installed ",
"please install neuron via pip install transformers-neuron ",
"also make sure you are running on an AWS inf2 instance",
)
if version.parse(optimum_neuron_version) != version.parse("0.0.17"):
logger.warning(
'`optimum-neuron` model requires `pip install "optimum[neuronx]>=0.0.17" '
"preferably using the Hugging Face Neuron Deep Learning AMI (Ubuntu 22.04) "
"https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2 "
f"You are using optimum-neuron={optimum_neuron_version}"
)
super().__init__()
assert isinstance(pretrained, str)
assert isinstance(batch_size, (int, str))
self.batch_size_per_gpu = int(batch_size)
batch_size = int(batch_size)
if tp_degree is None:
# execute `neuron-ls --json-output | jq '.[0].nc_count'``
# to get the number of neuron cores on your instance
tp_degree = get_nc_count()
assert isinstance(tp_degree, int), (
f"model_args must include tp_degree. tp_degree must be set to an integer,"
f" but is tp_degree=`{tp_degree}` with type=`{type(tp_degree)}`."
"Set it to number of neuron cores on your instance."
" For inf2.xlarge and inf2.8xlarge, set it to `2`."
" For inf2.24xlarge, set it to `12`."
" For inf2.48xlarge, set it to `24`."
)
# TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "")
self._config = transformers.AutoConfig.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
)
torch_dtype = utils.get_dtype(dtype)
assert torch_dtype in [
torch.float16,
torch.bfloat16,
], "Only float16 and bfloat16 are supported"
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer,
revision=revision,
trust_remote_code=trust_remote_code,
use_fast=use_fast_tokenizer,
)
# Neuron specific code
if torch_dtype == torch.float16:
self.amp_dtype = "f16"
elif torch_dtype == torch.bfloat16:
self.amp_dtype = "bf16"
elif torch_dtype == torch.float32:
self.amp_dtype = "f32"
else:
raise NotImplementedError("Only float16 and bfloat16 are implemented.")
compiler_args = {"num_cores": tp_degree, "auto_cast_type": self.amp_dtype}
input_shapes = {
"batch_size": batch_size,
"sequence_length": self._DEFAULT_MAX_LENGTH,
}
print(
f"{'='*20} \n loading model to neuron with"
f" {compiler_args}, {input_shapes}..."
)
self.model = CustomNeuronModelForCausalLM.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
low_cpu_mem_usage=low_cpu_mem_usage,
export=True,
**compiler_args,
**input_shapes,
)
print(f"SUCCESS: neuron model compiled. \n {'='*20}")
self.truncation = truncation
self.vocab_size = self.tokenizer.vocab_size
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self._max_length = max_length
self.batch_schedule = 1
self.batch_sizes = {}
@property
def config(self):
# return the associated transformers.AutoConfig for the given pretrained model.
return self._config
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def max_length(self):
if self._max_length: # if max length manually set, return it
return self._max_length
seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
for attr in seqlen_config_attrs:
if hasattr(self.model.config, attr):
return getattr(self.model.config, attr)
if hasattr(self.tokenizer, "model_max_length"):
if self.tokenizer.model_max_length == 1000000000000000019884624838656:
return self._DEFAULT_MAX_LENGTH
return self.tokenizer.model_max_length
return self._DEFAULT_MAX_LENGTH
@property
def max_gen_toks(self) -> int:
return 256
@property
def batch_size(self):
return self.batch_size_per_gpu
@property
def device(self):
"""device are neuron cores, but the created tensors are on CPU."""
return "cpu"
@property
def rank(self):
return 0
@property
def world_size(self):
return 1
def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None):
""" """
if add_special_tokens is None:
add_special_tokens = False
encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens)
# left-truncate the encoded context to be at most `left_truncate_len` tokens long
if left_truncate_len:
encoding = encoding[-left_truncate_len:]
return encoding
def tok_batch_encode(
self,
strings: List[str],
padding_side: str = "left",
left_truncate_len: int = None,
truncation: bool = False,
):
# encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
old_padding_side = self.tokenizer.padding_side
self.tokenizer.padding_side = padding_side
add_special_tokens = False
encoding = self.tokenizer(
strings,
truncation=truncation,
padding="longest",
return_tensors="pt",
add_special_tokens=add_special_tokens,
)
if left_truncate_len:
encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
encoding["attention_mask"] = encoding["attention_mask"][
:, -left_truncate_len:
]
self.tokenizer.padding_side = old_padding_side
return encoding["input_ids"], encoding["attention_mask"]
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
@wrap_constant_batch_size
def _model_call(self, input_ids: torch.Tensor):
"""
get logits for the entire sequence
:param input_ids: torch.Tensor
A torch tensor of shape [batch, sequence_cont]
the size of sequence may vary from call to call
:return
A torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model's decoder-lm head
"""
_, sequence_length = input_ids.shape
with torch.inference_mode():
cache_ids = torch.arange(0, sequence_length, dtype=torch.int32).split(1)
input_ids_split = input_ids.split(1, dim=1)
return torch.concat(
[
self.model.forward(
input_ids=input_id, cache_ids=cache_id, return_dict=False
)[0]
for input_id, cache_id in zip(input_ids_split, cache_ids)
],
dim=1,
)
def _model_generate(self, context, max_length, stop, **generation_kwargs):
# we require users to pass do_sample=True explicitly
# for non-greedy gen. This should be reevaluated when considering beam search.
with torch.inference_mode():
if "do_sample" not in generation_kwargs.keys():
generation_kwargs["do_sample"] = False
stopping_criteria = stop_sequences_criteria(
self.tokenizer,
stop + [self.tokenizer.decode([self.config.eos_token_id])],
1,
context.shape[0],
)
return self.model.generate(
input_ids=context,
max_length=max_length,
stopping_criteria=stopping_criteria,
pad_token_id=self.eot_token_id,
use_cache=True,
**generation_kwargs,
)
def _select_cont_toks(self, logits, contlen=None, inplen=None):
assert (
contlen and inplen
), "Must pass input len and cont. len to select scored logits for causal LM"
# discard right-padding.
# also discard the input/context tokens. we'll only score continuations.
logits = logits[inplen - contlen : inplen]
return logits
def _encode_pair(self, context, continuation):
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
whole_enc = self.tok_encode(context + continuation, add_special_tokens=False)
context_enc = self.tok_encode(context, add_special_tokens=False)
# whole_enc = self.tok_encode(context + continuation)
# context_enc = self.tok_encode(context, add_special_tokens=False)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in [req.args for req in requests]:
if context == "":
# end of text as context
context_enc, continuation_enc = (
[self.eot_token_id],
self.tok_encode(continuation),
)
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(self, requests):
loglikelihoods = []
adaptive_batch_size = None
for (string,) in tqdm([req.args for req in requests], disable=(self.rank != 0)):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length,
context_len=1,
),
)
)
# TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
pad_amnt = 0
if self.world_size > 1:
# We pad out the external document-level iterator so the inner iterator doesn't hang
mytensor = torch.tensor(len(rolling_token_windows), device=self.device)
gathered = (
self.accelerator.gather(mytensor).cpu().detach().numpy().tolist()
)
pad_amnt = max(gathered) - gathered[self.rank]
if pad_amnt > 0:
rolling_token_windows += pad_amnt * [rolling_token_windows[0]]
string_nll = self._loglikelihood_tokens(
rolling_token_windows,
disable_tqdm=True,
override_bs=adaptive_batch_size,
)
if (self.world_size > 1) and (pad_amnt > 0):
string_nll = [x[0] for x in string_nll[:-pad_amnt]]
else:
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
def _loglikelihood_tokens(
self, requests, disable_tqdm: bool = False, override_bs=None
):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = x[1] + x[2]
return -len(toks), tuple(toks)
re_ord = utils.Reorderer(requests, _collate)
n_reordered_requests = len(re_ord.get_reordered()) # noqa
# automatic (variable) batch size detection for vectorization
# pull longest context sample from request
chunks = utils.chunks(
re_ord.get_reordered(),
n=self.batch_size,
fn=None,
)
for chunk in tqdm(chunks, disable=(disable_tqdm or (self.rank != 0))):
inps = []
cont_toks_list = []
inplens = []
conts = [] # noqa
encoder_attns = [] # noqa
padding_len_inp = None
padding_len_cont = None # noqa
# because vectorizing is annoying, we first convert each (context, continuation) pair to padded
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
for _, context_enc, continuation_enc in chunk:
# sanity check
assert len(context_enc) > 0
assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length
# how this all works (illustrated on a causal decoder-only setup):
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# model \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the
# cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
# when too long to fit in context, truncate from the left
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
dtype=torch.long,
device=self.device,
)
(inplen,) = inp.shape
padding_len_inp = (
max(padding_len_inp, inplen)
if padding_len_inp is not None
else inplen
)
inps.append(inp) # [1, inp_length]
cont_toks_list.append(continuation_enc)
inplens.append(inplen)
# create encoder attn mask and batched conts, if seq2seq
call_kwargs = {}
batched_inps = utils.pad_and_concat(
padding_len_inp, inps, padding_side="right"
) # [batch, padding_len_inp]
multi_logits = F.log_softmax(
self._model_call(batched_inps, **call_kwargs), dim=-1
) # [batch, padding_length (inp or cont), vocab]
for (cache_key, _, _), logits, inplen, cont_toks in zip(
chunk, multi_logits, inplens, cont_toks_list
):
# Slice to original seq length
contlen = len(cont_toks)
# take only logits in the continuation
# (discard context toks if decoder-only ; discard right-padding)
# also discards + checks for "virtual tokens" in the causal LM's input window
# from prompt/prefix tuning tokens, if applicable
ctx_len = inplen + (logits.shape[0] - padding_len_inp)
logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
logits = logits.unsqueeze(0) # [1, seq, vocab]
# Check if per-token argmax is exactly equal to continuation
greedy_tokens = logits.argmax(dim=-1)
cont_toks = torch.tensor(
cont_toks, dtype=torch.long, device=self.device
).unsqueeze(0) # [1, seq]
max_equal = (greedy_tokens == cont_toks).all()
# Obtain log-probs at the corresponding continuation token indices
# last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
-1
) # [1, seq]
# Answer: (log prob, is-exact-match)
answer = (float(logits.sum()), bool(max_equal))
res.append(answer)
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res)
def generate_until(self, requests):
res = defaultdict(list)
re_ords = {}
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = self.tok_encode(x[0])
return -len(toks), x[0]
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
grouper = utils.Grouper(requests, lambda x: str(x.args[1]))
for key, reqs in grouper.get_grouped().items():
# within each set of reqs for given kwargs, we reorder by token length, descending.
re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate)
pbar = tqdm(total=len(requests), disable=(self.rank != 0))
# for each different set of kwargs, we execute all requests, by batch.
for key, re_ord in re_ords.items():
chunks = utils.chunks(re_ord.get_reordered(), n=self.batch_size)
for chunk in tqdm(chunks, disable=self.rank != 0):
contexts, all_gen_kwargs = zip(*chunk)
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
until = None
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {kwargs}"
)
if not until:
until = [self.tok_decode(self.eot_token_id)]
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
# first stop sequence is used to halt generation upon encountering
primary_until = [until[0]]
max_ctx_len = self.max_length - max_gen_toks
# encode, pad, and truncate contexts for this batch
context_enc, attn_masks = self.tok_batch_encode(
contexts,
left_truncate_len=max_ctx_len,
truncation=self.truncation,
)
context_enc = context_enc.to(self.device)
attn_masks = attn_masks.to(self.device)
if "max_length" not in kwargs:
kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
# perform batched generation
cont = self._model_generate(
context=context_enc,
attention_mask=attn_masks,
stop=primary_until,
**kwargs,
)
cont_toks_list = cont.tolist()
for cont_toks, context in zip(cont_toks_list, contexts):
# discard context + left-padding toks if using causal decoder-only LM
cont_toks = cont_toks[context_enc.shape[1] :]
s = self.tok_decode(cont_toks)
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc
for term in until:
if len(term) > 0:
# ignore '' separator,
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
s = s.split(term)[0]
res[key].append(s)
self.cache_hook.add_partial(
"generate_until", (context, gen_kwargs), s
)
pbar.update(1)
# reorder this group of results back to original unsorted form
res[key] = re_ord.get_original(res[key])
pbar.close()
return grouper.get_original(res)
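# Usage note (untested sketch): on an inf2 host this class can be selected with
# `--model neuronx`, or instantiated directly, e.g.
#   lm = NEURON_HF(pretrained="TinyLlama/TinyLlama-1.1B-Chat-v1.0", tp_degree=2, batch_size=1)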
# Multilingual ARC
### Paper
Title: `Okapi: Instruction-tuned Large Language Models in Multiple Languages with Reinforcement Learning from Human Feedback`
Abstract: https://arxiv.org/abs/2307.16039
A key technology for the development of large language models (LLMs) involves instruction tuning that helps align the models' responses with human expectations to realize impressive learning abilities. Two major approaches for instruction tuning characterize supervised fine-tuning (SFT) and reinforcement learning from human feedback (RLHF), which are currently applied to produce the best commercial LLMs (e.g., ChatGPT). To improve the accessibility of LLMs for research and development efforts, various instruction-tuned open-source LLMs have also been introduced recently, e.g., Alpaca, Vicuna, to name a few. However, existing open-source LLMs have only been instruction-tuned for English and a few popular languages, thus hindering their impacts and accessibility to many other languages in the world. Among a few very recent work to explore instruction tuning for LLMs in multiple languages, SFT has been used as the only approach to instruction-tune LLMs for multiple languages. This has left a significant gap for fine-tuned LLMs based on RLHF in diverse languages and raised important questions on how RLHF can boost the performance of multilingual instruction tuning. To overcome this issue, we present Okapi, the first system with instruction-tuned LLMs based on RLHF for multiple languages. Okapi introduces instruction and response-ranked data in 26 diverse languages to facilitate the experiments and development of future multilingual LLM research. We also present benchmark datasets to enable the evaluation of generative LLMs in multiple languages. Our experiments demonstrate the advantages of RLHF for multilingual instruction over SFT for different base models and datasets. Our framework and resources are released at this https URL.
Homepage: `https://github.com/nlp-uoregon/Okapi`
### Citation
```
@article{dac2023okapi,
title={Okapi: Instruction-tuned Large Language Models in Multiple Languages with Reinforcement Learning from Human Feedback},
author={Dac Lai, Viet and Van Nguyen, Chien and Ngo, Nghia Trung and Nguyen, Thuat and Dernoncourt, Franck and Rossi, Ryan A and Nguyen, Thien Huu},
journal={arXiv e-prints},
pages={arXiv--2307},
year={2023}
}
```
### Groups and Tasks
#### Groups
- arc_multilingual
#### Tasks
- `arc_{ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh}`
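A hedged sketch of loading one of these task configs programmatically (the `arc_de` choice below is just an example):
```python
from lm_eval.tasks import TaskManager, get_task_dict

# TaskManager indexes the bundled YAML task configs; get_task_dict resolves
# task names (and groups) into instantiated Task objects.
task_manager = TaskManager()
task_dict = get_task_dict(["arc_de"], task_manager)
print(task_dict.keys())
```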
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group:
- arc_multilingual
dataset_path: null
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "query"
doc_to_target: "gold"
doc_to_choice: "choices"
should_decontaminate: true
doc_to_decontamination_query: "query"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
include: _arc_yaml
task: arc_ar
dataset_path: alexandrainst/m_arc
dataset_name: ar
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_bn
dataset_path: alexandrainst/m_arc
dataset_name: bn
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_ca
dataset_path: alexandrainst/m_arc
dataset_name: ca
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_da
dataset_path: alexandrainst/m_arc
dataset_name: da
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_de
dataset_path: alexandrainst/m_arc
dataset_name: de
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_es
dataset_path: alexandrainst/m_arc
dataset_name: es
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_eu
dataset_path: alexandrainst/m_arc
dataset_name: eu
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_fr
dataset_path: alexandrainst/m_arc
dataset_name: fr
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_gu
dataset_path: alexandrainst/m_arc
dataset_name: gu
training_split: train
validation_split: validation
test_split: test