Commit bf2517cc authored by lintangsutawika

update latest

parents 8bca751c 7397b965
@@ -45,6 +45,7 @@ git clone https://github.com/EleutherAI/lm-evaluation-harness
cd lm-evaluation-harness
pip install -e .
```
We also provide a number of optional dependencies for extended functionality. A detailed table is available at the end of this document.
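For example, several extras from that table can be combined in a single editable install (the `math` and `ifeval` extras here are just illustrative picks):

```bash
pip install -e ".[math,ifeval]"
```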
## Basic Usage
@@ -174,6 +175,7 @@ Note that for externally hosted models, configs such as `--device` and `--batch_
| vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Huggingface Optimum (Causal LMs) | ✔️ | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Neuron via AWS Inf2 (Causal LMs) | ✔️ | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Your local inference server! | :heavy_check_mark: | `local-completions` or `local-chat-completions` (using `openai-chat-completions` model type) | Any server address that accepts GET requests using HF models and mirrors OpenAI's Completions or ChatCompletions interface | `generate_until` |
Models which do not supply logits or logprobs can be used with tasks of type `generate_until` only, while local models, or APIs that supply logprobs/logits of their prompts, can be run on all task types: `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
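As a rough illustration of the distinction (model and task names here are examples only, and the exact `--model_args` keys depend on the backend):

```bash
# loglikelihood-based task (multiple_choice): needs a backend that returns logits/logprobs, e.g. a local HF model
lm_eval --model hf --model_args pretrained=EleutherAI/pythia-160m --tasks hellaswag --device cuda:0 --batch_size 8

# generation-only task: also works with APIs that return text but no prompt logprobs
lm_eval --model openai-chat-completions --model_args model=gpt-3.5-turbo --tasks gsm8k
```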
@@ -196,7 +198,7 @@ If you have a Metal compatible Mac, you can run the eval harness using the MPS b
> You can inspect what the LM inputs look like by running the following command:
> ```bash
> python write_out.py \
> --tasks <task1,task2,...> \
> --num_fewshot 5 \
> --num_examples 10 \
> --output_base_path /path/to/output/folder
@@ -312,7 +314,9 @@ Extras dependencies can be installed via `pip install -e ".[NAME]"`
| anthropic | For using Anthropic's models |
| dev | For linting PRs and contributions |
| gptq | For loading models with GPTQ |
| hf_transfer | For speeding up HF Hub file downloads |
| ifeval | For running the IFEval task |
| neuronx | For running on AWS inf2 instances |
| mamba | For loading Mamba SSM models |
| math | For running math task answer checking |
| multilingual | For multilingual tokenizers |
......
@@ -237,7 +237,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
        results = evaluator.simple_evaluate(
            model=args.model,
            model_args=args.model_args,
            tasks=task_names,
            num_fewshot=args.num_fewshot,
            batch_size=args.batch_size,
            max_batch_size=args.max_batch_size,
......
@@ -3,6 +3,7 @@ import math
import random
from collections.abc import Iterable
from collections import defaultdict
from typing import List

import evaluate
import numpy as np
@@ -459,3 +460,64 @@ def stderr_for_metric(metric, bootstrap_iters):
    stderr = {mean: mean_stderr, acc_all: acc_all_stderr}

    return stderr.get(metric, None)
def pooled_sample_stderr(stderrs: List[float], sizes: List[int]):
    # Used to aggregate bootstrapped stderrs across subtasks in a group,
    # when we are weighting by the size of each subtask.

    assert len(stderrs) == len(sizes)

    # formula source: https://en.wikipedia.org/wiki/Pooled_variance
    # this empirically matches running `stderr_for_metric` on all instances
    # from the subtasks concatenated with each other.
    pooled_sample_var = (
        sum([(size - 1) * stderr**2 for size, stderr in zip(sizes, stderrs)])
    ) / (sum(sizes) - len(sizes))

    return np.sqrt(pooled_sample_var)
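# In formula form, with subtask stderrs s_i, subtask sizes n_i, and k subtasks:
# $$s_{pooled}^2 = \frac{\sum_i (n_i - 1) s_i^2}{\sum_i n_i - k}$$
# Illustrative (hypothetical) numbers: pooled_sample_stderr([0.02, 0.03], [100, 150]) ≈ 0.026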
def combined_sample_stderr(stderrs: List[float], sizes: List[int], metrics=None):
    assert (
        metrics is not None
    ), "Need to pass a list of each subtask's metric for this stderr aggregation"
    assert len(stderrs) == len(sizes) and len(sizes) == len(metrics)

    # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1390 for more documentation.
    # This formula depends on sample means.
    # removed because it seems to give erroneously huge stderrs for groupings of tasks
    # and does not seem to match up with bootstrap-calculated stderrs for groups.

    ### don't use this unless a statistician has told you it's the right thing to do ###

    # accumulators: we'll aggregate pairwise N - 1 times
    variance = stderrs[0] ** 2
    curr_size = sizes[0]
    curr_score = metrics[0]

    for stderr, size, score in zip(stderrs[1:], sizes[1:], metrics[1:]):
        curr_score = ((curr_score * curr_size) + (score * size)) / (
            curr_size + size
        )  # NOTE: this assumes our aggregation fn is "mean"

        variance = ((curr_size - 1) * variance + (size - 1) * (stderr**2)) / (
            curr_size + size - 1
        ) + curr_size * size / ((curr_size + size) * (curr_size + size - 1)) * (
            curr_score - score
        ) ** 2

    return np.sqrt(variance)
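# For reference, each pairwise update above corresponds to the combination formula
# $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}$$
# for two subsamples of sizes n and m with means \bar x, \bar y and sample stderrs s_x, s_y.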
def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True):
    # A helper function that is used to aggregate
    # subtask scores cross-task.
    # TODO: does not hold for non-mean aggregations
    if not weight_by_size:
        sizes = [1] * len(sizes)

    assert len(metrics) == len(sizes)

    return sum([metric * size for metric, size in zip(metrics, sizes)]) / sum(sizes)
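# Illustrative (hypothetical) numbers, weighting by subtask size:
# aggregate_subtask_metrics([0.5, 0.8], [100, 300]) == (0.5 * 100 + 0.8 * 300) / 400 == 0.725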
import logging
from typing import Callable, Dict

import evaluate

@@ -75,7 +76,7 @@ def register_group(name):
OUTPUT_TYPE_REGISTRY = {}
METRIC_REGISTRY = {}
METRIC_AGGREGATION_REGISTRY = {}
AGGREGATION_REGISTRY: Dict[str, Callable[[], Dict[str, Callable]]] = {}
HIGHER_IS_BETTER_REGISTRY = {}

DEFAULT_METRIC_REGISTRY = {
@@ -118,7 +119,7 @@ def register_metric(**args):
    return decorate

def get_metric(name: str, hf_evaluate_metric=False) -> Callable:
    if not hf_evaluate_metric:
        if name in METRIC_REGISTRY:
            return METRIC_REGISTRY[name]
@@ -136,7 +137,7 @@ def get_metric(name, hf_evaluate_metric=False):
    )
def register_aggregation(name: str):
    def decorate(fn):
        assert (
            name not in AGGREGATION_REGISTRY
@@ -148,21 +149,21 @@ def register_aggregation(name):
    return decorate

def get_aggregation(name: str) -> Callable[[], Dict[str, Callable]]:
    try:
        return AGGREGATION_REGISTRY[name]
    except KeyError:
        eval_logger.warning(f"{name} not a registered aggregation metric!")

def get_metric_aggregation(name: str) -> Callable[[], Dict[str, Callable]]:
    try:
        return METRIC_AGGREGATION_REGISTRY[name]
    except KeyError:
        eval_logger.warning(f"{name} metric is not assigned a default aggregation!")

def is_higher_better(metric_name) -> bool:
    try:
        return HIGHER_IS_BETTER_REGISTRY[metric_name]
    except KeyError:
......
@@ -373,7 +373,7 @@ class Task(abc.ABC):
        else:
            assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"

        eval_logger.info(f"Building contexts for {self.config.task} on rank {rank}...")

        instances = []
        for doc_id, doc in utils.create_iterator(
@@ -527,6 +527,7 @@ class Task(abc.ABC):
        return description + labeled_examples + example

    def apply_filters(self):
        """Iterates over FilterEnsembles and applies them to instances"""
        if hasattr(self, "_filters"):
            for f in self._filters:
                f.apply(self._instances)
@@ -535,15 +536,51 @@ class Task(abc.ABC):
        return self._instances
    def dump_config(self) -> dict:
        """Returns the config as a dictionary."""
        # TODO: this should only return the overrides applied to a non-YAML task's configuration.
        # (num_fewshot)
        return self.config.to_dict()
    def set_config(self, key: str, value: Any, update: bool = False) -> None:
        """Set or update the configuration for a given key."""
        if key is None:
            raise ValueError("Key must be provided.")

        if update:
            current_value = getattr(self._config, key, {})
            if not isinstance(current_value, dict):
                raise TypeError(
                    f"Expected a dict for key '{key}', got {type(current_value).__name__} instead."
                )
            current_value.update(value)
        else:
            setattr(self._config, key, value)

    def override_metric(self, metric_name: str) -> None:
        """
        Override the default metrics used for evaluation with custom metrics.

        Parameters:
        - metric_name (str): The name of the custom metric to override. Should be registered in api.metrics.
        """
        (
            self._metric_fn_list,
            self._aggregation_list,
            self._metric_fn_kwargs,
            self._higher_is_better,
        ) = ({}, {}, {}, {})
        self._metric_fn_list[metric_name] = get_metric(metric_name)
        self._aggregation_list[metric_name] = get_metric_aggregation(metric_name)
        self._higher_is_better[metric_name] = is_higher_better(metric_name)
        self._metric_fn_kwargs[metric_name] = {}
        if not isinstance(self, ConfigurableTask):
            self.process_results = lambda x, y: {metric_name: get_metric(metric_name)}
            self.aggregation = lambda: {
                metric_name: get_metric_aggregation(metric_name)
            }
        setattr(self._config, "metric_list", [{"metric": metric_name}])
        setattr(self._config, "process_results", None)
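    # Illustrative (hypothetical) usage: task.override_metric("exact_match") swaps the task's
    # configured metrics for that single registered metric, along with its default aggregation
    # and higher_is_better flag.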
class ConfigurableTask(Task):
    VERSION = "Yaml"
@@ -849,6 +886,7 @@ class ConfigurableTask(Task):
        return labeled_examples + str(example)

    def apply_filters(self):
        """Iterates over FilterEnsembles and applies them to instances"""
        if hasattr(self, "_filters"):
            for f in self._filters:
                f.apply(self._instances)
@@ -1255,37 +1293,6 @@ class ConfigurableTask(Task):
    def get_config(self, key: str) -> Any:
        return getattr(self._config, key, None)
    def override_metric(self, metric_name: str) -> None:
        """
        Override the default metrics used for evaluation with custom metrics.

        Parameters:
        - metric_name (str): The name of the custom metric to override. Should be registered in api.metrics.
        """
        (
            self._metric_fn_list,
            self._aggregation_list,
            self._metric_fn_kwargs,
            self._higher_is_better,
        ) = ({}, {}, {}, {})
        self._metric_fn_list[metric_name] = get_metric(metric_name)
        self._aggregation_list[metric_name] = get_metric_aggregation(metric_name)
        self._higher_is_better[metric_name] = is_higher_better(metric_name)
        self._metric_fn_kwargs[metric_name] = {}
        setattr(self._config, "metric_list", [{"metric": metric_name}])
        setattr(self._config, "process_results", None)

    def override_config(
        self, key: str = None, value: Any = None, update: bool = False
    ) -> None:
        if update:
            current_value = getattr(self._config, key)
            assert isinstance(current_value, dict)
            current_value.update(value)
            setattr(self._config, key, current_value)
        else:
            setattr(self._config, key, value)
class MultipleChoiceTask(Task):
    OUTPUT_TYPE: str = "loglikelihood"
......
import collections
import itertools
import logging
import random
from typing import Optional, Union

import numpy as np
import torch

import lm_eval.api
import lm_eval.api.metrics
import lm_eval.api.registry
import lm_eval.models
from lm_eval.tasks import TaskManager, get_task_dict
from lm_eval.utils import (
    eval_logger,
    get_git_commit_hash,
    positional_deprecated,
    run_task_tests,
    simple_parse_args_string,
)
@positional_deprecated
def simple_evaluate(
    model,
    model_args: Optional[str] = None,
    tasks=None,
    num_fewshot: Optional[int] = None,
    batch_size: Optional[int] = None,
    max_batch_size: Optional[int] = None,
    device: Optional[str] = None,
    use_cache: Optional[str] = None,
    limit: Optional[Union[int, float]] = None,
    bootstrap_iters: int = 100000,
    check_integrity: bool = False,
    decontamination_ngrams_path=None,
@@ -138,8 +133,8 @@ def simple_evaluate(
            eval_logger.info(
                "get_task_dict has been updated to accept an optional argument, `task_manager`"
                "Read more here: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
            )
        task_dict = get_task_dict(tasks, task_manager)
    for task_name in task_dict.keys():
        task_obj = task_dict[task_name]
@@ -150,7 +145,7 @@ def simple_evaluate(
        if task_obj.get_config("output_type") == "generate_until":
            if gen_kwargs is not None:
                task_obj.set_config(
                    key="generation_kwargs", value=gen_kwargs, update=True
                )
@@ -171,7 +166,7 @@ def simple_evaluate(
                eval_logger.warning(
                    f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
                )
                task_obj.set_config(key="num_fewshot", value=num_fewshot)

    if check_integrity:
        run_task_tests(task_list=tasks)
@@ -222,8 +217,8 @@ decontaminate_suffix = "_decontaminate"
def evaluate(
    lm,
    task_dict,
    limit: Optional[int] = None,
    bootstrap_iters: Optional[int] = 100000,
    decontamination_ngrams_path=None,
    write_out: bool = False,
    log_samples: bool = True,
@@ -297,13 +292,9 @@ def evaluate(
        versions[task_name] = task.VERSION
        configs[task_name] = dict(task.dump_config())

        # Number of few-shots for printing.
        if (n_shot := configs[task_name].get("num_fewshot")) == 0:
            n_shot = configs[task_name].get("metadata", {}).get("num_fewshot", 0)
        num_fewshot[task_name] = n_shot

        if "task_alias" in configs[task_name]:
@@ -483,97 +474,70 @@ def evaluate(
            vals = vals_torch

        if lm.rank == 0:
            ### Aggregate results over all datapoints ###
            # aggregate results ; run bootstrap CIs
            for (task_name, key, metric), items in vals.items():
                task = task_dict[task_name]
                group_name, task = task if isinstance(task, tuple) else (None, task)

                metric_key = f"{metric},{key}"

                agg_fn = task.aggregation()[metric]

                results[task_name][metric_key] = agg_fn(items)
                results[task_name]["samples"] = len(items)

                # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
                # so we run them less iterations. still looking for a cleaner way to do this
                if bootstrap_iters > 0:
                    stderr_fn = lm_eval.api.metrics.stderr_for_metric(
                        metric=agg_fn,
                        bootstrap_iters=min(bootstrap_iters, 100)
                        if metric in ["bleu", "chrf", "ter"]
                        else bootstrap_iters,
                    )

                    results[task_name][f"{metric}_stderr,{key}"] = (
                        stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A"
                    )

            if bool(results):
                for group, task_list in reversed(task_hierarchy.items()):
                    if len(task_list) == 0:
                        # task_hierarchy entries are either
                        # `group_name: [subtask1, subtask2, ...]`
                        # or `task_name: []`.
                        # we only want to operate on groups here.
                        continue

                    for metric in [
                        key
                        for key in results[task_list[0]].keys()
                        if "_stderr" not in key and key not in ["alias", "samples"]
                    ]:  # TODO: what if tasks don't all share the same metrics
                        stderr = "_stderr,".join(metric.split(","))

                        # gather metrics, sizes, and stderrs from subtasks
                        metrics = [
                            results[task][metric] for task in task_list
                        ]  # TODO: copy?
                        stderrs = [results[task][stderr] for task in task_list]
                        sizes = [results[task]["samples"] for task in task_list]

                        # compute group's pooled metric and stderr
                        results[group][
                            metric
                        ] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
                        # TODO: calculate grouped metric using aggregation fn
                        if "N/A" in stderrs:
                            results[group][stderr] = "N/A"
                        else:
                            results[group][
                                stderr
                            ] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
                            # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
                            # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
                            # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)

                        results[group]["samples"] = sum(sizes)
        def print_tasks(task_hierarchy, results, tab=0):
            results_agg = collections.defaultdict(dict)
@@ -648,8 +612,10 @@ def evaluate(
            groups_agg = {**groups_agg, **_groups_agg}

        for group_name, task_list in task_hierarchy.items():
            if task_list:
                num_fewshot[group_name] = num_fewshot[
                    task_list[0]
                ]  # TODO: validate this

        results_dict = {
            "results": dict(results_agg.items()),
......
@@ -7,5 +7,16 @@ from . import gguf
from . import vllm_causallms
from . import mamba_lm
from . import optimum_lm
from . import neuron_optimum

# TODO: implement __all__

import os

try:
    # enabling faster model download
    import hf_transfer

    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
except ImportError:
    pass
import copy
import os
from datetime import timedelta
from pathlib import Path
from typing import List, Literal, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import transformers
from accelerate import (
    Accelerator,
    DistributedType,
    InitProcessGroupKwargs,
    find_executable_batch_size,
)
from packaging import version
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
@@ -132,7 +138,8 @@ class HFLM(LM):
        assert isinstance(batch_size, (int, str))

        gpus = torch.cuda.device_count()
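        # Assumption (not stated in the diff): the 52-week timeout below effectively disables the
        # distributed-init timeout, so slow checkpoint loads or long-running steps on one rank
        # don't abort multi-process runs.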
        accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
        accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
        if accelerator.num_processes > 1:
            self.accelerator = accelerator
@@ -617,7 +624,13 @@ class HFLM(LM):
            return batch_size

        try:
            batch_size = forward_batch()
        except RuntimeError as e:
            if "No executable batch size found" in str(e):
                batch_size = 1
            else:
                raise

        if self.world_size > 1:
            # if multi-GPU, always take minimum over all selected batch sizes
@@ -721,6 +734,11 @@ class HFLM(LM):
            # and we don't want a warning from HF
            generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
            do_sample = generation_kwargs.get("do_sample", None)

            # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
            if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
                generation_kwargs["do_sample"] = do_sample = False

            if do_sample is False and generation_kwargs.get("temperature") == 0.0:
                generation_kwargs.pop("temperature")
            # build stopping criteria
......
# Multilingual ARC
### Paper
Title: `Okapi: Instruction-tuned Large Language Models in Multiple Languages with Reinforcement Learning from Human Feedback`
Abstract: https://arxiv.org/abs/2307.16039
A key technology for the development of large language models (LLMs) involves instruction tuning that helps align the models' responses with human expectations to realize impressive learning abilities. Two major approaches for instruction tuning characterize supervised fine-tuning (SFT) and reinforcement learning from human feedback (RLHF), which are currently applied to produce the best commercial LLMs (e.g., ChatGPT). To improve the accessibility of LLMs for research and development efforts, various instruction-tuned open-source LLMs have also been introduced recently, e.g., Alpaca, Vicuna, to name a few. However, existing open-source LLMs have only been instruction-tuned for English and a few popular languages, thus hindering their impacts and accessibility to many other languages in the world. Among a few very recent work to explore instruction tuning for LLMs in multiple languages, SFT has been used as the only approach to instruction-tune LLMs for multiple languages. This has left a significant gap for fine-tuned LLMs based on RLHF in diverse languages and raised important questions on how RLHF can boost the performance of multilingual instruction tuning. To overcome this issue, we present Okapi, the first system with instruction-tuned LLMs based on RLHF for multiple languages. Okapi introduces instruction and response-ranked data in 26 diverse languages to facilitate the experiments and development of future multilingual LLM research. We also present benchmark datasets to enable the evaluation of generative LLMs in multiple languages. Our experiments demonstrate the advantages of RLHF for multilingual instruction over SFT for different base models and datasets. Our framework and resources are released at this https URL.
Homepage: `https://github.com/nlp-uoregon/Okapi`
### Citation
```
@article{dac2023okapi,
title={Okapi: Instruction-tuned Large Language Models in Multiple Languages with Reinforcement Learning from Human Feedback},
author={Dac Lai, Viet and Van Nguyen, Chien and Ngo, Nghia Trung and Nguyen, Thuat and Dernoncourt, Franck and Rossi, Ryan A and Nguyen, Thien Huu},
journal={arXiv e-prints},
pages={arXiv--2307},
year={2023}
}
```
### Groups and Tasks
#### Groups
- arc_multilingual
#### Tasks
- `arc_{ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh}`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group:
  - arc_multilingual
dataset_path: null
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "query"
doc_to_target: "gold"
doc_to_choice: "choices"
should_decontaminate: true
doc_to_decontamination_query: "query"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
include: _arc_yaml
task: arc_ar
dataset_path: alexandrainst/m_arc
dataset_name: ar
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_bn
dataset_path: alexandrainst/m_arc
dataset_name: bn
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_ca
dataset_path: alexandrainst/m_arc
dataset_name: ca
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_da
dataset_path: alexandrainst/m_arc
dataset_name: da
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_de
dataset_path: alexandrainst/m_arc
dataset_name: de
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_es
dataset_path: alexandrainst/m_arc
dataset_name: es
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_eu
dataset_path: alexandrainst/m_arc
dataset_name: eu
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_fr
dataset_path: alexandrainst/m_arc
dataset_name: fr
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_gu
dataset_path: alexandrainst/m_arc
dataset_name: gu
training_split: train
validation_split: validation
test_split: test