added deepseekv2

74df9bea · zhaoying1 · 74df9bea · 74df9bea · 74df9bea · 74df9bea
Commit 74df9bea authored Sep 02, 2024 by zhaoying1
20 changed files
--- a/LM-Evaluation-Harness-240310/lm_eval/filters/transformation.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/filters/transformation.py
+from lm_eval.api.filter import Filter
+
+
+class LowercaseFilter(Filter):
+    def __init__(self) -> None:
+        pass
+
+    def apply(self, resps, docs):
+        def filter_set(inst):
+            return [resp.lower() for resp in inst]
+
+        return [filter_set(resp) for resp in resps]
+
+
+class UppercaseFilter(Filter):
+    def __init__(self) -> None:
+        pass
+
+    def apply(self, resps, docs):
+        def filter_set(inst):
+            return [resp.upper() for resp in inst]
+
+        return [filter_set(resp) for resp in resps]
+
+
+class MapFilter(Filter):
+    def __init__(self, mapping_dict: dict = None, default_value=None) -> None:
+        """
+        Initializes the MapFilter with a given mapping dictionary and default value.
+
+        Args:
+        - mapping_dict (dict): A dictionary containing the key-value mappings.
+                               Default is an empty dictionary.
+        - default_value (Any): The value to be returned when a key is not found in the mapping_dict.
+                               Default is None.
+
+        Example:
+        mapper = MapFilter({'A': 1, 'B': 2}, default_value=0)
+        """
+        if mapping_dict is None:
+            mapping_dict = {}
+        assert isinstance(
+            mapping_dict, dict
+        ), "Provided mapping_dict is not a dictionary"
+        self.mapping_dict = mapping_dict
+        self.default_value = default_value
+
+    def apply(self, resps, docs):
+        def filter_set(inst):
+            return [self.mapping_dict.get(resp, self.default_value) for resp in inst]
+
+        return [filter_set(resp) for resp in resps]
--- a/LM-Evaluation-Harness-240310/lm_eval/logging_utils.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/logging_utils.py
+import copy
+import json
+import logging
+import os
+import re
+import subprocess
+from pathlib import Path
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+
+import numpy as np
+import pandas as pd
+from packaging.version import Version
+from torch.utils.collect_env import get_pretty_env_info
+from transformers import __version__ as trans_version
+
+
+logger = logging.getLogger(__name__)
+
+
+def remove_none_pattern(input_string: str) -> Tuple[str, bool]:
+    """Remove the ',none' substring from the input_string if it exists at the end.
+
+    Args:
+        input_string (str): The input string from which to remove the ',none' substring.
+
+    Returns:
+        Tuple[str, bool]: A tuple containing the modified input_string with the ',none' substring removed
+                          and a boolean indicating whether the modification was made (True) or not (False).
+    """
+    # Define the pattern to match ',none' at the end of the string
+    pattern = re.compile(r",none$")
+
+    # Use sub() to replace ',none' with an empty string
+    result = re.sub(pattern, "", input_string)
+
+    # check if the input_string changed
+    removed = result != input_string
+
+    return result, removed
+
+
+def _handle_non_serializable(o: Any) -> Union[int, str, list]:
+    """Handle non-serializable objects by converting them to serializable types.
+
+    Args:
+        o (Any): The object to be handled.
+
+    Returns:
+        Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32,
+            it will be converted to int. If the object is of type set, it will be converted
+            to a list. Otherwise, it will be converted to str.
+    """
+    if isinstance(o, np.int64) or isinstance(o, np.int32):
+        return int(o)
+    elif isinstance(o, set):
+        return list(o)
+    else:
+        return str(o)
+
+
+def get_wandb_printer() -> Literal["Printer"]:
+    """Returns a wandb printer instance for pretty stdout."""
+    from wandb.sdk.lib.printer import get_printer
+    from wandb.sdk.wandb_settings import Settings
+
+    printer = get_printer(Settings()._jupyter)
+    return printer
+
+
+class WandbLogger:
+    def __init__(self, **kwargs) -> None:
+        """Attaches to wandb logger if already initialized. Otherwise, passes kwargs to wandb.init()
+
+        Args:
+            kwargs Optional[Any]: Arguments for configuration.
+
+        Parse and log the results returned from evaluator.simple_evaluate() with:
+            wandb_logger.post_init(results)
+            wandb_logger.log_eval_result()
+            wandb_logger.log_eval_samples(results["samples"])
+        """
+        try:
+            import wandb
+
+            assert Version(wandb.__version__) >= Version("0.13.6")
+            if Version(wandb.__version__) < Version("0.13.6"):
+                wandb.require("report-editing:v0")
+        except Exception as e:
+            logger.warning(
+                "To use the wandb reporting functionality please install wandb>=0.13.6.\n"
+                "To install the latest version of wandb run `pip install wandb --upgrade`\n"
+                f"{e}"
+            )
+
+        self.wandb_args: Dict[str, Any] = kwargs
+
+        # initialize a W&B run
+        if wandb.run is None:
+            self.run = wandb.init(**self.wandb_args)
+        else:
+            self.run = wandb.run
+
+        self.printer = get_wandb_printer()
+
+    def post_init(self, results: Dict[str, Any]) -> None:
+        self.results: Dict[str, Any] = copy.deepcopy(results)
+        self.task_names: List[str] = list(results.get("results", {}).keys())
+        self.group_names: List[str] = list(results.get("groups", {}).keys())
+
+    def _get_config(self) -> Dict[str, Any]:
+        """Get configuration parameters."""
+        self.task_configs = self.results.get("configs", {})
+        cli_configs = self.results.get("config", {})
+        configs = {
+            "task_configs": self.task_configs,
+            "cli_configs": cli_configs,
+        }
+
+        return configs
+
+    def _sanitize_results_dict(self) -> Tuple[Dict[str, str], Dict[str, Any]]:
+        """Sanitize the results dictionary."""
+        _results = copy.deepcopy(self.results.get("results", dict()))
+
+        # Remove None from the metric string name
+        tmp_results = copy.deepcopy(_results)
+        for task_name in self.task_names:
+            task_result = tmp_results.get(task_name, dict())
+            for metric_name, metric_value in task_result.items():
+                _metric_name, removed = remove_none_pattern(metric_name)
+                if removed:
+                    _results[task_name][_metric_name] = metric_value
+                    _results[task_name].pop(metric_name)
+
+        # remove string valued keys from the results dict
+        wandb_summary = {}
+        for task in self.task_names:
+            task_result = _results.get(task, dict())
+            for metric_name, metric_value in task_result.items():
+                if isinstance(metric_value, str):
+                    wandb_summary[f"{task}/{metric_name}"] = metric_value
+
+        for summary_metric, summary_value in wandb_summary.items():
+            _task, _summary_metric = summary_metric.split("/")
+            _results[_task].pop(_summary_metric)
+
+        tmp_results = copy.deepcopy(_results)
+        for task_name, task_results in tmp_results.items():
+            for metric_name, metric_value in task_results.items():
+                _results[f"{task_name}/{metric_name}"] = metric_value
+                _results[task_name].pop(metric_name)
+        for task in self.task_names:
+            _results.pop(task)
+
+        return wandb_summary, _results
+
+    def _log_results_as_table(self) -> None:
+        """Generate and log evaluation results as a table to W&B."""
+        columns = [
+            "Version",
+            "Filter",
+            "num_fewshot",
+            "Metric",
+            "Value",
+            "Stderr",
+        ]
+
+        def make_table(columns: List[str], key: str = "results"):
+            import wandb
+
+            table = wandb.Table(columns=columns)
+            results = copy.deepcopy(self.results)
+
+            for k, dic in results.get(key).items():
+                if k in self.group_names and not key == "groups":
+                    continue
+                version = results.get("versions").get(k)
+                if version == "N/A":
+                    version = None
+                n = results.get("n-shot").get(k)
+
+                for (mf), v in dic.items():
+                    m, _, f = mf.partition(",")
+                    if m.endswith("_stderr"):
+                        continue
+                    if m == "alias":
+                        continue
+
+                    if m + "_stderr" + "," + f in dic:
+                        se = dic[m + "_stderr" + "," + f]
+                        if se != "N/A":
+                            se = "%.4f" % se
+                        table.add_data(*[k, version, f, n, m, str(v), str(se)])
+                    else:
+                        table.add_data(*[k, version, f, n, m, str(v), ""])
+
+            return table
+
+        # log the complete eval result to W&B Table
+        table = make_table(["Tasks"] + columns, "results")
+        self.run.log({"evaluation/eval_results": table})
+
+        if "groups" in self.results.keys():
+            table = make_table(["Groups"] + columns, "groups")
+            self.run.log({"evaluation/group_eval_results": table})
+
+    def _log_results_as_artifact(self) -> None:
+        """Log results as JSON artifact to W&B."""
+        import wandb
+
+        dumped = json.dumps(
+            self.results, indent=2, default=_handle_non_serializable, ensure_ascii=False
+        )
+        artifact = wandb.Artifact("results", type="eval_results")
+        with artifact.new_file("results.json", mode="w", encoding="utf-8") as f:
+            f.write(dumped)
+        self.run.log_artifact(artifact)
+
+    def log_eval_result(self) -> None:
+        """Log evaluation results to W&B."""
+        # Log configs to wandb
+        configs = self._get_config()
+        self.run.config.update(configs)
+
+        wandb_summary, self.wandb_results = self._sanitize_results_dict()
+        # update wandb.run.summary with items that were removed
+        self.run.summary.update(wandb_summary)
+        # Log the evaluation metrics to wandb
+        self.run.log(self.wandb_results)
+        # Log the evaluation metrics as W&B Table
+        self._log_results_as_table()
+        # Log the results dict as json to W&B Artifacts
+        self._log_results_as_artifact()
+
+    def _generate_dataset(
+        self, data: List[Dict[str, Any]], config: Dict[str, Any]
+    ) -> pd.DataFrame:
+        """Generate a dataset from evaluation data.
+
+        Args:
+            data (List[Dict[str, Any]]): The data to generate a dataset for.
+            config (Dict[str, Any]): The configuration of the task.
+
+        Returns:
+            pd.DataFrame: A dataframe that is ready to be uploaded to W&B.
+        """
+        ids = [x["doc_id"] for x in data]
+        labels = [x["target"] for x in data]
+        instance = [""] * len(ids)
+        resps = [""] * len(ids)
+        filtered_resps = [""] * len(ids)
+        model_outputs = {}
+
+        metrics_list = config["metric_list"]
+        metrics = {}
+        for metric in metrics_list:
+            metric = metric.get("metric")
+            if metric in ["word_perplexity", "byte_perplexity", "bits_per_byte"]:
+                metrics[f"{metric}_loglikelihood"] = [x[metric][0] for x in data]
+                if metric in ["byte_perplexity", "bits_per_byte"]:
+                    metrics[f"{metric}_bytes"] = [x[metric][1] for x in data]
+                else:
+                    metrics[f"{metric}_words"] = [x[metric][1] for x in data]
+            else:
+                metrics[metric] = [x[metric] for x in data]
+
+        if config["output_type"] == "loglikelihood":
+            instance = [x["arguments"][0][0] for x in data]
+            labels = [x["arguments"][0][1] for x in data]
+            resps = [
+                f'log probability of continuation is {x["resps"][0][0][0]} '
+                + "\n\n"
+                + "continuation will {} generated with greedy sampling".format(
+                    "not be" if not x["resps"][0][0][1] else "be"
+                )
+                for x in data
+            ]
+            filtered_resps = [
+                f'log probability of continuation is {x["filtered_resps"][0][0]} '
+                + "\n\n"
+                + "continuation will {} generated with greedy sampling".format(
+                    "not be" if not x["filtered_resps"][0][1] else "be"
+                )
+                for x in data
+            ]
+        elif config["output_type"] == "multiple_choice":
+            instance = [x["arguments"][0][0] for x in data]
+            choices = [
+                "\n".join([f"{idx}. {y[1]}" for idx, y in enumerate(x["arguments"])])
+                for x in data
+            ]
+            resps = [np.argmax([n[0][0] for n in x["resps"]]) for x in data]
+            filtered_resps = [
+                np.argmax([n[0] for n in x["filtered_resps"]]) for x in data
+            ]
+        elif config["output_type"] == "loglikelihood_rolling":
+            instance = [x["arguments"][0][0] for x in data]
+            resps = [x["resps"][0][0] for x in data]
+            filtered_resps = [x["filtered_resps"][0] for x in data]
+        elif config["output_type"] == "generate_until":
+            instance = [x["arguments"][0][0] for x in data]
+            resps = [x["resps"][0][0] for x in data]
+            filtered_resps = [x["filtered_resps"][0] for x in data]
+
+        model_outputs["raw_predictions"] = resps
+        model_outputs["filtered_predictions"] = filtered_resps
+
+        df_data = {
+            "id": ids,
+            "data": instance,
+        }
+        if config["output_type"] == "multiple_choice":
+            df_data["choices"] = choices
+
+        tmp_data = {
+            "input_len": [len(x) for x in instance],
+            "labels": labels,
+            "output_type": config["output_type"],
+        }
+        df_data.update(tmp_data)
+        df_data.update(model_outputs)
+        df_data.update(metrics)
+
+        return pd.DataFrame(df_data)
+
+    def _log_samples_as_artifact(
+        self, data: List[Dict[str, Any]], task_name: str
+    ) -> None:
+        import wandb
+
+        # log the samples as an artifact
+        dumped = json.dumps(
+            data,
+            indent=2,
+            default=_handle_non_serializable,
+            ensure_ascii=False,
+        )
+        artifact = wandb.Artifact(f"{task_name}", type="samples_by_task")
+        with artifact.new_file(
+            f"{task_name}_eval_samples.json", mode="w", encoding="utf-8"
+        ) as f:
+            f.write(dumped)
+        self.run.log_artifact(artifact)
+        # artifact.wait()
+
+    def log_eval_samples(self, samples: Dict[str, List[Dict[str, Any]]]) -> None:
+        """Log evaluation samples to W&B.
+
+        Args:
+            samples (Dict[str, List[Dict[str, Any]]]): Evaluation samples for each task.
+        """
+        task_names: List[str] = [
+            x for x in self.task_names if x not in self.group_names
+        ]
+
+        ungrouped_tasks = []
+        tasks_by_groups = {}
+
+        for task_name in task_names:
+            group_names = self.task_configs[task_name].get("group", None)
+            if group_names:
+                if isinstance(group_names, str):
+                    group_names = [group_names]
+
+                for group_name in group_names:
+                    if not tasks_by_groups.get(group_name):
+                        tasks_by_groups[group_name] = [task_name]
+                    else:
+                        tasks_by_groups[group_name].append(task_name)
+            else:
+                ungrouped_tasks.append(task_name)
+
+        for task_name in ungrouped_tasks:
+            eval_preds = samples[task_name]
+
+            # log the samples as a W&B Table
+            df = self._generate_dataset(eval_preds, self.task_configs.get(task_name))
+            self.run.log({f"{task_name}_eval_results": df})
+
+            # log the samples as a json file as W&B Artifact
+            self._log_samples_as_artifact(eval_preds, task_name)
+
+        for group, grouped_tasks in tasks_by_groups.items():
+            grouped_df = pd.DataFrame()
+            for task_name in grouped_tasks:
+                eval_preds = samples[task_name]
+                df = self._generate_dataset(
+                    eval_preds, self.task_configs.get(task_name)
+                )
+                df["group"] = group
+                df["task"] = task_name
+                grouped_df = pd.concat([grouped_df, df], ignore_index=True)
+
+                # log the samples as a json file as W&B Artifact
+                self._log_samples_as_artifact(eval_preds, task_name)
+
+            self.run.log({f"{group}_eval_results": grouped_df})
+
+
+def get_commit_from_path(repo_path: Path) -> Optional[str]:
+    git_folder = Path(repo_path, ".git")
+    if git_folder.is_file():
+        git_folder = Path(
+            git_folder.parent,
+            git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1],
+        )
+    if Path(git_folder, "HEAD").exists():
+        head_name = (
+            Path(git_folder, "HEAD")
+            .read_text(encoding="utf-8")
+            .split("\n")[0]
+            .split(" ")[-1]
+        )
+        head_ref = Path(git_folder, head_name)
+        git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "")
+    else:
+        git_hash = None
+    return git_hash
+
+
+def get_git_commit_hash():
+    """
+    Gets the git commit hash of your current repo (if it exists).
+    Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
+    """
+    try:
+        git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
+        git_hash = git_hash.decode()
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        # FileNotFoundError occurs when git not installed on system
+        git_hash = get_commit_from_path(os.getcwd())  # git hash of repo if exists
+    return git_hash
+
+
+def add_env_info(storage: Dict[str, Any]):
+    try:
+        pretty_env_info = get_pretty_env_info()
+    except Exception as err:
+        pretty_env_info = str(err)
+    transformers_version = trans_version
+    upper_dir_commit = get_commit_from_path(
+        Path(os.getcwd(), "..")
+    )  # git hash of upper repo if exists
+    added_info = {
+        "pretty_env_info": pretty_env_info,
+        "transformers_version": transformers_version,
+        "upper_git_hash": upper_dir_commit,  # in case this repo is submodule
+    }
+    storage.update(added_info)
--- a/LM-Evaluation-Harness-240310/lm_eval/models/__init__.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/models/__init__.py
+from . import (
+    anthropic_llms,
+    dummy,
+    gguf,
+    huggingface,
+    mamba_lm,
+    neuron_optimum,
+    openai_completions,
+    optimum_lm,
+    textsynth,
+    vllm_causallms,
+)
+
+
+# TODO: implement __all__
+
+
+try:
+    # enable hf hub transfer if available
+    import hf_transfer  # type: ignore # noqa
+    import huggingface_hub.constants  # type: ignore
+
+    huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
+except ImportError:
+    pass
--- a/LM-Evaluation-Harness-240310/lm_eval/models/anthropic_llms.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/models/anthropic_llms.py
+from typing import Any, List, Tuple
+
+from tqdm import tqdm
+
+from lm_eval import utils
+from lm_eval.api.model import LM
+from lm_eval.api.registry import register_model
+from lm_eval.models.utils import retry_on_specific_exceptions
+
+
+eval_logger = utils.eval_logger
+
+
+def anthropic_completion(
+    client,  #: anthropic.Anthropic,
+    model: str,
+    prompt: str,
+    max_tokens_to_sample: int,
+    temperature: float,
+    stop: List[str],
+    **kwargs: Any,
+) -> str:
+    """Wrapper function around the Anthropic completion API client with exponential back-off
+    in case of RateLimitError.
+
+    params:
+        client: anthropic.Anthropic
+            Anthropic API client
+        model: str
+            Anthropic model e.g. 'claude-instant-v1', 'claude-2'
+        prompt: str
+            Prompt to feed to the model
+        max_tokens_to_sample: int
+            Maximum number of tokens to sample from the model
+        temperature: float
+            Sampling temperature
+        stop: List[str]
+            List of stop sequences
+        kwargs: Any
+            Additional model_args to pass to the API client
+    """
+
+    try:
+        import anthropic
+    except ModuleNotFoundError:
+        raise Exception(
+            "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
+please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e .[anthropic]`",
+        )
+
+    def _exception_callback(e: Exception, sleep_time: float) -> None:
+        eval_logger.warning(
+            f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds"
+        )
+
+    @retry_on_specific_exceptions(
+        on_exceptions=[anthropic.RateLimitError],
+        max_retries=None,  # retry forever, consider changing
+        on_exception_callback=_exception_callback,
+    )
+    def completion():
+        response = client.completions.create(
+            prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}",
+            model=model,
+            # NOTE: Claude really likes to do CoT, and overly aggressive stop sequences
+            #       (e.g. gsm8k's ":") may truncate a lot of the input.
+            stop_sequences=[anthropic.HUMAN_PROMPT] + stop,
+            max_tokens_to_sample=max_tokens_to_sample,
+            temperature=temperature,
+            **kwargs,
+        )
+        return response.completion
+
+    return completion()
+
+
+@register_model("anthropic")
+class AnthropicLM(LM):
+    REQ_CHUNK_SIZE = 20  # TODO: not used
+
+    def __init__(
+        self,
+        batch_size: int = 1,
+        model: str = "claude-2.0",
+        max_tokens_to_sample: int = 256,
+        temperature: float = 0,  # defaults to 1
+        **kwargs,  # top_p, top_k, etc.
+    ) -> None:
+        """Anthropic API wrapper.
+
+        :param model: str
+            Anthropic model e.g. 'claude-instant-v1', 'claude-2'
+        :param max_tokens_to_sample: int
+            Maximum number of tokens to sample from the model
+        :param temperature: float
+            Sampling temperature
+        :param kwargs: Any
+            Additional model_args to pass to the API client
+        """
+        super().__init__()
+
+        try:
+            import anthropic
+        except ModuleNotFoundError:
+            raise Exception(
+                "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
+please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e .[anthropic]`",
+            )
+
+        self.model = model
+        # defaults to os.environ.get("ANTHROPIC_API_KEY")
+        self.client = anthropic.Anthropic()
+        self.temperature = temperature
+        self.max_tokens_to_sample = max_tokens_to_sample
+        self.tokenizer = self.client.get_tokenizer()
+        self.kwargs = kwargs
+
+    @property
+    def eot_token_id(self):
+        # Not sure but anthropic.HUMAN_PROMPT ?
+        raise NotImplementedError("No idea about anthropic tokenization.")
+
+    @property
+    def max_length(self) -> int:
+        return 2048
+
+    @property
+    def max_gen_toks(self) -> int:
+        return self.max_tokens_to_sample
+
+    @property
+    def batch_size(self):
+        # Isn't used because we override _loglikelihood_tokens
+        raise NotImplementedError("No support for logits.")
+
+    @property
+    def device(self):
+        # Isn't used because we override _loglikelihood_tokens
+        raise NotImplementedError("No support for logits.")
+
+    def tok_encode(self, string: str) -> List[int]:
+        return self.tokenizer.encode(string).ids
+
+    def tok_decode(self, tokens: List[int]) -> str:
+        return self.tokenizer.decode(tokens)
+
+    def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
+        raise NotImplementedError("No support for logits.")
+
+    def generate_until(self, requests) -> List[str]:
+        try:
+            import anthropic
+        except ModuleNotFoundError:
+            raise Exception(
+                "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
+please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e .[anthropic]`",
+            )
+
+        if not requests:
+            return []
+
+        _requests: List[Tuple[str, dict]] = [req.args for req in requests]
+
+        res = []
+        for request in tqdm(_requests):
+            try:
+                inp = request[0]
+                request_args = request[1]
+                # generation_kwargs
+                until = request_args.get("until")
+                max_gen_toks = request_args.get("max_gen_toks", self.max_length)
+                temperature = request_args.get("temperature", self.temperature)
+                response = anthropic_completion(
+                    client=self.client,
+                    model=self.model,
+                    prompt=inp,
+                    max_tokens_to_sample=max_gen_toks,
+                    temperature=temperature,  # TODO: implement non-greedy sampling for Anthropic
+                    stop=until,  # type: ignore
+                    **self.kwargs,
+                )
+                res.append(response)
+
+                self.cache_hook.add_partial("generate_until", request, response)
+            except anthropic.APIConnectionError as e:  # type: ignore # noqa: F821
+                eval_logger.critical(f"Server unreachable: {e.__cause__}")
+                break
+            except anthropic.APIStatusError as e:  # type: ignore # noqa: F821
+                eval_logger.critical(f"API error {e.status_code}: {e.message}")
+                break
+
+        return res
+
+    def _model_call(self, inps):
+        # Isn't used because we override _loglikelihood_tokens
+        raise NotImplementedError()
+
+    def _model_generate(self, context, max_length, eos_token_id):
+        # Isn't used because we override generate_until
+        raise NotImplementedError()
+
+    def loglikelihood(self, requests):
+        raise NotImplementedError("No support for logits.")
+
+    def loglikelihood_rolling(self, requests):
+        raise NotImplementedError("No support for logits.")
--- a/LM-Evaluation-Harness-240310/lm_eval/models/dummy.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/models/dummy.py
+import random
+
+from lm_eval.api.model import LM
+from lm_eval.api.registry import register_model
+
+
+@register_model("dummy")
+class DummyLM(LM):
+    def __init__(self) -> None:
+        super().__init__()
+
+    @classmethod
+    def create_from_arg_string(cls, arg_string, additional_config=None):
+        return cls()
+
+    def loglikelihood(self, requests):
+        res = []
+
+        for _ in requests:
+            res.append((-random.random(), False))
+
+        return res
+
+    def generate_until(self, requests):
+        res = []
+
+        for ctx, _ in requests:
+            res.append("lol")
+            assert ctx.strip() != ""
+
+        return res
+
+    def loglikelihood_rolling(self, requests):
+        res = []
+
+        for _ in requests:
+            res.append(-random.random())
+
+        return res
--- a/LM-Evaluation-Harness-240310/lm_eval/models/gguf.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/models/gguf.py
+import logging
+import time
+
+import requests
+from requests.exceptions import RequestException
+from tqdm import tqdm
+
+from lm_eval.api.model import LM
+from lm_eval.api.registry import register_model
+
+
+logger = logging.getLogger(__name__)
+
+
+def get_result(logprobs, context_length):
+    is_greedy = True
+    offsets = logprobs["text_offset"]
+    tokens = logprobs["tokens"]
+    tokens_logprobs = logprobs["token_logprobs"]
+
+    idx = 0
+    while offsets[idx] < context_length:
+        idx += 1
+    continuation_logprobs = sum(tokens_logprobs[idx:-1])
+    for i in range(idx, len(tokens)):
+        token = tokens[i]
+        top_tokens = logprobs["top_logprobs"][i]
+        top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
+        if top_token != token:
+            is_greedy = False
+            break
+
+    return continuation_logprobs, is_greedy
+
+
+@register_model("gguf", "ggml")
+class GGUFLM(LM):
+    def __init__(self, base_url=None, max_length=2048, **kwargs):
+        super().__init__()
+        self.base_url = base_url
+        assert self.base_url, "must pass `base_url` to use GGUF LM!"
+        self.logprobs = 10
+        self.temperature = 0.0
+        self.max_length = max_length
+
+    def gguf_completion(
+        self, context, continuation=None, stop=None, retries=3, delay=5, **kwargs
+    ):
+        for _ in range(retries):
+            try:
+                prompt = context
+                request = {
+                    "prompt": prompt,
+                    "logprobs": self.logprobs,
+                    "temperature": self.temperature,
+                }
+                if continuation:
+                    prompt += continuation
+                    request.update({"prompt": prompt, "max_tokens": 1, "echo": True})
+                if stop is not None:
+                    request["stop"] = stop
+                response = requests.post(
+                    f"{self.base_url}/v1/completions", json=request
+                )
+                response.raise_for_status()
+                return response.json()
+            except RequestException as e:
+                logger.error(f"RequestException: {e}")
+                time.sleep(delay)  # wait before retrying
+        else:
+            raise Exception(f"Failed to get a valid response after {retries} retries.")
+
+    def loglikelihood(self, requests):
+        if not requests:
+            return []
+        res = []
+        for context, continuation in tqdm([req.args for req in requests]):
+            response = self.gguf_completion(context=context, continuation=continuation)
+            if response and "choices" in response and response["choices"]:
+                choice = response["choices"][0]
+                logprobs = choice.get("logprobs")
+                if (
+                    logprobs
+                    and "token_logprobs" in logprobs
+                    and logprobs["token_logprobs"]
+                ):
+                    logprob, is_greedy = get_result(logprobs, len(context))
+                    res.append((logprob, is_greedy))
+                else:
+                    logger.warning(
+                        "Invalid logprobs data. Expected 'logprobs' to contain 'token_logprobs' list."
+                    )
+            else:
+                logger.error(
+                    f"Invalid response for loglikelihood. Response: {response}"
+                )
+                assert False
+        return res
+
+    def generate_until(self, requests):
+        if not requests:
+            return []
+
+        res = []
+        for request in tqdm([req.args for req in requests]):
+            inp = request[0]
+            request_args = request[1]
+            until = request_args.get("until", ["</s>"])
+            response = self.gguf_completion(context=inp, stop=until)
+            if response and "choices" in response and response["choices"]:
+                choice = response["choices"][0]
+                if "text" in choice:
+                    generated_text = choice["text"].strip()
+                    res.append(generated_text)
+                else:
+                    logger.error(
+                        f"Invalid response for greedy_until. Response: {response}"
+                    )
+                    res.append(None)  # Add default value in case of error
+            else:
+                logger.error(f"Invalid response for greedy_until. Response: {response}")
+                res.append(None)  # Add default value in case of error
+        return res
+
+    def loglikelihood_rolling(self, requests):
+        raise NotImplementedError(
+            "loglikelihood_rolling not yet supported for GGUF models"
+        )
--- a/LM-Evaluation-Harness-240310/lm_eval/models/huggingface.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/models/huggingface.py
+import copy
+import os
+from datetime import timedelta
+from pathlib import Path
+from typing import List, Literal, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import transformers
+from accelerate import (
+    Accelerator,
+    DistributedType,
+    InitProcessGroupKwargs,
+    find_executable_batch_size,
+)
+from packaging import version
+from peft import PeftModel
+from peft import __version__ as PEFT_VERSION
+from tqdm import tqdm
+from transformers.models.auto.modeling_auto import (
+    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
+    MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
+)
+
+from lm_eval import utils
+from lm_eval.api.instance import Instance
+from lm_eval.api.model import TemplateLM
+from lm_eval.api.registry import register_model
+from lm_eval.models.utils import (
+    Collator,
+    clear_torch_cache,
+    get_dtype,
+    pad_and_concat,
+    stop_sequences_criteria,
+)
+
+
+eval_logger = utils.eval_logger
+
+
+def _get_accelerate_args(
+    device_map_option: Optional[str] = "auto",
+    max_memory_per_gpu: Optional[Union[int, str]] = None,
+    max_cpu_memory: Optional[Union[int, str]] = None,
+    offload_folder: Optional[str] = "./offload",
+) -> dict:
+    """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
+    max_memory = {}
+    if max_memory_per_gpu is not None:
+        max_memory_per_gpu_map = {
+            device_idx: max_memory_per_gpu
+            for device_idx in range(torch.cuda.device_count())
+        }
+        max_memory.update(max_memory_per_gpu_map)
+    if max_cpu_memory is not None:
+        max_memory["cpu"] = max_cpu_memory
+
+    args = {}
+    if max_memory:
+        args["max_memory"] = max_memory
+    args["device_map"] = device_map_option
+    args["offload_folder"] = offload_folder
+    return args
+
+
+@register_model("hf-auto", "hf", "huggingface")
+class HFLM(TemplateLM):
+    """
+    An abstracted Huggingface model class. Enables usage with both models of
+    `transformers.AutoModelForCausalLM` and `transformers.AutoModelForSeq2SeqLM` classes.
+
+    Supports data-parallel multi-GPU with HF Accelerate.
+    """
+
+    AUTO_MODEL_CLASS = None
+    _DEFAULT_MAX_LENGTH = 2048
+
+    def __init__(
+        self,
+        pretrained: Optional[Union[str, transformers.PreTrainedModel]] = "gpt2",
+        backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
+        # override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
+        revision: Optional[str] = "main",
+        subfolder: Optional[str] = None,
+        tokenizer: Optional[
+            Union[
+                str,
+                transformers.PreTrainedTokenizer,
+                transformers.PreTrainedTokenizerFast,
+            ]
+        ] = None,
+        truncation: Optional[bool] = False,
+        logits_cache: bool = True,
+        max_length: Optional[int] = None,
+        device: Optional[str] = "cuda",
+        dtype: Optional[Union[str, torch.dtype]] = "auto",
+        batch_size: Optional[Union[int, str]] = 1,
+        max_batch_size: Optional[int] = 64,
+        trust_remote_code: Optional[bool] = False,
+        use_fast_tokenizer: Optional[bool] = True,
+        add_bos_token: Optional[bool] = False,
+        # arguments used for splitting a model across GPUs naively.
+        # only used if `parallelize=True`.
+        parallelize: Optional[bool] = False,
+        device_map_option: Optional[str] = "auto",
+        max_memory_per_gpu: Optional[Union[int, str]] = None,
+        max_cpu_memory: Optional[Union[int, str]] = None,
+        offload_folder: Optional[Union[str, os.PathLike]] = "./offload",
+        # PEFT and quantization options
+        peft: Optional[str] = None,
+        autogptq: Optional[Union[bool, str]] = False,
+        **kwargs,
+    ) -> None:
+        super().__init__()
+
+        # optionally: take in an already-initialized transformers.PreTrainedModel
+        if not isinstance(pretrained, str):
+            eval_logger.warning(
+                "`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way."
+            )
+            assert not parallelize, "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`"
+            self._model = pretrained
+            self._device = self._model.device
+            self._config = self._model.config
+            gpus = 0
+
+            if tokenizer:
+                assert isinstance(
+                    tokenizer, transformers.PreTrainedTokenizer
+                ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
+                self.tokenizer = tokenizer
+            else:
+                # Get tokenizer
+                model_name = self._model.name_or_path
+                self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                    model_name,
+                    revision=revision,
+                    trust_remote_code=trust_remote_code,
+                    use_fast=use_fast_tokenizer,
+                )
+
+        else:
+            assert isinstance(device, str)
+            assert isinstance(pretrained, str)
+            assert isinstance(batch_size, (int, str))
+
+            gpus = torch.cuda.device_count()
+            accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
+            accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
+            if accelerator.num_processes > 1:
+                self.accelerator = accelerator
+
+            if not (parallelize or accelerator.num_processes > 1):
+                # use user-passed device
+                device_list = set(
+                    ["cuda", "cpu"]
+                    + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
+                    + ["mps", "mps:0"]
+                )
+                if device and device in device_list:
+                    self._device = torch.device(device)
+                    eval_logger.info(f"Using device '{device}'")
+                    if device in ("mps", "mps:0") and version.parse(
+                        torch.__version__
+                    ) < version.parse("2.1"):
+                        raise RuntimeError(
+                            f"mps requires torch >= 2.1. You have {torch.__version__}"
+                        )
+                else:
+                    eval_logger.info("Device not specified")
+                    eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}")
+                    self._device = (
+                        torch.device("cuda")
+                        if torch.cuda.is_available()
+                        else torch.device("cpu")
+                    )
+            else:
+                if device != "cuda":
+                    eval_logger.info(
+                        f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
+                    )
+                # TODO: include in warning that `load_in_8bit` etc. affect this too
+                self._device = torch.device(device)
+
+            # TODO: update this to be less of a hack once subfolder is fixed in HF
+            revision = revision + ("/" + subfolder if subfolder is not None else "")
+
+            self._get_config(
+                pretrained,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+            )
+
+        # determine which of 'causal' and 'seq2seq' backends to use
+        self._get_backend(
+            config=self.config, backend=backend, trust_remote_code=trust_remote_code
+        )
+
+        # if we passed `pretrained` as a string, initialize our model now
+        if isinstance(pretrained, str):
+            self._create_model(
+                pretrained=pretrained,
+                revision=revision,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+                parallelize=parallelize,
+                device_map_option=device_map_option,
+                max_memory_per_gpu=max_memory_per_gpu,
+                max_cpu_memory=max_cpu_memory,
+                offload_folder=offload_folder,
+                peft=peft,
+                autogptq=autogptq,
+                **kwargs,
+            )
+
+        # access self._model through self.model property outside this method
+        if isinstance(self.model, torch.nn.Module):
+            self.model.eval()
+            self.model.tie_weights()
+
+        if isinstance(pretrained, str) and (gpus >= 1 or str(self.device) == "mps"):
+            # TODO: can remove this whole snippet except in the mps case, perhaps?
+            if not (parallelize or autogptq or hasattr(self, "accelerator")):
+                # place model onto device requested manually,
+                # if not using HF Accelerate or device_map
+                # or any other option that preloads model onto device
+                try:
+                    self.model.to(self.device)
+                except ValueError:
+                    eval_logger.debug(
+                        "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
+                    )
+
+        self._create_tokenizer(
+            pretrained,
+            tokenizer,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            use_fast_tokenizer=use_fast_tokenizer,
+        )
+
+        self.truncation = truncation
+        self.logits_cache = logits_cache
+        self.vocab_size = self.tokenizer.vocab_size
+        # select (or create) a pad token to use
+        if self.tokenizer.pad_token:
+            pass
+        elif self.tokenizer.unk_token:
+            self.tokenizer.pad_token_id = self.tokenizer.unk_token_id
+        elif self.tokenizer.eos_token:
+            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+        else:
+            if getattr(self.config, "model_type", None) == "qwen":
+                # Qwen's trust_remote_code tokenizer does not allow for adding special tokens
+                self.tokenizer.pad_token = "<|endoftext|>"
+            elif (
+                self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
+                or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
+            ):
+                # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0)
+                # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer
+                # ---
+                # Note that the world tokenizer class name, might change in the future for the final huggingface merge
+                # https://github.com/huggingface/transformers/pull/26963
+                assert self.tokenizer.pad_token_id == 0
+            else:
+                self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
+
+        # TODO: override this for Gemma
+        self.add_bos_token = add_bos_token
+        if getattr(self.config, "model_type", None) == "gemma":
+            self.add_bos_token = True
+            eval_logger.info(
+                f"Model type is '{self.config.model_type}', a BOS token will be used as Gemma underperforms without it."
+            )
+
+        self._max_length = max_length
+
+        self.batch_schedule = 1
+        self.batch_sizes = {}
+        self.max_batch_size = max_batch_size
+
+        if str(batch_size).startswith("auto"):
+            batch_size = batch_size.split(":")
+            self.batch_size_per_gpu = batch_size[0]
+            self.batch_schedule = float(batch_size[1]) if len(batch_size) > 1 else 1
+        else:
+            self.batch_size_per_gpu = int(batch_size)
+
+        if isinstance(pretrained, str):
+            # multigpu data-parallel support when launched with accelerate
+            if gpus > 1:
+                if parallelize:
+                    if accelerator.num_processes > 1:
+                        raise RuntimeError(
+                            "Attempted to use both a HF Accelerate `device_map` and to launch via `accelerate launch`. If this is the case, please either remove `parallelize=True` from --model_args or launch outside of the Accelerate launcher."
+                        )
+                    else:
+                        pass
+                elif accelerator.num_processes == 1:
+                    # if we aren't launching via accelerate, ditch
+                    self._rank = 0
+                    self._world_size = 1
+                else:
+                    if gpus > accelerator.num_processes:
+                        eval_logger.warning(
+                            "WARNING: The number of total system GPUs does not match the number of spawned processes. "
+                            "If you would like to use data parallelism, please launch the script "
+                            "with 'accelerate launch *script*'. "
+                            f"Current run will proceed with {accelerator.num_processes} devices."
+                        )
+                    assert (
+                        accelerator.distributed_type
+                        in [
+                            DistributedType.FSDP,
+                            DistributedType.MULTI_GPU,
+                        ]
+                    ), "Unsupported distributed type provided. Only DDP and FSDP are supported."
+                    if accelerator.distributed_type == DistributedType.FSDP:
+                        self._model = accelerator.prepare(self.model)
+                    else:
+                        self._model = accelerator.prepare_model(
+                            self.model, evaluation_mode=True
+                        )
+                    self._device = torch.device(
+                        f"cuda:{accelerator.local_process_index}"
+                    )
+                    self.accelerator = accelerator
+
+                    if self.accelerator.is_local_main_process:
+                        eval_logger.info(f"Using {gpus} devices with data parallelism")
+
+                    self._rank = self.accelerator.local_process_index
+                    self._world_size = self.accelerator.num_processes
+        else:
+            # if a PreTrainedModel was passed into HFLM, we forgo distributed setup.
+            eval_logger.warning(
+                "Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration"
+            )
+            self._rank = 0
+            self._world_size = 1
+
+    @property
+    def config(self):
+        # return the associated transformers.AutoConfig for the given pretrained model.
+        return self._config
+
+    @property
+    def model(self):
+        # returns the model, unwrapping it if using Accelerate
+        if hasattr(self, "accelerator"):
+            return self.accelerator.unwrap_model(self._model)
+        else:
+            return self._model
+
+    @property
+    def eot_token_id(self):
+        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
+        return self.tokenizer.eos_token_id
+
+    @property
+    def max_length(self):
+        if self._max_length:  # if max length manually set, return it
+            return self._max_length
+        seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
+        for attr in seqlen_config_attrs:
+            if hasattr(self.model.config, attr):
+                return getattr(self.model.config, attr)
+        if hasattr(self.tokenizer, "model_max_length"):
+            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
+                return self._DEFAULT_MAX_LENGTH
+            return self.tokenizer.model_max_length
+        return self._DEFAULT_MAX_LENGTH
+
+    @property
+    def max_gen_toks(self) -> int:
+        return 256
+
+    @property
+    def batch_size(self):
+        return self.batch_size_per_gpu
+
+    @property
+    def device(self):
+        return self._device
+
+    @property
+    def rank(self):
+        return self._rank
+
+    @property
+    def world_size(self):
+        return self._world_size
+
+    def _get_backend(
+        self,
+        config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
+        backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
+        trust_remote_code: Optional[bool] = False,
+    ) -> None:
+        """
+        Helper method during initialization.
+        Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder))
+        model type to be used.
+        """
+        assert backend in ["default", "causal", "seq2seq"]
+
+        if backend != "default":
+            # if we've settled on non-default backend, use that manually
+            if backend == "causal":
+                self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
+            elif backend == "seq2seq":
+                self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
+            eval_logger.info(
+                f"Overrode HF model backend type, and using type '{backend}'"
+            )
+        else:
+            # determine and use the default HF backend for this model, based on its config + metadata.
+            if (
+                getattr(config, "model_type")
+                in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
+            ):
+                # first check if model type is listed under seq2seq models, since some
+                # models like MBart are listed in both seq2seq and causal mistakenly in HF transformers.
+                # these special cases should be treated as seq2seq models.
+                self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
+            elif (
+                getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+            ):
+                self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
+            else:
+                if not trust_remote_code:
+                    eval_logger.warning(
+                        "HF model type is neither marked as CausalLM or Seq2SeqLM. \
+                    This is expected if your model requires `trust_remote_code=True` but may be an error otherwise."
+                    )
+                # if model type is neither in HF transformers causal or seq2seq model registries
+                # then we default to AutoModelForCausalLM
+                self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
+
+        assert self.AUTO_MODEL_CLASS in [
+            transformers.AutoModelForCausalLM,
+            transformers.AutoModelForSeq2SeqLM,
+        ]
+        return None
+
+    def _get_config(
+        self,
+        pretrained: str,
+        revision: str = "main",
+        trust_remote_code: bool = False,
+    ) -> None:
+        self._config = transformers.AutoConfig.from_pretrained(
+            pretrained,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+        )
+
+    def _create_model(
+        self,
+        pretrained: str,
+        revision: Optional[str] = "main",
+        dtype: Optional[Union[str, torch.dtype]] = "auto",
+        trust_remote_code: Optional[bool] = False,
+        # arguments used for splitting a model across GPUs naively.
+        # only used if `parallelize=True`.
+        # (accelerate naive PP (device_map) options)
+        parallelize: Optional[bool] = False,
+        device_map_option: Optional[str] = "auto",
+        max_memory_per_gpu: Optional[Union[int, str]] = None,
+        max_cpu_memory: Optional[Union[int, str]] = None,
+        offload_folder: Optional[str] = "./offload",
+        # PEFT and quantization options
+        peft: Optional[str] = None,
+        autogptq: Optional[Union[bool, str]] = False,
+        **kwargs,
+    ) -> None:
+        """
+        Initializes an HF or HF-compatible PreTrainedModel from scratch
+        inside HFLM, using the kwargs passed into self.__init__().
+
+        Also handles functionality such as AutoGPTQ usage and PEFT wrapping.
+
+        For future similar extensions to AutoGPTQ that are not core to HF's ecosystem,
+        (such as PyTorch models that are nearly, but not quite, fully mirroring
+        HF's public interface relied on in this HFLM class)
+        please consider subclassing HFLM and overriding this and other methods as needed.
+        """
+
+        model_kwargs = kwargs if kwargs else {}
+
+        if parallelize:
+            model_kwargs.update(
+                _get_accelerate_args(
+                    device_map_option,  # TODO: phase out device_map_option?
+                    max_memory_per_gpu,
+                    max_cpu_memory,
+                    offload_folder,
+                )
+            )
+        elif "device_map" not in model_kwargs:
+            # set a device_map to initialize model on the right GPU.
+            # this is needed because it seems that the default behavior
+            # for quantized models now seems to be device_map="auto"
+            # which breaks data-parallel mode.
+            if hasattr(self, "accelerator"):
+                model_kwargs.update(
+                    {"device_map": {"": f"cuda:{self.accelerator.local_process_index}"}}
+                )
+            else:
+                model_kwargs.update({"device_map": {"": str(self.device)}})
+
+        if not autogptq:
+            if model_kwargs.get("load_in_4bit", None):
+                assert (
+                    transformers.__version__ >= "4.30.0"
+                ), "load_in_4bit requires transformers >= 4.30.0"
+            if transformers.__version__ >= "4.30.0":
+                if model_kwargs.get("load_in_4bit", None):
+                    if model_kwargs.get("bnb_4bit_compute_dtype", None):
+                        model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(
+                            model_kwargs["bnb_4bit_compute_dtype"]
+                        )
+            self._model = self.AUTO_MODEL_CLASS.from_pretrained(
+                pretrained,
+                revision=revision,
+                torch_dtype=get_dtype(dtype),
+                trust_remote_code=trust_remote_code,
+                **model_kwargs,
+            )
+        else:
+            try:
+                from auto_gptq import AutoGPTQForCausalLM
+            except ModuleNotFoundError:
+                raise Exception(
+                    "Tried to load auto_gptq, but auto-gptq is not installed ",
+                    "please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
+                )
+
+            self._model = AutoGPTQForCausalLM.from_quantized(
+                pretrained,
+                trust_remote_code=trust_remote_code,
+                model_basename=None if autogptq is True else Path(autogptq).stem,
+                use_safetensors=True
+                if autogptq is True
+                else autogptq.endswith(".safetensors"),
+                **model_kwargs,
+            )
+
+        if peft:
+            if model_kwargs.get("load_in_4bit", None):
+                assert PEFT_VERSION >= "0.4.0", "load_in_4bit requires peft >= 0.4.0"
+            self._model = PeftModel.from_pretrained(
+                self._model, peft, revision=revision
+            )
+
+        return None
+
+    def _create_tokenizer(
+        self,
+        pretrained: Union[str, transformers.PreTrainedModel],
+        tokenizer: Optional[
+            Union[
+                str,
+                transformers.PreTrainedTokenizer,
+                transformers.PreTrainedTokenizerFast,
+            ]
+        ],
+        revision: Optional[str] = "main",
+        trust_remote_code: Optional[bool] = False,
+        use_fast_tokenizer: Optional[bool] = True,
+    ) -> None:
+        """
+        Helper method during initialization.
+
+        Create a tokenizer object corresponding to the correct
+        tokenizer for value of `pretrained`, or use the pre-initialized tokenizer passed.
+        """
+
+        if tokenizer:
+            if isinstance(tokenizer, str):
+                self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                    tokenizer,
+                    revision=revision,
+                    trust_remote_code=trust_remote_code,
+                    use_fast=use_fast_tokenizer,
+                )
+            else:
+                assert isinstance(
+                    tokenizer, transformers.PreTrainedTokenizer
+                ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
+                self.tokenizer = tokenizer
+        else:
+            # Get tokenizer based on 'pretrained'
+            if isinstance(pretrained, str):
+                model_name = pretrained
+            else:
+                # get the HF hub name via accessor on model
+                model_name = self.model.name_or_path
+            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                model_name,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+                use_fast=use_fast_tokenizer,
+            )
+        return None
+
+    def _detect_batch_size(self, requests=None, pos: int = 0):
+        if requests:
+            _, context_enc, continuation_enc = requests[pos]
+            max_length = len(
+                (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1]
+            )
+            max_context_enc = len(context_enc[-(self.max_length + 1) :])
+            max_cont_enc = len(continuation_enc[-(self.max_length + 1) :])
+        else:
+            max_length = self.max_length
+
+        # if OOM, then halves batch_size and tries again
+        @find_executable_batch_size(starting_batch_size=self.max_batch_size)
+        def forward_batch(batch_size):
+            if self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+                length = max(max_context_enc, max_cont_enc)
+                batched_conts = torch.ones(
+                    (batch_size, length), device=self.device
+                ).long()
+                test_batch = torch.ones((batch_size, length), device=self.device).long()
+                call_kwargs = {
+                    "attn_mask": test_batch,
+                    "labels": batched_conts,
+                }
+            else:
+                call_kwargs = {}
+                test_batch = torch.ones(
+                    (batch_size, max_length), device=self.device
+                ).long()
+            for _ in range(5):
+                out = F.log_softmax(self._model_call(test_batch, **call_kwargs), dim=-1)  # noqa: F841
+
+            return batch_size
+
+        try:
+            batch_size = forward_batch()
+        except RuntimeError as e:
+            if "No executable batch size found" in str(e):
+                batch_size = 1
+            else:
+                raise
+
+        if self.world_size > 1:
+            # if multi-GPU, always take minimum over all selected batch sizes
+            max_rnk_bs = torch.tensor([batch_size], device=self.device)
+            gathered = (
+                self.accelerator.gather(max_rnk_bs).cpu().detach().numpy().tolist()
+            )
+            batch_size = min(gathered)
+            clear_torch_cache()
+            return batch_size
+
+        clear_torch_cache()
+        return batch_size
+
+    def tok_encode(
+        self, string: str, left_truncate_len=None, add_special_tokens=None
+    ) -> List[int]:
+        """ """
+        if add_special_tokens is None:
+            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                add_special_tokens = False or self.add_bos_token
+            elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+                # TODO: investigate best practices for enc-dec models + special tokens
+                add_special_tokens = True
+
+        encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens)
+
+        # left-truncate the encoded context to be at most `left_truncate_len` tokens long
+        if left_truncate_len:
+            encoding = encoding[-left_truncate_len:]
+
+        return encoding
+
+    def tok_batch_encode(
+        self,
+        strings: List[str],
+        padding_side: str = "left",
+        left_truncate_len: int = None,
+        truncation: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
+        old_padding_side = self.tokenizer.padding_side
+        self.tokenizer.padding_side = padding_side
+
+        if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+            add_special_tokens = False or self.add_bos_token
+        elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+            add_special_tokens = True
+
+        encoding = self.tokenizer(
+            strings,
+            truncation=truncation,
+            padding="longest",
+            return_tensors="pt",
+            add_special_tokens=add_special_tokens,
+        )
+        if left_truncate_len:
+            encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
+            encoding["attention_mask"] = encoding["attention_mask"][
+                :, -left_truncate_len:
+            ]
+        self.tokenizer.padding_side = old_padding_side
+
+        return encoding["input_ids"], encoding["attention_mask"]
+
+    def tok_decode(self, tokens):
+        if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+            return self.tokenizer.decode(tokens)
+        elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+            return self.tokenizer.decode(tokens, skip_special_tokens=True)
+
+    def _model_call(self, inps, attn_mask=None, labels=None):
+        """
+        :param inps: torch.Tensor
+            A torch tensor of shape [batch, (sequence_ctx + sequence_cont)] or of shape
+            [batch, sequence_ctx]. the size of sequence may vary from call to call
+        :param attn_mask: torch.Tensor, optional
+            A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed
+            (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM
+        :param labels: torch.Tensor, optional
+            A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed
+            (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM
+        :return
+            A torch tensor of shape [batch, sequence, vocab] with the
+        logits returned from the model's decoder
+        """
+        with torch.no_grad():
+            if attn_mask is not None or labels is not None:
+                assert attn_mask is not None and labels is not None
+                assert self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM
+                return self.model(
+                    input_ids=inps, attention_mask=attn_mask, labels=labels
+                ).logits
+            else:
+                assert self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
+                return self.model(inps).logits
+
+    def _model_generate(self, context, max_length, stop, **generation_kwargs):
+        # temperature = 0.0 if not set
+        # if do_sample is false and temp==0.0:
+        # remove temperature, as do_sample=False takes care of this
+        # and we don't want a warning from HF
+        generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
+        do_sample = generation_kwargs.get("do_sample", None)
+
+        # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
+        if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
+            generation_kwargs["do_sample"] = do_sample = False
+
+        if do_sample is False and generation_kwargs.get("temperature") == 0.0:
+            generation_kwargs.pop("temperature")
+        # build stopping criteria
+        stopping_criteria = stop_sequences_criteria(
+            self.tokenizer, stop, context.shape[1], context.shape[0]
+        )
+        return self.model.generate(
+            input_ids=context,
+            max_length=max_length,
+            stopping_criteria=stopping_criteria,
+            pad_token_id=self.tokenizer.pad_token_id,
+            use_cache=True,
+            **generation_kwargs,
+        )
+
+    def _select_cont_toks(
+        self, logits: torch.Tensor, contlen: int = None, inplen: int = None
+    ) -> torch.Tensor:
+        if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+            assert (
+                contlen and inplen
+            ), "Must pass input len and cont. len to select scored logits for causal LM"
+            # discard right-padding.
+            # also discard the input/context tokens. we'll only score continuations.
+            logits = logits[inplen - contlen : inplen]
+        elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+            assert (
+                contlen and not inplen
+            ), "Selecting scored logits for Seq2SeqLM requires only cont. len"
+            # only discard right-padding.
+            # the logits input to this fn only contain decoder-side tokens.
+            logits = logits[:contlen]
+
+        return logits
+
+    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
+        loglikelihoods = []
+
+        adaptive_batch_size = None
+        if self.batch_size == "auto":
+            # using rolling window with maximum context
+            print("Passed argument batch_size = auto. Detecting largest batch size")
+            batch_size = self._detect_batch_size()
+            print(f"Determined Largest batch size: {batch_size}")
+            adaptive_batch_size = batch_size
+
+        for (string,) in tqdm([req.args for req in requests], disable=(self.rank != 0)):
+            rolling_token_windows = list(
+                map(
+                    utils.make_disjoint_window,
+                    utils.get_rolling_token_windows(
+                        token_list=self.tok_encode(string),
+                        prefix_token=self.eot_token_id,
+                        max_seq_len=self.max_length,
+                        context_len=1,
+                    ),
+                )
+            )
+
+            # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
+            rolling_token_windows = [(None,) + x for x in rolling_token_windows]
+
+            pad_amnt = 0
+            if self.world_size > 1:
+                # We pad out the external document-level iterator so the inner iterator doesn't hang
+                mytensor = torch.tensor(len(rolling_token_windows), device=self.device)
+                gathered = (
+                    self.accelerator.gather(mytensor).cpu().detach().numpy().tolist()
+                )
+
+                pad_amnt = max(gathered) - gathered[self.rank]
+                if pad_amnt > 0:
+                    rolling_token_windows += pad_amnt * [rolling_token_windows[0]]
+
+            string_nll = self._loglikelihood_tokens(
+                requests=rolling_token_windows,
+                disable_tqdm=True,
+                override_bs=adaptive_batch_size,
+            )
+
+            if (self.world_size > 1) and (pad_amnt > 0):
+                string_nll = [x[0] for x in string_nll[:-pad_amnt]]
+            else:
+                # discard is_greedy
+                string_nll = [x[0] for x in string_nll]
+
+            string_nll = sum(string_nll)
+            loglikelihoods.append(string_nll)
+
+        return loglikelihoods
+
+    def _batch_scheduler(self, pos, n_reordered_requests):
+        sched = pos // int(len(n_reordered_requests) / self.batch_schedule)
+        if sched in self.batch_sizes:
+            return self.batch_sizes[sched]
+        if (len(self.batch_sizes) > 1) and (
+            self.batch_sizes[sched - 1] == self.max_batch_size
+        ):
+            # if previous batch size is already maximal, skip recomputation
+            self.batch_sizes[sched] = self.max_batch_size
+            return self.batch_sizes[sched]
+        print(
+            f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size"
+        )
+        self.batch_sizes[sched] = self._detect_batch_size(n_reordered_requests, pos)
+        print(f"Determined largest batch size: {self.batch_sizes[sched]}")
+        return self.batch_sizes[sched]
+
+    def _loglikelihood_tokens(
+        self,
+        requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
+        disable_tqdm: bool = False,
+        override_bs: int = None,
+    ) -> List[Tuple[float, bool]]:
+        # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
+        res = []
+
+        def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]):
+            """Defines the key for the sorted method"""
+            # the negative sign on len(toks) sorts descending - this has a few advantages:
+            # - time estimates will always be over not underestimates, which is more useful for planning
+            # - to know the size of a batch when going through the list, you know the first one is always the batch
+            #   padded context length. this is useful to simplify the batching logic and more importantly to make
+            #   automatic adaptive batches much much easier to implement
+            # - any OOMs will happen right away rather than near the end
+
+            toks = req[1] + req[2]
+            return -len(toks), tuple(toks)
+
+        def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]):
+            """Defines the key to group and lookup one-token continuations"""
+            # Use with group_by="contexts" (optional)"
+            # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
+            # speeds up some multiple-choice tasks proportionally to the number of choices.
+            # groups requests by context+continuation[:-1] and infer on one request/group.
+            return req[-2] + req[-1][:-1]
+
+        re_ord = Collator(
+            requests,
+            sort_fn=_collate,
+            group_by="contexts"
+            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
+            and self.logits_cache
+            else None,
+            group_fn=_lookup_one_token_cont,
+        )
+
+        # automatic (variable) batch size detection for vectorization
+        # pull longest context sample from request
+        n_reordered_requests = len(re_ord)
+        batch_size = (
+            self.batch_size
+            if self.batch_size != "auto"
+            else override_bs
+            if override_bs is not None
+            else 0
+        )
+        batch_fn = (
+            self._batch_scheduler
+            if self.batch_size == "auto"
+            and n_reordered_requests > 0
+            and not override_bs
+            else None
+        )
+
+        chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn)
+        pbar = tqdm(
+            total=len(requests),
+            disable=(disable_tqdm or (self.rank != 0)),
+            desc="Running loglikelihood requests",
+        )
+        for chunk in chunks:
+            inps = []
+            cont_toks_list = []
+            inplens = []
+
+            conts = []
+            encoder_attns = []
+
+            padding_len_inp = None
+            padding_len_cont = None
+            # because vectorizing is annoying, we first convert each (context, continuation) pair to padded
+            # tensors, then we pack them together into a batch, call the model, and then pick it all apart
+            # again because vectorizing is annoying
+
+            for _, context_enc, continuation_enc in chunk:
+                # sanity check
+                assert len(context_enc) > 0
+                assert len(continuation_enc) > 0
+                assert len(continuation_enc) <= self.max_length
+
+                # how this all works (illustrated on a causal decoder-only setup):
+                #          CTX      CONT
+                # inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by inp[:, :-1]
+                # model  \               \
+                # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
+                # cont_toks      4 5 6 7 8 9      [:, -len(continuation_enc):, :self.vocab_size] slice
+
+                # when too long to fit in context, truncate from the left
+                if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                    inp = torch.tensor(
+                        (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
+                        dtype=torch.long,
+                        device=self.device,
+                    )
+                    (inplen,) = inp.shape
+                elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+                    inp = torch.tensor(
+                        (context_enc)[-self.max_length :],
+                        dtype=torch.long,
+                        device=self.device,
+                    )
+                    (inplen,) = inp.shape
+
+                    # build encoder attn masks
+                    encoder_attns.append(torch.ones_like(inp))
+
+                    cont = torch.tensor(
+                        (continuation_enc)[-self.max_length :],
+                        # TODO: left-shift these?
+                        # TODO: our code assumes we never end up truncating conts for either model type
+                        dtype=torch.long,
+                        device=self.device,
+                    )
+                    (contlen,) = cont.shape
+
+                    conts.append(cont)
+
+                    padding_len_cont = (
+                        max(padding_len_cont, contlen)
+                        if padding_len_cont is not None
+                        else contlen
+                    )
+
+                padding_len_inp = (
+                    max(padding_len_inp, inplen)
+                    if padding_len_inp is not None
+                    else inplen
+                )
+
+                inps.append(inp)  # [1, inp_length]
+                cont_toks_list.append(continuation_enc)
+                inplens.append(inplen)
+
+            # create encoder attn mask and batched conts, if seq2seq
+            call_kwargs = {}
+            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                batched_inps = pad_and_concat(
+                    padding_len_inp, inps, padding_side="right"
+                )  # [batch, padding_len_inp]
+            elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+                # TODO: left-pad encoder inps and mask?
+                batched_inps = pad_and_concat(
+                    padding_len_inp, inps
+                )  # [batch, padding_len_inp]
+                batched_conts = pad_and_concat(
+                    padding_len_cont, conts
+                )  # [batch, padding_len_cont]
+                batched_encoder_mask = pad_and_concat(
+                    padding_len_inp, encoder_attns
+                )  # [batch, padding_len_inp]
+                call_kwargs = {
+                    "attn_mask": batched_encoder_mask,
+                    "labels": batched_conts,
+                }
+
+            multi_logits = F.log_softmax(
+                self._model_call(batched_inps, **call_kwargs), dim=-1
+            )  # [batch, padding_length (inp or cont), vocab]
+
+            for (request_str, ctx_tokens, _), logits, inplen, cont_toks in zip(
+                chunk, multi_logits, inplens, cont_toks_list
+            ):
+                # Slice to original seq length
+                contlen = len(cont_toks)
+                # take only logits in the continuation
+                # (discard context toks if decoder-only ; discard right-padding)
+                # also discards + checks for "virtual tokens" in the causal LM's input window
+                # from prompt/prefix tuning tokens, if applicable
+                ctx_len = (
+                    inplen + (logits.shape[0] - padding_len_inp)
+                    if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
+                    else None
+                )
+                logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
+                logits = logits.unsqueeze(0)  # [1, seq, vocab]
+
+                # Check if per-token argmax is exactly equal to continuation
+                greedy_tokens = logits.argmax(dim=-1)
+
+                # check for one-token continuation cache hits.
+                # noop in case group_by != "contexts" or no cache hit and returns the
+                # original args. Otherwise, expands the logits batch dimension and yields each
+                # batch along with matching continuation tokens and prompt strings.
+                # logits -> [1, seq, vocab]
+                for request_str, cont_toks, logits in re_ord.get_cache(
+                    req_str=request_str,
+                    cxt_toks=ctx_tokens,
+                    cont_toks=cont_toks,
+                    logits=logits,
+                ):
+                    cont_toks = torch.tensor(
+                        cont_toks, dtype=torch.long, device=self.device
+                    ).unsqueeze(0)  # [1, seq]
+                    max_equal = (greedy_tokens == cont_toks).all()
+
+                    # Obtain log-probs at the corresponding continuation token indices
+                    # last_token_slice = logits[:, -1, :].squeeze(0).tolist()
+                    logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
+                        -1
+                    )  # [1, seq]
+
+                    # Answer: (log prob, is-exact-match)
+                    answer = (float(logits.sum()), bool(max_equal))
+
+                    res.append(answer)
+
+                    self.cache_hook.add_partial("loglikelihood", request_str, answer)
+                    pbar.update(1)
+
+        pbar.close()
+
+        return re_ord.get_original(res)
+
+    def generate_until(self, requests: List[Instance]) -> List[str]:
+        res = []
+
+        def _collate(req: Tuple[str, dict]):
+            """Defines the key for the sorted method"""
+            # the negative sign on len(toks) sorts descending - this has a few advantages:
+            # - time estimates will always be over not underestimates, which is more useful for planning
+            # - to know the size of a batch when going through the list, you know the first one is always the batch
+            #   padded context length. this is useful to simplify the batching logic and more importantly to make
+            #   automatic adaptive batches much much easier to implement
+            # - any OOMs will happen right away rather than near the end
+            toks = self.tok_encode(req[0])
+            return -len(toks), req[0]
+
+        pbar = tqdm(
+            total=len(requests),
+            disable=(self.rank != 0),
+            desc="Running generate_until requests",
+        )
+        adaptive_batch_size = None
+        if self.batch_size == "auto":
+            # using rolling window with maximum context
+            print("Passed argument batch_size = auto. Detecting largest batch size")
+            batch_size = self._detect_batch_size()
+            print(f"Determined Largest batch size: {batch_size}")
+            adaptive_batch_size = batch_size
+        # for each different set of kwargs, we execute all requests, by batch.
+        batch_size = (
+            self.batch_size
+            if self.batch_size != "auto"
+            else adaptive_batch_size
+            if adaptive_batch_size is not None
+            else 0
+        )
+        batch_fn = (
+            self._batch_scheduler
+            if self.batch_size == "auto" and not adaptive_batch_size
+            else None
+        )
+
+        # we group requests by their generation_kwargs,
+        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
+        # in the same batch.
+        # group_fn=lambda x: x[1] -> x=(context, gen_kwargs)
+        re_ords = Collator(
+            [reg.args for reg in requests],
+            sort_fn=_collate,
+            group_by="gen_kwargs",
+            group_fn=lambda x: x[1],
+        )
+        chunks = re_ords.get_batched(n=batch_size, batch_fn=batch_fn)
+        for chunk in chunks:
+            contexts, all_gen_kwargs = zip(*chunk)
+            # we assume all gen kwargs in the batch are the same
+            # this is safe to assume because the `grouper` object ensures it.
+            gen_kwargs = all_gen_kwargs[0]
+            # unpack our keyword arguments.
+            until = None
+            if isinstance(gen_kwargs, dict):
+                kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
+                if "until" in kwargs.keys():
+                    until = kwargs.pop("until")
+                    if isinstance(until, str):
+                        until = [kwargs]
+                    elif not isinstance(until, list):
+                        raise ValueError(
+                            f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
+                        )
+            else:
+                raise ValueError(
+                    f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
+                )
+            # add EOS token to stop sequences
+            eos = self.tok_decode(self.eot_token_id)
+            if not until:
+                until = [eos]
+            else:
+                until.append(eos)
+            if "max_gen_toks" in kwargs.keys():
+                max_gen_toks = kwargs.pop("max_gen_toks")
+            else:
+                max_gen_toks = self.max_gen_toks
+
+            # set the max length in tokens of inputs ("context_enc")
+            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                # max len for inputs = max length, minus room to generate the max new tokens
+                max_ctx_len = self.max_length - max_gen_toks
+            elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+                # max len for inputs = encoder's whole max_length
+                max_ctx_len = self.max_length
+
+            # encode, pad, and truncate contexts for this batch
+            context_enc, attn_masks = self.tok_batch_encode(
+                contexts,
+                left_truncate_len=max_ctx_len,
+                truncation=self.truncation,
+            )
+            context_enc = context_enc.to(self.device)
+            attn_masks = attn_masks.to(self.device)
+
+            if "max_length" not in kwargs:
+                kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
+
+            # perform batched generation
+            cont = self._model_generate(
+                context=context_enc,
+                attention_mask=attn_masks,
+                stop=until,
+                **kwargs,
+            )
+
+            cont_toks_list = cont.tolist()
+            for cont_toks, context in zip(cont_toks_list, contexts):
+                # discard context + left-padding toks if using causal decoder-only LM
+                if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                    cont_toks = cont_toks[context_enc.shape[1] :]
+
+                s = self.tok_decode(cont_toks)
+
+                # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
+                for term in until:
+                    if len(term) > 0:
+                        # ignore '' separator,
+                        # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
+                        s = s.split(term)[0]
+
+                res.append(s)
+
+                self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
+                pbar.update(1)
+        # reorder this group of results back to original unsorted form
+        res = re_ords.get_original(res)
+
+        pbar.close()
+
+        return res
--- a/LM-Evaluation-Harness-240310/lm_eval/models/mamba_lm.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/models/mamba_lm.py
+from typing import Optional, Union
+
+import torch
+
+import lm_eval.models.utils
+from lm_eval.api.registry import register_model
+from lm_eval.models.huggingface import HFLM
+
+
+@register_model("mamba_ssm")
+class MambaLMWrapper(HFLM):
+    def __init__(
+        self,
+        pretrained="state-spaces/mamba-130m",
+        **kwargs,
+    ) -> None:
+        """
+        Mamba (via the `mamba_ssm` package) supports the following args:
+        ```
+        d_model: int,
+        n_layer: int,
+        vocab_size: int,
+        initializer_cfg=None,
+        pad_vocab_size_multiple: int = 1,
+        ssm_cfg=None,
+        norm_epsilon: float = 1e-5,
+        rms_norm: bool = False,
+        initializer_cfg=None,
+        fused_add_norm=False,
+        residual_in_fp32=False,
+        ```
+
+        See https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py#L175 for more info.
+        The above can all be passed via `--model_args` or to this __init__() directly
+        but we recommend placing many of these within the config.json file uploaded alongside your
+        Mamba model to the HF Hub instead.
+        All other HuggingFace from_pretrained() kwargs
+        such as those related to
+        `parallelize=True`, PEFT, autoGPTQ,
+        or any sub-configurations of these advanced args,
+        are unsupported by the `mamba_ssm` package.
+
+        The HFLM arguments
+
+        `backend`, `tokenizer`, `truncation`, `max_length`,
+        `device`, `dtype`, `batch_size`, `max_batch_size`, `trust_remote_code`, `use_fast_tokenizer`
+
+        Are all supported by Mamba where they do not conflict
+        with Mamba-specific restrictions such as causal LMs only.
+        """
+
+        if "backend" in kwargs:
+            # mamba currently only supports causal models
+            assert kwargs["backend"] == "causal"
+
+        super().__init__(
+            pretrained=pretrained,
+            # set appropriate defaults for tokenizer, max length, etc
+            backend=kwargs.pop("backend", "causal"),
+            tokenizer=kwargs.pop("tokenizer", "EleutherAI/gpt-neox-20b"),
+            max_length=kwargs.pop("max_length", 2048),
+            **kwargs,
+        )
+
+    def _get_config(
+        self,
+        pretrained: str,
+        **kwargs,
+    ) -> None:
+        try:
+            from mamba_ssm.utils.hf import load_config_hf  # noqa: F811
+        except ModuleNotFoundError:
+            raise Exception(
+                "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
+please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
+            )
+
+        self._config = load_config_hf(pretrained)
+
+    def _create_model(
+        self,
+        pretrained: str,
+        dtype: Optional[Union[str, torch.dtype]] = "float16",
+        # no `parallelize=True` options
+        # no PEFT and quantization options
+        # Mamba does not support arbitrary HF from_pretrained() args
+        **kwargs,
+    ) -> None:
+        try:
+            from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel  # noqa: F811
+        except ModuleNotFoundError:
+            raise Exception(
+                "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
+please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
+            )
+
+        self._model = MambaLMHeadModel.from_pretrained(
+            pretrained,
+            device=self._device,
+            dtype=torch.float16
+            if dtype == "auto"
+            else lm_eval.models.utils.get_dtype(dtype),
+        )
+
+    def _model_generate(self, context, max_length, stop, **generation_kwargs):
+        for key in ("do_sample", "attention_mask"):
+            if key in generation_kwargs:
+                generation_kwargs.pop(key)
+
+        # mamba's custom GenerationMixin currently does not support
+        # passing stopping criteria.
+        # for the time being, we simply generate to max length,
+        # then truncate (equivalent result)
+        # -- this should be revisited to speed up generation
+        # stopping_criteria = stop_sequences_criteria(
+        #     self.tokenizer, stop, 1, context.shape[0]
+        # )
+
+        return self.model.generate(
+            input_ids=context,
+            max_length=max_length,
+            # stopping_criteria=stopping_criteria,
+            # pad_token_id=self.tokenizer.pad_token_id,
+            # use_cache=True,
+            **generation_kwargs,
+        )
--- a/LM-Evaluation-Harness-240310/lm_eval/models/neuron_optimum.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/models/neuron_optimum.py
+import copy
+import json
+import logging
+import subprocess
+from collections import defaultdict
+from typing import List, Optional, Union
+
+import torch
+import torch.nn.functional as F
+import transformers
+from packaging import version
+from tqdm import tqdm
+from transformers import GenerationConfig
+from transformers.generation import StoppingCriteriaList
+
+import lm_eval.models.utils
+from lm_eval import utils
+from lm_eval.api.model import TemplateLM
+from lm_eval.api.registry import register_model
+from lm_eval.models.utils import stop_sequences_criteria
+
+
+try:
+    NEURON_AVAILABLE = True
+    from optimum.neuron import NeuronModelForCausalLM
+    from optimum.neuron.generation import TokenSelector
+    from optimum.neuron.version import __version__ as optimum_neuron_version
+except ImportError:
+    NeuronModelForCausalLM = object
+    NEURON_AVAILABLE = False
+
+
+logger = logging.getLogger(__name__)
+
+
+def get_nc_count() -> Union[int, None]:
+    """Returns the number of neuron cores on the current instance."""
+    try:
+        cmd = "neuron-ls --json-output"
+        result = subprocess.run(cmd, shell=True, capture_output=True)
+        print(f"inferring nc_count from `neuron-ls` {result.stdout}")
+        json_output = json.loads(result.stdout)
+        count = sum([x["nc_count"] for x in json_output])
+        print(f"nc_count={count}")
+        return count
+    except Exception:
+        return None
+
+
+def wrap_constant_batch_size(func):
+    def _decorator(self, input_ids):
+        """input_ids a 2D array with batch_size on dim=0
+
+        makes sure the func runs with self.batch_size
+        """
+        # access a from TestSample
+        batch_size = input_ids.shape[0]
+
+        if batch_size < self.batch_size:
+            # handle the event of input_ids.shape[0] != batch_size
+            # Neuron cores expect constant batch_size
+            input_ids = torch.concat(
+                (
+                    input_ids,
+                    # add missing_batch_size dummy
+                    torch.zeros(
+                        [self.batch_size - batch_size, *input_ids.size()[1:]],
+                        dtype=input_ids.dtype,
+                        device=input_ids.device,
+                    ),
+                ),
+                dim=0,
+            )
+        elif batch_size > self.batch_size:
+            raise ValueError(
+                f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
+            )
+        # return the forward pass that requires constant batch size
+        return func(self, input_ids)[:batch_size]
+
+    return _decorator
+
+
+class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
+    """NeuronModelForCausalLM with `stopping_criteria` in `generate`"""
+
+    def generate(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        stopping_criteria: Optional["StoppingCriteriaList"] = None,
+        generation_config: Optional["GenerationConfig"] = None,
+        **kwargs,
+    ) -> torch.LongTensor:
+        r"""
+        A streamlined generate() method overriding the transformers.GenerationMixin.generate() method.
+
+        This method uses the same logits processors/warpers and stopping criteria as the transformers library
+        `generate()` method but restricts the generation to greedy search and sampling.
+
+        It does not support transformers `generate()` advanced options.
+
+        Please refer to https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate
+        for details on generation configuration.
+
+        Parameters:
+            input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
+                The sequence used as a prompt for the generation.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices.
+            generation_config (`~transformers.generation.GenerationConfig`, *optional*):
+                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
+                passed to generate matching the attributes of `generation_config` will override them. If
+                `generation_config` is not provided, default will be used, which had the following loading
+                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
+                configuration. Please note that unspecified parameters will inherit [`~transformers.generation.GenerationConfig`]'s
+                default values, whose documentation should be checked to parameterize generation.
+
+        Returns:
+            `torch.Tensor`: A  `torch.FloatTensor`.
+        """
+        # The actual generation configuration is a combination of config and parameters
+        generation_config = copy.deepcopy(
+            self.generation_config if generation_config is None else generation_config
+        )
+        model_kwargs = generation_config.update(
+            **kwargs
+        )  # All unused kwargs must be model kwargs
+        # Check model kwargs are actually used by either prepare_inputs_for_generation or forward
+        self._validate_model_kwargs(model_kwargs)
+
+        # Instantiate a TokenSelector for the specified configuration
+        selector = TokenSelector.create(
+            input_ids, generation_config, self, self.max_length
+        )
+        selector.stopping_criteria.append(stopping_criteria)
+        # Verify that the inputs are compatible with the model static input dimensions
+        batch_size, sequence_length = input_ids.shape
+        if sequence_length > self.max_length:
+            raise ValueError(
+                f"The input sequence length ({sequence_length}) exceeds the model static sequence length ({self.max_length})"
+            )
+        padded_input_ids = input_ids
+        padded_attention_mask = attention_mask
+        if batch_size > self.batch_size:
+            raise ValueError(
+                f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
+            )
+        elif batch_size < self.batch_size:
+            logger.warning(
+                "Inputs will be padded to match the model static batch size. This will increase latency."
+            )
+            padding_shape = [self.batch_size - batch_size, sequence_length]
+            padding = torch.full(
+                padding_shape, fill_value=self.config.eos_token_id, dtype=torch.int64
+            )
+            padded_input_ids = torch.cat([input_ids, padding])
+            if attention_mask is not None:
+                padding = torch.zeros(padding_shape, dtype=torch.int64)
+                padded_attention_mask = torch.cat([attention_mask, padding])
+        # Drop the current generation context and clear the Key/Value cache
+        self.reset_generation()
+
+        output_ids = self.generate_tokens(
+            padded_input_ids,
+            selector,
+            batch_size,
+            attention_mask=padded_attention_mask,
+            **model_kwargs,
+        )
+        return output_ids[:batch_size, :]
+
+
+@register_model("neuronx")
+class NEURON_HF(TemplateLM):
+    """
+    Enables usage with on AWS Neuron
+    using the HuggingFace Transformers + Transformers neuronx library.
+    Tested with neuron 2.17.0
+    """
+
+    _DEFAULT_MAX_LENGTH = 2048
+
+    def __init__(
+        self,
+        pretrained: Optional[str] = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        revision: Optional[str] = "main",
+        tp_degree: Optional[int] = None,
+        subfolder: Optional[str] = None,
+        tokenizer: Optional[str] = None,
+        truncation: Optional[bool] = False,
+        max_length: Optional[int] = None,
+        dtype: Optional[Union[str, torch.dtype]] = "auto",
+        batch_size: Optional[int] = 1,
+        low_cpu_mem_usage: Optional[bool] = True,
+        trust_remote_code: Optional[bool] = False,
+        use_fast_tokenizer: Optional[bool] = True,
+        add_bos_token: Optional[bool] = False,
+    ) -> None:
+        if not NEURON_AVAILABLE:
+            raise Exception(
+                "Tried to load neuron model, but neuron is not installed ",
+                "please install neuron via pip install transformers-neuron ",
+                "also make sure you are running on an AWS inf2 instance",
+            )
+        if version.parse(optimum_neuron_version) != version.parse("0.0.17"):
+            logger.warning(
+                '`optimum-neuron` model requires `pip install "optimum[neuronx]>=0.0.17" '
+                "preferably using the Hugging Face Neuron Deep Learning AMI (Ubuntu 22.04) "
+                "https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2 "
+                f"You are using optimum-neuron={optimum_neuron_version}"
+            )
+        super().__init__()
+
+        assert isinstance(pretrained, str)
+        assert isinstance(batch_size, (int, str))
+
+        self.batch_size_per_gpu = int(batch_size)
+        batch_size = int(batch_size)
+        if tp_degree is None:
+            # execute `neuron-ls --json-output | jq '.[0].nc_count'``
+            # to get the number of neuron cores on your instance
+            tp_degree = get_nc_count()
+
+        assert isinstance(tp_degree, int), (
+            f"model_args must include tp_degree. tp_degree must be set to an integer,"
+            f" but is tp_degree=`{tp_degree}` with type=`{type(tp_degree)}`."
+            "Set it to number of neuron cores on your instance."
+            " For inf2.xlarge and inf2.8xlarge, set it to `2`."
+            " For inf2.24xlarge, set it to `12`."
+            " For inf2.48xlarge, set it to `24`."
+        )
+
+        # TODO: update this to be less of a hack once subfolder is fixed in HF
+        revision = revision + ("/" + subfolder if subfolder is not None else "")
+
+        self._config = transformers.AutoConfig.from_pretrained(
+            pretrained,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+        )
+        torch_dtype = lm_eval.models.utils.get_dtype(dtype)
+
+        assert torch_dtype in [
+            torch.float16,
+            torch.bfloat16,
+        ], "Only float16 and bfloat16 are supported"
+
+        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+            pretrained if tokenizer is None else tokenizer,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            use_fast=use_fast_tokenizer,
+        )
+
+        # Neuron specific code
+        if torch_dtype == torch.float16:
+            self.amp_dtype = "f16"
+        elif torch_dtype == torch.bfloat16:
+            self.amp_dtype = "bf16"
+        elif torch_dtype == torch.float32:
+            self.amp_dtype = "f32"
+        else:
+            raise NotImplementedError("Only float16 and bfloat16 are implemented.")
+
+        compiler_args = {"num_cores": tp_degree, "auto_cast_type": self.amp_dtype}
+        input_shapes = {
+            "batch_size": batch_size,
+            "sequence_length": self._DEFAULT_MAX_LENGTH,
+        }
+
+        print(
+            f"{'='*20} \n loading model to neuron with"
+            f" {compiler_args}, {input_shapes}..."
+        )
+        self.model = CustomNeuronModelForCausalLM.from_pretrained(
+            pretrained,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            low_cpu_mem_usage=low_cpu_mem_usage,
+            export=True,
+            **compiler_args,
+            **input_shapes,
+        )
+        print(f"SUCCESS: neuron model compiled. \n {'='*20}")
+
+        self.truncation = truncation
+
+        self.vocab_size = self.tokenizer.vocab_size
+        self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+        self.add_bos_token = self.add_bos_token
+
+        self._max_length = max_length
+
+        self.batch_schedule = 1
+        self.batch_sizes = {}
+
+    @property
+    def config(self):
+        # return the associated transformers.AutoConfig for the given pretrained model.
+        return self._config
+
+    @property
+    def eot_token_id(self):
+        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
+        return self.tokenizer.eos_token_id
+
+    @property
+    def max_length(self):
+        if self._max_length:  # if max length manually set, return it
+            return self._max_length
+        seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
+        for attr in seqlen_config_attrs:
+            if hasattr(self.model.config, attr):
+                return getattr(self.model.config, attr)
+        if hasattr(self.tokenizer, "model_max_length"):
+            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
+                return self._DEFAULT_MAX_LENGTH
+            return self.tokenizer.model_max_length
+        return self._DEFAULT_MAX_LENGTH
+
+    @property
+    def max_gen_toks(self) -> int:
+        return 256
+
+    @property
+    def batch_size(self):
+        return self.batch_size_per_gpu
+
+    @property
+    def device(self):
+        """device are neuron cores, but the created tensors are on CPU."""
+        return "cpu"
+
+    @property
+    def rank(self):
+        return 0
+
+    @property
+    def world_size(self):
+        return 1
+
+    def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None):
+        """ """
+        if add_special_tokens is None:
+            add_special_tokens = False or self.add_bos_token
+
+        encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens)
+
+        # left-truncate the encoded context to be at most `left_truncate_len` tokens long
+        if left_truncate_len:
+            encoding = encoding[-left_truncate_len:]
+
+        return encoding
+
+    def tok_batch_encode(
+        self,
+        strings: List[str],
+        padding_side: str = "left",
+        left_truncate_len: int = None,
+        truncation: bool = False,
+    ):
+        # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
+        old_padding_side = self.tokenizer.padding_side
+        self.tokenizer.padding_side = padding_side
+
+        add_special_tokens = False or self.add_bos_token
+
+        encoding = self.tokenizer(
+            strings,
+            truncation=truncation,
+            padding="longest",
+            return_tensors="pt",
+            add_special_tokens=add_special_tokens,
+        )
+        if left_truncate_len:
+            encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
+            encoding["attention_mask"] = encoding["attention_mask"][
+                :, -left_truncate_len:
+            ]
+        self.tokenizer.padding_side = old_padding_side
+
+        return encoding["input_ids"], encoding["attention_mask"]
+
+    def tok_decode(self, tokens):
+        return self.tokenizer.decode(tokens)
+
+    @wrap_constant_batch_size
+    def _model_call(self, input_ids: torch.Tensor):
+        """
+        get logits for the entire sequence
+
+        :param input_ids: torch.Tensor
+            A torch tensor of shape [batch, sequence_cont]
+            the size of sequence may vary from call to call
+        :return
+            A torch tensor of shape [batch, sequence, vocab] with the
+            logits returned from the model's decoder-lm head
+        """
+        _, sequence_length = input_ids.shape
+
+        with torch.inference_mode():
+            cache_ids = torch.arange(0, sequence_length, dtype=torch.int32).split(1)
+            input_ids_split = input_ids.split(1, dim=1)
+
+            return torch.concat(
+                [
+                    self.model.forward(
+                        input_ids=input_id, cache_ids=cache_id, return_dict=False
+                    )[0]
+                    for input_id, cache_id in zip(input_ids_split, cache_ids)
+                ],
+                dim=1,
+            )
+
+    def _model_generate(self, context, max_length, stop, **generation_kwargs):
+        # we require users to pass do_sample=True explicitly
+        # for non-greedy gen. This should be reevaluated when considering beam search.
+
+        with torch.inference_mode():
+            if "do_sample" not in generation_kwargs.keys():
+                generation_kwargs["do_sample"] = False
+
+            stopping_criteria = stop_sequences_criteria(
+                self.tokenizer,
+                stop + [self.tokenizer.decode([self.config.eos_token_id])],
+                1,
+                context.shape[0],
+            )
+
+            return self.model.generate(
+                input_ids=context,
+                max_length=max_length,
+                stopping_criteria=stopping_criteria,
+                pad_token_id=self.eot_token_id,
+                use_cache=True,
+                **generation_kwargs,
+            )
+
+    def _select_cont_toks(self, logits, contlen=None, inplen=None):
+        assert (
+            contlen and inplen
+        ), "Must pass input len and cont. len to select scored logits for causal LM"
+        # discard right-padding.
+        # also discard the input/context tokens. we'll only score continuations.
+        logits = logits[inplen - contlen : inplen]
+
+        return logits
+
+    def loglikelihood_rolling(self, requests):
+        loglikelihoods = []
+
+        adaptive_batch_size = None
+
+        for (string,) in tqdm([req.args for req in requests], disable=(self.rank != 0)):
+            rolling_token_windows = list(
+                map(
+                    utils.make_disjoint_window,
+                    utils.get_rolling_token_windows(
+                        token_list=self.tok_encode(string),
+                        prefix_token=self.eot_token_id,
+                        max_seq_len=self.max_length,
+                        context_len=1,
+                    ),
+                )
+            )
+
+            # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
+            rolling_token_windows = [(None,) + x for x in rolling_token_windows]
+
+            pad_amnt = 0
+            if self.world_size > 1:
+                # We pad out the external document-level iterator so the inner iterator doesn't hang
+                mytensor = torch.tensor(len(rolling_token_windows), device=self.device)
+                gathered = (
+                    self.accelerator.gather(mytensor).cpu().detach().numpy().tolist()
+                )
+
+                pad_amnt = max(gathered) - gathered[self.rank]
+                if pad_amnt > 0:
+                    rolling_token_windows += pad_amnt * [rolling_token_windows[0]]
+
+            string_nll = self._loglikelihood_tokens(
+                rolling_token_windows,
+                disable_tqdm=True,
+                override_bs=adaptive_batch_size,
+            )
+
+            if (self.world_size > 1) and (pad_amnt > 0):
+                string_nll = [x[0] for x in string_nll[:-pad_amnt]]
+            else:
+                # discard is_greedy
+                string_nll = [x[0] for x in string_nll]
+
+            string_nll = sum(string_nll)
+            loglikelihoods.append(string_nll)
+
+        return loglikelihoods
+
+    def _loglikelihood_tokens(
+        self, requests, disable_tqdm: bool = False, override_bs=None
+    ):
+        # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
+        res = []
+
+        def _collate(x):
+            # the negative sign on len(toks) sorts descending - this has a few advantages:
+            # - time estimates will always be over not underestimates, which is more useful for planning
+            # - to know the size of a batch when going through the list, you know the first one is always the batch
+            #   padded context length. this is useful to simplify the batching logic and more importantly to make
+            #   automatic adaptive batches much much easier to implement
+            # - any OOMs will happen right away rather than near the end
+
+            toks = x[1] + x[2]
+            return -len(toks), tuple(toks)
+
+        re_ord = utils.Reorderer(requests, _collate)
+
+        n_reordered_requests = len(re_ord.get_reordered())  # noqa
+        # automatic (variable) batch size detection for vectorization
+        # pull longest context sample from request
+
+        chunks = lm_eval.models.utils.chunks(
+            re_ord.get_reordered(),
+            n=self.batch_size,
+            fn=None,
+        )
+
+        for chunk in tqdm(chunks, disable=(disable_tqdm or (self.rank != 0))):
+            inps = []
+            cont_toks_list = []
+            inplens = []
+
+            conts = []  # noqa
+            encoder_attns = []  # noqa
+
+            padding_len_inp = None
+            padding_len_cont = None  # noqa
+            # because vectorizing is annoying, we first convert each (context, continuation) pair to padded
+            # tensors, then we pack them together into a batch, call the model, and then pick it all apart
+            # again because vectorizing is annoying
+
+            for _, context_enc, continuation_enc in chunk:
+                # sanity check
+                assert len(context_enc) > 0
+                assert len(continuation_enc) > 0
+                assert len(continuation_enc) <= self.max_length
+
+                # how this all works (illustrated on a causal decoder-only setup):
+                #          CTX      CONT
+                # inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by inp[:, :-1]
+                # model  \               \
+                # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
+                # cont_toks      4 5 6 7 8 9      [:, -len(continuation_enc):, :self.vocab_size] slice
+
+                # when too long to fit in context, truncate from the left
+                inp = torch.tensor(
+                    (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
+                    dtype=torch.long,
+                    device=self.device,
+                )
+                (inplen,) = inp.shape
+
+                padding_len_inp = (
+                    max(padding_len_inp, inplen)
+                    if padding_len_inp is not None
+                    else inplen
+                )
+
+                inps.append(inp)  # [1, inp_length]
+                cont_toks_list.append(continuation_enc)
+                inplens.append(inplen)
+
+            # create encoder attn mask and batched conts, if seq2seq
+            call_kwargs = {}
+            batched_inps = lm_eval.models.utils.pad_and_concat(
+                padding_len_inp, inps, padding_side="right"
+            )  # [batch, padding_len_inp]
+
+            multi_logits = F.log_softmax(
+                self._model_call(batched_inps, **call_kwargs), dim=-1
+            )  # [batch, padding_length (inp or cont), vocab]
+
+            for (cache_key, _, _), logits, inplen, cont_toks in zip(
+                chunk, multi_logits, inplens, cont_toks_list
+            ):
+                # Slice to original seq length
+                contlen = len(cont_toks)
+                # take only logits in the continuation
+                # (discard context toks if decoder-only ; discard right-padding)
+                # also discards + checks for "virtual tokens" in the causal LM's input window
+                # from prompt/prefix tuning tokens, if applicable
+                ctx_len = inplen + (logits.shape[0] - padding_len_inp)
+                logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
+                logits = logits.unsqueeze(0)  # [1, seq, vocab]
+
+                # Check if per-token argmax is exactly equal to continuation
+                greedy_tokens = logits.argmax(dim=-1)
+                cont_toks = torch.tensor(
+                    cont_toks, dtype=torch.long, device=self.device
+                ).unsqueeze(0)  # [1, seq]
+                max_equal = (greedy_tokens == cont_toks).all()
+
+                # Obtain log-probs at the corresponding continuation token indices
+                # last_token_slice = logits[:, -1, :].squeeze(0).tolist()
+                logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
+                    -1
+                )  # [1, seq]
+
+                # Answer: (log prob, is-exact-match)
+                answer = (float(logits.sum()), bool(max_equal))
+
+                res.append(answer)
+
+                self.cache_hook.add_partial("loglikelihood", cache_key, answer)
+
+        return re_ord.get_original(res)
+
+    def generate_until(self, requests):
+        res = defaultdict(list)
+        re_ords = {}
+
+        def _collate(x):
+            # the negative sign on len(toks) sorts descending - this has a few advantages:
+            # - time estimates will always be over not underestimates, which is more useful for planning
+            # - to know the size of a batch when going through the list, you know the first one is always the batch
+            #   padded context length. this is useful to simplify the batching logic and more importantly to make
+            #   automatic adaptive batches much much easier to implement
+            # - any OOMs will happen right away rather than near the end
+            toks = self.tok_encode(x[0])
+            return -len(toks), x[0]
+
+        # we group requests by their generation_kwargs,
+        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
+        # in the same batch.
+        grouper = lm_eval.models.utils.Grouper(requests, lambda x: str(x.args[1]))
+        for key, reqs in grouper.get_grouped().items():
+            # within each set of reqs for given kwargs, we reorder by token length, descending.
+            re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate)
+
+        pbar = tqdm(total=len(requests), disable=(self.rank != 0))
+
+        # for each different set of kwargs, we execute all requests, by batch.
+        for key, re_ord in re_ords.items():
+            chunks = lm_eval.models.utils.chunks(
+                re_ord.get_reordered(), n=self.batch_size
+            )
+            for chunk in tqdm(chunks, disable=self.rank != 0):
+                contexts, all_gen_kwargs = zip(*chunk)
+                # we assume all gen kwargs in the batch are the same
+                # this is safe to assume because the `grouper` object ensures it.
+                gen_kwargs = all_gen_kwargs[0]
+                # unpack our keyword arguments.
+                until = None
+                if isinstance(gen_kwargs, dict):
+                    kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
+                    if "until" in kwargs.keys():
+                        until = kwargs.pop("until")
+                        if isinstance(until, str):
+                            until = [kwargs]
+                        elif not isinstance(until, list):
+                            raise ValueError(
+                                f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
+                            )
+                else:
+                    raise ValueError(
+                        f"Expected `kwargs` to be of type `dict` but got {kwargs}"
+                    )
+                # add EOS token to stop sequences
+                eos = self.tok_decode(self.eot_token_id)
+                if not until:
+                    until = [eos]
+                else:
+                    until.append(eos)
+                if "max_gen_toks" in kwargs.keys():
+                    max_gen_toks = kwargs.pop("max_gen_toks")
+                else:
+                    max_gen_toks = self.max_gen_toks
+                # first stop sequence is used to halt generation upon encountering
+                primary_until = [until[0]]
+
+                max_ctx_len = self.max_length - max_gen_toks
+
+                # encode, pad, and truncate contexts for this batch
+                context_enc, attn_masks = self.tok_batch_encode(
+                    contexts,
+                    left_truncate_len=max_ctx_len,
+                    truncation=self.truncation,
+                )
+                context_enc = context_enc.to(self.device)
+                attn_masks = attn_masks.to(self.device)
+
+                if "max_length" not in kwargs:
+                    kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
+
+                # perform batched generation
+                cont = self._model_generate(
+                    context=context_enc,
+                    attention_mask=attn_masks,
+                    stop=primary_until,
+                    **kwargs,
+                )
+
+                cont_toks_list = cont.tolist()
+                for cont_toks, context in zip(cont_toks_list, contexts):
+                    # discard context + left-padding toks if using causal decoder-only LM
+                    cont_toks = cont_toks[context_enc.shape[1] :]
+
+                    s = self.tok_decode(cont_toks)
+
+                    # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
+                    for term in until:
+                        if len(term) > 0:
+                            # ignore '' separator,
+                            # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
+                            s = s.split(term)[0]
+
+                    res[key].append(s)
+
+                    self.cache_hook.add_partial(
+                        "generate_until", (context, gen_kwargs), s
+                    )
+                    pbar.update(1)
+            # reorder this group of results back to original unsorted form
+            res[key] = re_ord.get_original(res[key])
+
+        pbar.close()
+
+        return grouper.get_original(res)
--- a/LM-Evaluation-Harness-240310/lm_eval/models/openai_completions.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/models/openai_completions.py
+import copy
+import os
+from collections import defaultdict
+from importlib.util import find_spec
+from typing import List, Literal, Optional, Tuple
+
+from tqdm import tqdm
+
+import lm_eval.models.utils
+from lm_eval import utils
+from lm_eval.api.model import LM, TemplateLM
+from lm_eval.api.registry import register_model
+from lm_eval.models.utils import retry_on_specific_exceptions
+from lm_eval.utils import eval_logger
+
+
+def get_result(response, ctxlen: int) -> Tuple[float, bool]:
+    """Process results from OpenAI API response.
+
+    :param response: dict
+        OpenAI API Response
+    :param ctxlen: int
+        Length of context (so we can slice them away and only keep the predictions)
+    :return:
+        continuation_logprobs: np.array
+            Log probabilities of continuation tokens
+        is_greedy: bool
+            whether argmax matches given continuation exactly
+    """
+    is_greedy = True
+    logprobs = response.logprobs.token_logprobs
+    continuation_logprobs = sum(logprobs[ctxlen:])
+
+    for i in range(ctxlen, len(response.logprobs.token_logprobs)):
+        token = response.logprobs.token_logprobs[i]
+        top_tokens = response.logprobs.top_logprobs[i]
+        top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
+        if top_token != token:
+            is_greedy = False
+            break
+
+    return continuation_logprobs, is_greedy
+
+
+def oa_completion(client, chat: bool = False, **kwargs):
+    """Query OpenAI API for completion.
+
+    Retry with back-off until they respond
+    """
+    if not find_spec("openai") or not find_spec("tiktoken"):
+        raise Exception(
+            "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. "
+            "Please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`"
+        )
+    else:
+        import openai
+
+    def _exception_callback(e: Exception, sleep_time: float) -> None:
+        import traceback
+
+        traceback.print_exc()
+
+    @retry_on_specific_exceptions(
+        on_exceptions=[openai.OpenAIError],
+        max_retries=None,  # retry forever, consider changing
+        on_exception_callback=_exception_callback,
+    )
+    def completion():
+        if chat:
+            return client.chat.completions.create(**kwargs)
+        else:
+            return client.completions.create(**kwargs)
+
+    return completion()
+
+
+@register_model("openai-completions", "local-completions")
+class OpenaiCompletionsLM(TemplateLM):
+    _DEFAULT_MAX_LENGTH = 2048
+
+    def __init__(
+        self,
+        model: str,
+        base_url: str = None,
+        tokenizer: Optional[str] = None,
+        tokenizer_backend: Literal["tiktoken", "huggingface"] = "tiktoken",
+        truncate: bool = False,
+        max_gen_toks: int = 256,
+        batch_size: int = 1,
+        seed: int = 1234,
+        max_length: Optional[int] = None,
+    ) -> None:
+        """
+
+        :param engine: str
+            OpenAI API engine (e.g. gpt-3.5-turbo-instruct)
+        :param truncate: bool
+            Truncate input if too long (if False and input is too long, throw error)
+        """
+        super().__init__()
+        self.seed = seed
+        try:
+            import openai  # noqa: E401
+            import tiktoken
+        except ModuleNotFoundError:
+            raise Exception(
+                "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
+    please install these via `pip install lm-eval[openai]` or `pip install -e .\"[openai]\"`",
+            )
+        self.model = model
+        self.base_url = base_url
+        self.tokenizer_backend = tokenizer_backend
+        self.truncate = truncate
+        self._batch_size = batch_size
+        self._max_gen_toks = max_gen_toks
+        self._max_length = max_length
+
+        # if we have a local model, use HF tokenizer over tiktoken
+        if self.tokenizer_backend == "huggingface":
+            import transformers  # noqa: E401
+
+            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                tokenizer if tokenizer else self.model
+            )
+            self.vocab_size = self.tokenizer.vocab
+            self.end_of_text_token_id = self.tokenizer.eos_token
+        elif self.tokenizer_backend == "tiktoken":
+            if self.base_url:
+                eval_logger.warning(
+                    f"Passed `base_url={self.base_url}` but using Tiktoken tokenizer backend. "
+                    "Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken."
+                )
+
+            self.tokenizer = tiktoken.encoding_for_model(self.model)
+            self.vocab_size = self.tokenizer.n_vocab
+            self.end_of_text_token_id = self.tokenizer.eot_token
+        else:
+            raise ValueError(
+                f"Expected tokenizer_backend to be one of ['tiktoken', 'huggingface'] but got {self.tokenizer_backend}"
+            )
+
+        # Read from environment variable OPENAI_API_KEY
+        # Set to EMPTY for local
+        openai.api_key = os.environ["OPENAI_API_KEY"]
+        if self.base_url:
+            self.client = openai.OpenAI(base_url=self.base_url)
+        else:
+            self.client = openai.OpenAI()
+
+    @property
+    def eot_token_id(self):
+        return self.end_of_text_token_id
+
+    @property
+    def max_length(self) -> int:
+        if self._max_length:
+            return self._max_length
+        else:
+            return self._DEFAULT_MAX_LENGTH
+
+    @property
+    def max_gen_toks(self) -> int:
+        return self._max_gen_toks
+
+    @property
+    def batch_size(self) -> int:
+        return self._batch_size
+
+    @property
+    def device(self):
+        # Isn't used because we override _loglikelihood_tokens
+        raise NotImplementedError()
+
+    def tok_encode(self, string: str, **kwargs) -> List[int]:
+        return self.tokenizer.encode(string)
+
+    def tok_decode(self, tokens: List[int]) -> str:
+        return self.tokenizer.decode(tokens)
+
+    def _loglikelihood_tokens(
+        self, requests, disable_tqdm: bool = False
+    ) -> List[Tuple[float, bool]]:
+        res = []
+
+        def _collate(x):
+            # this doesn't efficiently handle last-token differences yet, but those are kinda annoying because
+            # it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations
+            # we care about, and so we need some kind of backup for when it isn't
+            toks = x[1] + x[2]
+            return -len(toks), tuple(toks)
+
+        re_ord = utils.Reorderer(requests, _collate)
+
+        for chunk in tqdm(
+            list(lm_eval.models.utils.chunks(re_ord.get_reordered(), self.batch_size)),
+            disable=disable_tqdm,
+        ):
+            inps = []
+            ctxlens = []
+            for cache_key, context_enc, continuation_enc in chunk:
+                # max_length+1 because the API takes up to 2049 tokens, including the first context token
+                inp = (context_enc + continuation_enc)[-(self.max_length + 1) :]
+                # TODO: the logic is much simpler if we just look at the length of continuation tokens
+                ctxlen = len(context_enc) - max(
+                    0, len(context_enc) + len(continuation_enc) - (self.max_length + 1)
+                )
+
+                inps.append(inp)
+                ctxlens.append(ctxlen)
+
+            response = oa_completion(
+                client=self.client,
+                model=self.model,
+                prompt=inps,
+                echo=True,
+                max_tokens=0,
+                temperature=0.0,
+                logprobs=10,
+                seed=self.seed,
+            )
+
+            for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(
+                response.choices, ctxlens, chunk
+            ):
+                answer = get_result(resp, ctxlen)
+
+                res.append(answer)
+
+                # partial caching
+                if cache_key is not None:
+                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)
+        return re_ord.get_original(res)
+
+    def generate_until(self, requests) -> List[str]:
+        if not requests:
+            return []
+        res = []
+        requests = [req.args for req in requests]
+
+        def _collate(x):
+            toks = self.tok_encode(x[0])
+            return len(toks), x[0]
+
+        re_ord = utils.Reorderer(requests, _collate)
+
+        def sameuntil_chunks(xs, size):
+            ret = []
+            lastuntil = xs[0][1]
+            for x in xs:
+                if len(ret) >= size or x[1] != lastuntil:
+                    yield ret, lastuntil
+                    ret = []
+                    lastuntil = x[1]
+                ret.append(x)
+
+            if ret:
+                yield ret, lastuntil
+
+        # todo: more intelligent batching for heterogeneous `until`
+        for chunk, request_args in tqdm(
+            list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size))
+        ):
+            inps = []
+            self._max_gen_toks = request_args.get("max_gen_toks", self.max_gen_toks)
+            for context, _ in chunk:
+                context_enc = self.tok_encode(context)
+                inp = context_enc[-(self.max_length - self.max_gen_toks) :]
+                inps.append(inp)
+
+            until = request_args.get("until", ["<|endoftext|>"])
+            request_args["temperature"] = request_args.get("temperature", 0)
+
+            response = oa_completion(
+                client=self.client,
+                model=self.model,
+                prompt=inps,
+                max_tokens=self.max_gen_toks,
+                stop=until,
+                seed=self.seed,
+                **{
+                    k: v
+                    for k, v in request_args.items()
+                    if k not in ["do_sample", "max_gen_toks"]
+                },
+            )
+            for resp, (context, args_) in zip(response.choices, chunk):
+                s = getattr(resp, "text")
+
+                until_ = until
+
+                for term in until_:
+                    if len(term) > 0:
+                        s = s.split(term)[0]
+
+                # partial caching
+                self.cache_hook.add_partial(
+                    "generate_until", (context, {"until": until_}), s
+                )
+
+                res.append(s)
+        return re_ord.get_original(res)
+
+    def _model_call(self, inps):
+        # Isn't used because we override _loglikelihood_tokens
+        raise NotImplementedError()
+
+    def _model_generate(self, context, max_length, eos_token_id):
+        # Isn't used because we override generate_until
+        raise NotImplementedError()
+
+    def loglikelihood_rolling(self, requests) -> List[float]:
+        loglikelihoods = []
+
+        for (string,) in tqdm([req.args for req in requests]):
+            rolling_token_windows = list(
+                map(
+                    utils.make_disjoint_window,
+                    utils.get_rolling_token_windows(
+                        token_list=self.tok_encode(string),
+                        prefix_token=self.eot_token_id,
+                        max_seq_len=self.max_length,
+                        context_len=1,
+                    ),
+                )
+            )
+
+            # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
+            rolling_token_windows = [(None,) + x for x in rolling_token_windows]
+
+            string_nll = self._loglikelihood_tokens(
+                rolling_token_windows,
+                disable_tqdm=True,
+            )
+
+            # discard is_greedy
+            string_nll = [x[0] for x in string_nll]
+
+            string_nll = sum(string_nll)
+            loglikelihoods.append(string_nll)
+        return loglikelihoods
+
+
+@register_model("openai-chat-completions", "local-chat-completions")
+class OpenaiChatCompletionsLM(LM):
+    def __init__(
+        self,
+        model: str = "gpt-3.5-turbo",  # GPT model or Local model using HuggingFace model paths
+        base_url: str = None,
+        truncate: bool = False,
+        **kwargs,
+    ) -> None:
+        """
+
+        :param model: str
+            Implements an OpenAI-style chat completion API for
+            accessing both OpenAI OR locally-hosted models using
+            HuggingFace Tokenizer
+            OpenAI API model (e.g. gpt-3.5-turbo)
+            using the **gen_kwargs passed on init
+        :param truncate: bool
+            Truncate input if too long (if False and input is too long, throw error)
+        """
+        super().__init__()
+        try:
+            import openai  # noqa: E401
+        except ModuleNotFoundError:
+            raise Exception(
+                "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
+    please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
+            )
+        self.model = model
+        self.base_url = base_url
+        self.truncate = truncate
+
+        # Read from environment variable OPENAI_API_KEY
+        # Set to EMPTY for local
+        if self.base_url:
+            self.client = openai.OpenAI(base_url=self.base_url)
+        else:
+            self.client = openai.OpenAI()  # openai.AsyncOpenAI()
+
+    @property
+    def max_length(self) -> int:
+        # Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
+        return 2048
+
+    @property
+    def max_gen_toks(self) -> int:
+        return 256
+
+    @property
+    def batch_size(self):
+        # Isn't used because we override _loglikelihood_tokens
+        raise NotImplementedError()
+
+    @property
+    def device(self):
+        # Isn't used because we override _loglikelihood_tokens
+        raise NotImplementedError()
+
+    def generate_until(self, requests) -> List[str]:
+        res = defaultdict(list)
+        re_ords = {}
+
+        # we group requests by their generation_kwargs,
+        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
+        # in the same batch.
+        grouper = lm_eval.models.utils.Grouper(requests, lambda x: str(x.args[1]))
+        for key, reqs in grouper.get_grouped().items():
+            # within each set of reqs for given kwargs, we reorder by token length, descending.
+            re_ords[key] = utils.Reorderer(
+                [req.args for req in reqs], lambda x: (-len(x[0]), x[0])
+            )
+
+        pbar = tqdm(total=len(requests), disable=(self.rank != 0))
+        for key, re_ord in re_ords.items():
+            # n needs to be 1 because messages in
+            # chat completion are not batch but
+            # is regarded as a single conversation.
+            chunks = lm_eval.models.utils.chunks(re_ord.get_reordered(), n=1)
+            for chunk in chunks:
+                contexts, all_gen_kwargs = zip(*chunk)
+                inps = [{"role": "user", "content": context} for context in contexts]
+
+                gen_kwargs = all_gen_kwargs[0]
+                until = None
+                if isinstance(kwargs := copy.deepcopy(gen_kwargs), dict):
+                    if "do_sample" in kwargs.keys():
+                        kwargs.pop("do_sample")
+                    if "until" in kwargs.keys():
+                        until = kwargs.pop("until")
+                        if isinstance(until, str):
+                            until = [kwargs]
+                        elif not isinstance(until, list):
+                            raise ValueError(
+                                f"Expected repr(kwargs['until']) to be of type Union[str, list] but got {until}"
+                            )
+                        kwargs["stop"] = until
+                    kwargs["max_tokens"] = kwargs.pop("max_gen_toks", self.max_gen_toks)
+                else:
+                    raise ValueError(
+                        f"Expected repr(kwargs) to be of type repr(dict) but got {kwargs}"
+                    )
+
+                response = oa_completion(
+                    client=self.client,
+                    chat=True,
+                    messages=inps,
+                    model=self.model,
+                    **kwargs,
+                )
+
+                for resp, (context, args_) in zip(response.choices, chunk):
+                    s = resp.message.content
+
+                    if until is not None:
+                        for term in until:
+                            if len(term) > 0:
+                                s = s.split(term)[0]
+
+                    res[key].append(s)
+
+                    self.cache_hook.add_partial(
+                        "generate_until", (context, {"until": until}), s
+                    )
+                    pbar.update(1)
+            # reorder this group of results back to original unsorted form
+            res[key] = re_ord.get_original(res[key])
+
+        pbar.close()
+
+        return grouper.get_original(res)
+
+    def loglikelihood(self, requests):
+        raise NotImplementedError("No support for logits.")
+
+    def loglikelihood_rolling(self, requests):
+        raise NotImplementedError("No support for logits.")
--- a/LM-Evaluation-Harness-240310/lm_eval/models/optimum_lm.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/models/optimum_lm.py
+from importlib.util import find_spec
+from pathlib import Path
+
+from lm_eval.api.registry import register_model
+from lm_eval.models.huggingface import HFLM
+
+
+@register_model("openvino")
+class OptimumLM(HFLM):
+    """
+    Optimum Intel provides a simple interface to optimize Transformer models and convert them to \
+    OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \
+    Intel® architectures using OpenVINO™ runtime.
+    """
+
+    def __init__(
+        self,
+        device="cpu",
+        **kwargs,
+    ) -> None:
+        if "backend" in kwargs:
+            # optimum currently only supports causal models
+            assert (
+                kwargs["backend"] == "causal"
+            ), "Currently, only OVModelForCausalLM is supported."
+
+        self.openvino_device = device
+
+        super().__init__(
+            device=self.openvino_device,
+            backend=kwargs.pop("backend", "causal"),
+            **kwargs,
+        )
+
+    def _create_model(
+        self,
+        pretrained: str,
+        revision="main",
+        dtype="auto",
+        trust_remote_code=False,
+        **kwargs,
+    ) -> None:
+        if not find_spec("optimum"):
+            raise Exception(
+                "package `optimum` is not installed. Please install it via `pip install optimum[openvino]`"
+            )
+        else:
+            from optimum.intel.openvino import OVModelForCausalLM
+
+        model_kwargs = kwargs if kwargs else {}
+        model_file = Path(pretrained) / "openvino_model.xml"
+        if model_file.exists():
+            export = False
+        else:
+            export = True
+        kwargs["ov_config"] = {
+            "PERFORMANCE_HINT": "LATENCY",
+            "NUM_STREAMS": "1",
+            "CACHE_DIR": "",
+        }
+
+        self._model = OVModelForCausalLM.from_pretrained(
+            pretrained,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            export=export,
+            device=self.openvino_device.upper(),
+            **model_kwargs,
+        )
--- a/LM-Evaluation-Harness-240310/lm_eval/models/textsynth.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/models/textsynth.py
+""" TextSynth API
+Implementation provided by Fabrice Bellard:
+    https://github.com/EleutherAI/lm-evaluation-harness/issues/295
+
+In order to use the API, you must have a valid TextSynth account and
+enough credits.
+
+Example usage:
+
+    python main.py --model textsynth --model_args engine=gptj_6B --no_cache --tasks piqa
+
+Homepage: https://textsynth.com/index.html
+"""
+import logging
+import os
+
+import requests as _requests
+from tqdm import tqdm
+
+from lm_eval.api.model import LM
+from lm_eval.api.registry import register_model
+from lm_eval.models.utils import retry_on_specific_exceptions
+
+
+logger = logging.getLogger(__name__)
+
+
+def textsynth_completion(**kwargs):
+    """Query TextSynth API for completion.
+    Retry with back-off until they respond.
+    """
+
+    def _exception_callback(e: Exception, sleep_time: float) -> None:
+        import traceback
+
+        traceback.print_exc()
+
+    @retry_on_specific_exceptions(
+        on_exceptions=[_requests.exceptions.RequestException],
+        max_retries=None,  # retry forever, consider changing
+        on_exception_callback=_exception_callback,
+    )
+    def completion():
+        return _requests.post(**kwargs)
+
+    return completion()
+
+
+@register_model("textsynth")
+class TextSynthLM(LM):
+    def __init__(self, engine, truncate: bool = False, **kwargs) -> None:
+        """
+        :param engine: str
+            TextSynth API engine (e.g. `gptj_6B`)
+        :param truncate: bool
+            Truncate input if too long (if False and input is too long, throw error)
+        """
+        super().__init__()
+
+        self.engine = engine
+        self.truncate = truncate
+        self.api_url = "https://api.textsynth.com"
+        # Read from environment variable TEXTSYNTH_API_SECRET_KEY
+        self.api_key = os.environ["TEXTSYNTH_API_SECRET_KEY"]
+
+    @property
+    def eot_token_id(self):
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
+        raise NotImplementedError()
+
+    @property
+    def max_length(self) -> int:
+        # NOTE: Turn on truncation to avoid errors on long inputs.
+        return 2048
+
+    @property
+    def max_gen_toks(self) -> int:
+        return 256
+
+    @property
+    def batch_size(self):
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
+        raise NotImplementedError()
+
+    @property
+    def device(self):
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
+        raise NotImplementedError()
+
+    def tok_encode(self, string: str):
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
+        raise NotImplementedError()
+
+    def tok_decode(self, tokens):
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
+        raise NotImplementedError()
+
+    def loglikelihood(self, requests):
+        res = []
+        for context, continuation in tqdm(requests):
+            response = textsynth_completion(
+                url=self.api_url + "/v1/engines/" + self.engine + "/logprob",
+                headers={"Authorization": "Bearer " + self.api_key},
+                json={"context": context, "continuation": continuation},
+            )
+            resp = response.json()
+            if "logprob" in resp:
+                logprob = resp["logprob"]
+                is_greedy = resp["is_greedy"]
+                res.append((logprob, is_greedy))
+
+                self.cache_hook.add_partial(
+                    "loglikelihood", (context, continuation), (logprob, is_greedy)
+                )
+            else:
+                logger.error(
+                    f"The following response does not contain `logprobs`. Got:\n{resp}"
+                )
+                assert False
+        return res
+
+    def loglikelihood_rolling(self, requests):
+        # TODO: The TextSynth API does not support tokenized inputs so we cannot
+        # manually partition long contexts into smaller rolling windows as
+        # done for other models derived from `BaseLM`. Override this method
+        # with a windowing scheme that works for direct string inputs.
+        raise NotImplementedError(
+            "`loglikelihood_rolling` is currently not supported due to lack of "
+            "input tokenization support from TextSynth."
+        )
+
+    def generate_until(self, requests):
+        if not requests:
+            return []
+
+        res = []
+        for request in tqdm(requests):
+            inp = request[0]
+            request_args = request[1]
+            until = request_args["until"]
+            response = textsynth_completion(
+                url=self.api_url + "/v1/engines/" + self.engine + "/completions",
+                headers={"Authorization": "Bearer " + self.api_key},
+                json={
+                    "prompt": inp,
+                    "max_tokens": self.max_gen_toks,
+                    "top_k": 1,
+                    "stop": until,
+                },
+            )
+            resp = response.json()
+            if "text" in resp:
+                s = resp["text"]
+                res.append(s)
+
+                self.cache_hook.add_partial("generate_until", (inp, request_args), s)
+            else:
+                logger.error(
+                    "The following response does not contain generated `text`. "
+                    "Got:\n{resp}"
+                )
+                assert False
+        return res
+
+    def _model_call(self, inps):
+        # Isn't used because we override _loglikelihood_tokens
+        raise NotImplementedError()
+
+    def _model_generate(self, context, max_length, eos_token_id):
+        # Isn't used because we override generate_until
+        raise NotImplementedError()
--- a/LM-Evaluation-Harness-240310/lm_eval/models/utils.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/models/utils.py
+import collections
+import fnmatch
+import gc
+import itertools
+import time
+from functools import wraps
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+)
+
+import torch
+import transformers
+
+from lm_eval.utils import eval_logger
+
+
+def chunks(iter, n: int = 0, fn=None):
+    """
+    Divides an iterable into chunks of specified size or based on a given function.
+    Useful for batching
+
+    Parameters:
+    - iter: The input iterable to be divided into chunks.
+    - n: An integer representing the size of each chunk. Default is 0.
+    - fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.
+
+    Returns:
+    An iterator that yields chunks of the input iterable.
+
+    Example usage:
+    ```
+    data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+    for chunk in chunks(data, 3):
+        print(chunk)
+    ```
+    Output:
+    ```
+    [1, 2, 3]
+    [4, 5, 6]
+    [7, 8, 9]
+    [10]
+    ```
+    """
+    arr = []
+    for i, x in enumerate(iter):
+        arr.append(x)
+        if len(arr) == (fn(i, iter) if fn else n):
+            yield arr
+            arr = []
+
+    if arr:
+        yield arr
+
+
+class MultiChoice:
+    def __init__(self, choices) -> None:
+        self.choices = choices
+
+    # Simple wildcard support (linux filename patterns)
+    def __contains__(self, values) -> bool:
+        for value in values.split(","):
+            if len(fnmatch.filter(self.choices, value)) == 0:
+                eval_logger.info("Available tasks to choose:")
+                for choice in self.choices:
+                    eval_logger.info(f"  - {choice}")
+                raise ValueError("'{}' is not in task list".format(value))
+        return True
+
+    def __iter__(self) -> Iterator:
+        for choice in self.choices:
+            yield choice
+
+
+class Grouper:
+    """
+    takes an array `arr` and function `fn` and returns a dictionary
+    with keys fn(ob) for each ob in `arr` and with values `self.arr[key]` a list of all
+    objects in `arr` satisfying `key == fn(ob)`.
+    """
+
+    def __init__(self, arr, fn) -> None:
+        # self.orig_arr = arr
+        self.size = len(arr)
+        arr = list(enumerate(arr))
+
+        def group_return_dict(arr, fn):
+            res = collections.defaultdict(list)
+
+            for ob in arr:
+                res[fn(ob)].append(ob)
+            return res
+
+        arr = group_return_dict(arr, lambda x: fn(x[1]))
+
+        # self.arr has format Dict[Tuple[int, <entry from orig. arr>]]
+        self.arr = arr
+        self._grouped = None
+
+    def get_grouped(self):
+        # return the contents but not indices for our grouped dict.
+        if self._grouped:
+            return self._grouped
+        grouped = {}
+        for key in self.arr.keys():
+            # drop the index from each element of self.arr
+            grouped[key] = [y[1] for y in self.arr[key]]
+        self._grouped = grouped
+        return grouped
+
+    def get_original(self, grouped_dict):
+        # take in a grouped dictionary with e.g. results for each key listed
+        # in the same order as the instances in `self.arr`, and
+        # return the results in the same (single list) order as `self.orig_arr`.
+        res = [None] * self.size
+        cov = [False] * self.size
+        # orig = [None] * self.size
+
+        assert grouped_dict.keys() == self.arr.keys()
+
+        for key in grouped_dict.keys():
+            for (ind, _), v in zip(self.arr[key], grouped_dict[key]):
+                res[ind] = v
+                cov[ind] = True
+                # orig[ind] = _
+
+        assert all(cov)
+        # assert orig == self.orig_arr
+
+        return res
+
+
+def pad_and_concat(
+    max_length: int,
+    tensors: List[torch.Tensor],
+    padding_side: Literal["right", "left"] = "right",
+):
+    """
+    Method for padding a list of tensors given the maximum tensor
+    length in the batch. Used for batching inputs and continuations in
+    seq2seq models.
+    """
+    assert (
+        padding_side == "left" or padding_side == "right"
+    ), f"Unrecognized padding type: '{padding_side}' not 'left' or 'right'"
+
+    for i, tensor in enumerate(tensors):
+        if len(tensor.shape) == 2:
+            tensor = tensor.squeeze(0)  # squeeze, in case passed [1, seq] size
+        tensor_len = tensor.shape[0]
+        if tensor_len < max_length:
+            if padding_side == "right":
+                # right-pad
+                tensors[i] = torch.cat(
+                    [
+                        tensor,  # [seq]
+                        torch.zeros(
+                            max_length - tensor_len,
+                            dtype=torch.long,
+                            device=tensor.device,
+                        ),  # [padding_length - seq]
+                    ],
+                    dim=0,
+                ).unsqueeze(0)
+            else:
+                # left-pad
+                tensors[i] = torch.cat(
+                    [
+                        torch.zeros(
+                            max_length - tensor_len,
+                            dtype=torch.long,
+                            device=tensor.device,
+                        ),  # [padding_length - seq]
+                        tensor,  # [seq]
+                    ],
+                    dim=0,
+                ).unsqueeze(0)
+        else:
+            tensors[i] = tensor.unsqueeze(0)
+
+    return torch.cat(tensors, dim=0)
+
+
+def clear_torch_cache() -> None:
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+def get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
+    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
+    if isinstance(dtype, str) and dtype != "auto":
+        # Convert `str` args torch dtype: `float16` -> `torch.float16`
+        _torch_dtype = getattr(torch, dtype)
+    else:
+        _torch_dtype = dtype
+    return _torch_dtype
+
+
+class MultiTokenEOSCriteria(transformers.StoppingCriteria):
+    """Criteria to stop on the specified multi-token sequence."""
+
+    def __init__(
+        self,
+        sequence: str,
+        tokenizer: transformers.PreTrainedTokenizer,
+        initial_decoder_input_length: int,
+        batch_size: int,
+    ) -> None:
+        self.initial_decoder_input_length = initial_decoder_input_length
+        self.done_tracker = [False] * batch_size
+        self.sequence = sequence
+        self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
+        # print(sequence, self.sequence_ids)
+        # we look back for 2 more tokens than it takes to encode our stop sequence
+        # because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']`
+        # and we don't want to mistakenly not stop a generation because our
+        # (string) stop sequence was output in a different tokenization
+
+        # NOTE: there is a minor danger that this will end up looking back 2 tokens into the past, into the inputs to the model,
+        # and stopping generation immediately as a result. With only 2 extra tokens of lookback, this risk is minimized
+        # Additionally, in lookback_ids_batch we should prevent ever looking back into the inputs as described.
+        self.sequence_id_len = len(self.sequence_ids) + 2
+        self.tokenizer = tokenizer
+
+    def __call__(self, input_ids, scores, **kwargs) -> bool:
+        # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
+        lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :]
+
+        lookback_ids_batch = lookback_ids_batch[:, -self.sequence_id_len :]
+
+        lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
+
+        for i, done in enumerate(self.done_tracker):
+            if not done:
+                self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
+        return False not in self.done_tracker
+
+
+def stop_sequences_criteria(
+    tokenizer: transformers.PreTrainedTokenizer,
+    stop_sequences: List[str],
+    initial_decoder_input_length: int,
+    batch_size: int,
+) -> transformers.StoppingCriteriaList:
+    return transformers.StoppingCriteriaList(
+        [
+            *[
+                MultiTokenEOSCriteria(
+                    sequence, tokenizer, initial_decoder_input_length, batch_size
+                )
+                for sequence in stop_sequences
+            ],
+        ]
+    )
+
+
+def undistribute(iterable):
+    """
+    Undoes https://more-itertools.readthedocs.io/en/stable/api.html#more_itertools.distribute .
+
+    Re-interleaves results that have been split using more_itertools.distribute:
+        >>> group_1, group_2 = distribute(2, [1, 2, 3, 4, 5, 6])
+        >>> list(group_1)
+        [1, 3, 5]
+        >>> list(group_2)
+        [2, 4, 6]
+        >>> undistribute([group_1, group_2])
+        [1, 2, 3, 4, 5, 6]
+
+    Handles non-uniform component lengths:
+
+        >>> children = distribute(3, [1, 2, 3, 4, 5, 6, 7])
+        >>> [list(c) for c in children]
+        [[1, 4, 7], [2, 5], [3, 6]]
+        >>> undistribute(children)
+        [1, 2, 3, 4, 5, 6, 7]
+
+    Also handles when some iterables are empty:
+
+        >>> children = distribute(5, [1, 2, 3])
+        >>> [list(c) for c in children]
+        [[1], [2], [3], [], []]
+        >>> undistribute(children)
+        [1, 2, 3]
+
+    """
+
+    return [
+        x
+        for x in itertools.chain.from_iterable(
+            itertools.zip_longest(*[list(x) for x in iterable])
+        )
+        if x is not None
+    ]
+
+
+def retry_on_specific_exceptions(
+    on_exceptions: List[Type[Exception]],
+    max_retries: Optional[int] = None,
+    backoff_time: float = 3.0,
+    backoff_multiplier: float = 1.5,
+    on_exception_callback: Optional[Callable[[Exception, float], Any]] = None,
+):
+    """Retry on an LLM Provider's rate limit error with exponential backoff
+    For example, to use for OpenAI, do the following:
+    ```
+    from openai import RateLimitError
+
+    # Recommend specifying max_retries to avoid infinite loops!
+    @retry_on_specific_exceptions([RateLimitError], max_retries=3)
+    def completion(...):
+        # Wrap OpenAI completion function here
+        ...
+    ```
+    """
+
+    def decorator(func: Callable):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            sleep_time = backoff_time
+            attempt = 0
+            while max_retries is None or attempt < max_retries:
+                try:
+                    return func(*args, **kwargs)
+                except tuple(on_exceptions) as e:
+                    if on_exception_callback is not None:
+                        on_exception_callback(e, sleep_time)
+                    time.sleep(sleep_time)
+                    sleep_time *= backoff_multiplier
+                    attempt += 1
+
+        return wrapper
+
+    return decorator
+
+
+class Collator:
+    """
+    A class for reordering and batching elements of an array.
+
+    This class allows for sorting an array based on a provided sorting function, grouping elements based on a grouping function, and generating batches from the sorted and grouped data.
+
+    Objects of this class have the group_by attribute which determines the method for grouping
+    the data while batching it. Three options include "gen_kwargs", "contexts", or None:
+        If group_by == "gen_kwargs" then requests will be grouped by gen_kwargs
+        If group_by == "contexts" then requests will be grouped by context + cont[:-1]
+        If None then requests will just be reordered by length descending.
+    """
+
+    def __init__(
+        self,
+        arr: List,
+        sort_fn: Callable = lambda x: x,
+        group_fn: Callable = lambda x: x[1],
+        group_by: Union[Literal["gen_kwargs", "contexts"], None] = None,
+    ) -> None:
+        self._group_by = group_by
+        # 0 indices are enumerated indices. Apply functions to original arr.
+        self._sort_fn = lambda x: sort_fn(x[1])
+        self._group_fn = lambda x: group_fn(x[1])
+        self._reorder_indices: List = []
+        self._size = len(arr)
+        self._arr_with_indices: Union[Dict, Tuple[Tuple[int, Any], ...]] = tuple(
+            enumerate(arr)
+        )  # [indices, (arr)]
+        if self._group_by == "contexts":
+            self._group_by_context()
+        elif self._group_by == "gen_kwargs":
+            self._group_by_index()
+
+    def _group_by_index(self) -> None:
+        """Group the elements of a list based on their indices."""
+        self._arr_with_indices = self.group(
+            self._arr_with_indices, fn=self._group_fn, group_by="gen_kwargs"
+        )
+
+    def _group_by_context(self) -> None:
+        """Group the array with indices by context."""
+        self._arr_with_indices = self.group(
+            self._arr_with_indices, fn=self._group_fn, group_by="contexts"
+        )
+
+    def get_batched(self, n: int = 1, batch_fn: Optional[Callable] = None) -> Iterator:
+        """
+        Generates and yields batches from the reordered array. The method of grouping and batching
+        depends on the parameter `group_by`.
+        If `group_by` is set to "gen_kwargs", it will batch the
+        re-ordered values with same gen_kwargs for each batch.
+        If `group_by` is "contexts", it caches the requests by context before batching.
+        If `group_by` is neither "gen_kwargs" nor "contexts", it yields the reordered array
+
+        Parameters:
+        - n (int): The size of each batch. Defaults to 1.
+        - batch_fn ([Callable[[int, Iterable], int]] | None): A function to determine the size of
+          each batch. Optional, defaults to None.
+
+        Returns:
+        Iterator: An iterator over batches of reordered elements grouped as per the `group_by`
+                  attribute.
+
+        Yields:
+        List of batched elements according to the `group_by` attribute.
+        """
+        if self._group_by == "gen_kwargs":
+            for (
+                key,
+                values,
+            ) in self._arr_with_indices.items():  # type: ignore
+                values = self._reorder(values)
+                batch = self.get_chunks(values, n=n, fn=batch_fn)
+                yield from batch
+        elif self._group_by == "contexts":
+            # Get one sample from each key
+            values = self._reorder(
+                [value[0] for value in self._arr_with_indices.values()]
+            )
+            batch = self.get_chunks(values, n=n, fn=batch_fn)
+            yield from batch
+        else:
+            values = self._reorder(self._arr_with_indices)  # type: ignore
+            batch = self.get_chunks(values, n=n, fn=batch_fn)
+            yield from batch
+
+    def get_cache(
+        self,
+        req_str: Tuple[str, str] = None,
+        cxt_toks: List[int] = None,
+        cont_toks: List[int] = None,
+        logits: torch.Tensor = None,
+    ) -> Iterator[Tuple[Tuple[str, str], List[int], torch.Tensor]]:
+        """
+        Retrieves cached single-token continuations and their associated arguments, updating indices as necessary.
+
+        The behavior of this function varies depending on how the `group_by` attribute is set:
+
+        - When `group_by` is "contexts":
+            The function identifies single-token continuations by checking for keys that equate to
+            [context+continuation][-1] and logs the indices for re-ordering.
+            In this mode, this function can work in two scenarios:
+
+            1. Cache Hit - Single Match:
+                If a single matching context-continuation pair is found in the cache,
+                the function yields the original arguments.
+
+            2. Cache Hit - Multiple Matches:
+                If multiple matching context-continuation pairs are found in the cache,
+                the function expands the logits batch dimension to match the number of cache hits.
+                It updates the original requests and continuation tokens.
+
+        - When `group_by` is not set to "contexts":
+            This method yields the original arguments, logits and continuation tokens,
+            without checking for one-token continuations.
+
+        Parameters:
+        - req_str (tuple[str, str]): Original strings used for CachingLM.
+        - cxt_toks (list[int]): Full context tokens used for lookup.
+        - cont_toks (list[int]): Continuation tokens for which logits were generated.
+        - logits (torch.Tensor [1, seq_length, vocab_size]): Logits generated by the model given context and continuation keys.
+
+        Yields:
+        - Iterator:
+            - req_str (tuple[str, str]): strings used for CachingLM.
+            - cont_toks (list[int]) : continuation tokens.
+            - logits (torch.Tensor [1, seq_length, vocab_size]): The original logits (repeated cache hit times)
+        """
+        if self._group_by == "contexts":
+            cache_hit: List[
+                Tuple[int, Tuple[Tuple[str, str], List[int], List[int]]]
+            ] = self._arr_with_indices.pop(tuple(cxt_toks + cont_toks[:-1]))
+            if (cache_size := len(cache_hit)) == 1:
+                self._reorder_indices.extend(x[0] for x in cache_hit)
+                yield req_str, cont_toks, logits
+            else:
+                # If we have matching requests then expand the batch dimension (no-op) and
+                # yield each along with its corresponding args.
+                multilogits = logits.expand(cache_size, -1, -1).chunk(cache_size)
+                indices, req_str, cont_toks = zip(
+                    *[(x[0], x[1][0], x[-1][-1]) for x in cache_hit]
+                )
+                self._reorder_indices.extend(indices)
+                for c_key, cont_tok, logit in zip(req_str, cont_toks, multilogits):
+                    yield c_key, cont_tok, logit
+        else:
+            yield req_str, cont_toks, logits
+
+    def _reorder(self, arr: Union[List, Tuple[Tuple[int, Any], ...]]) -> Iterator:
+        """
+        Reorders the elements in the array based on the sorting function.
+
+        Parameters:
+        - arr (list | tuple[tuple[int, Any], ...]]): The array or iterable to be reordered.
+
+        Yields:
+            Iterator
+        """
+        arr = sorted(arr, key=self._sort_fn)
+        if not self._group_by == "contexts":
+            # If grouped by contexts then indices will be set in get_cache()
+            self._reorder_indices.extend([x[0] for x in arr])
+        yield from [x[1] for x in arr]
+
+    def get_original(self, newarr: List) -> List:
+        """
+        Restores the original order of elements from the reordered list.
+
+        Parameters:
+        - newarr (list): The reordered array.
+
+        Returns:
+        list: The array with elements restored to their original order.
+        """
+        res = [None] * self._size
+        cov = [False] * self._size
+
+        for ind, v in zip(self._reorder_indices, newarr):
+            res[ind] = v
+            cov[ind] = True
+
+        assert all(cov)
+
+        return res
+
+    def __len__(self):
+        return self._size
+
+    @staticmethod
+    def group(
+        arr: Iterable,
+        fn: Callable,
+        group_by: Literal["gen_kwargs", "contexts"] = "gen_kwargs",
+    ) -> dict:
+        """
+        Groups elements of an iterable based on a provided function.
+
+
+        The `group_by` parameter determines the method of grouping.
+        If `group_by` is "contexts", the elements are grouped by [context + cont][:-1].
+        If `group_by` is "gen_kwargs", the elements are grouped based on the gen_kwargs dict.
+
+        Parameters:
+        - arr (Iterable): The iterable to be grouped.
+        - fn (Callable): The function to determine the grouping.
+        - values (bool): If True, returns the values of the group. Defaults to False.
+
+        Returns:
+        Iterator: An iterable of grouped elements.
+        """
+        res = collections.defaultdict(list)
+        for ob in arr:
+            # where ob == [context + cont]
+            if group_by == "contexts":
+                res[tuple(fn(ob))].append(ob)
+            else:
+                try:
+                    hashable_dict = tuple(
+                        (
+                            key,
+                            tuple(value)
+                            if isinstance(value, collections.abc.Iterable)
+                            else value,
+                        )
+                        for key, value in sorted(fn(ob).items())
+                    )
+                    res[hashable_dict].append(ob)
+                except (TypeError, AttributeError):
+                    res[tuple(fn(ob))].append(ob)
+        return res
+
+    @staticmethod
+    def get_chunks(_iter, n: int = 0, fn=None):
+        """
+        Divides an iterable into chunks of specified size or based on a given function.
+        Useful for batching
+
+        Parameters:
+        - iter: The input iterable to be divided into chunks.
+        - n: An integer representing the size of each chunk. Default is 0.
+        - fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.
+
+        Returns:
+        An iterator that yields chunks of the input iterable.
+
+        Example usage:
+        ```
+        data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        for chunk in chunks(data, 3):
+            print(chunk)
+        ```
+        Output:
+        ```
+        [1, 2, 3]
+        [4, 5, 6]
+        [7, 8, 9]
+        [10]
+        ```
+        """
+        arr = []
+        _iter = tuple(_iter)
+        for i, x in enumerate(_iter):
+            arr.append(x)
+            if len(arr) == (fn(i, _iter) if fn else n):
+                yield arr
+                arr = []
+
+        if arr:
+            yield arr
--- a/LM-Evaluation-Harness-240310/lm_eval/models/vllm_causallms.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/models/vllm_causallms.py
+import copy
+from importlib.metadata import version
+from importlib.util import find_spec
+from typing import List, Literal, Optional, Tuple, Union
+
+from more_itertools import distribute
+from packaging.version import parse as parse_version
+from tqdm import tqdm
+
+from lm_eval.api.instance import Instance
+from lm_eval.api.model import TemplateLM
+from lm_eval.api.registry import register_model
+from lm_eval.models.utils import Collator, undistribute
+from lm_eval.utils import (
+    eval_logger,
+    get_rolling_token_windows,
+    make_disjoint_window,
+)
+
+
+try:
+    import ray
+    from vllm import LLM, SamplingParams
+    from vllm.transformers_utils.tokenizer import get_tokenizer
+except ModuleNotFoundError:
+    pass
+
+eval_logger = eval_logger
+
+
+@register_model("vllm")
+class VLLM(TemplateLM):
+    _DEFAULT_MAX_LENGTH = 2048
+
+    def __init__(
+        self,
+        pretrained="gpt2",
+        dtype: Literal["float16", "bfloat16", "float32", "auto"] = "auto",
+        revision: Optional[str] = None,
+        trust_remote_code: Optional[bool] = False,
+        tokenizer: Optional[str] = None,
+        tokenizer_mode: Literal["auto", "slow"] = "auto",
+        tokenizer_revision: Optional[str] = None,
+        add_bos_token: Optional[bool] = False,
+        tensor_parallel_size: int = 1,
+        quantization: Optional[str] = None,
+        max_gen_toks: int = 256,
+        swap_space: int = 4,
+        batch_size: Union[str, int] = 1,
+        max_batch_size=None,
+        max_length: int = None,
+        max_model_len: int = None,
+        seed: int = 1234,
+        gpu_memory_utilization: float = 0.9,
+        device: str = "cuda",
+        data_parallel_size: int = 1,
+        **kwargs,
+    ):
+        super().__init__()
+
+        if not find_spec("vllm"):
+            raise Exception(
+                "attempted to use 'vllm' LM type, but package `vllm` is not installed. "
+                "Please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
+            )
+
+        assert "cuda" in device or device is None, "vLLM only supports CUDA"
+        assert (
+            max_length is None or max_model_len is None
+        ), "Either max_length or max_model_len may be provided, but not both"
+
+        self._max_length = max_model_len if max_model_len is not None else max_length
+        self.tensor_parallel_size = int(tensor_parallel_size)
+        self.data_parallel_size = int(data_parallel_size)
+        self.model_args = {
+            "model": pretrained,
+            "gpu_memory_utilization": float(gpu_memory_utilization),
+            "revision": revision,
+            "dtype": dtype,
+            "tokenizer": tokenizer,
+            "tokenizer_mode": tokenizer_mode,
+            "tokenizer_revision": tokenizer_revision,
+            "trust_remote_code": trust_remote_code,
+            "tensor_parallel_size": int(tensor_parallel_size),
+            "max_model_len": int(self._max_length) if self._max_length else None,
+            "swap_space": int(swap_space),
+            "quantization": quantization,
+            "seed": int(seed),
+        }
+        self.model_args.update(kwargs)
+        self.batch_size = (
+            "auto"
+            if isinstance(batch_size, str) and "auto" in batch_size
+            else batch_size
+        )
+        if self.data_parallel_size <= 1:
+            self.model = LLM(**self.model_args)
+        else:
+            assert parse_version(version("vllm")) < parse_version(
+                "0.3.3"
+            ), "data_parallel is only compatible with vllm < v0.3.3."
+            eval_logger.warning(
+                "You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
+            )
+            self.model_args["worker_use_ray"] = True
+            self.batch_size = "auto"
+            eval_logger.info("Manual batching is not compatible with data parallelism.")
+
+            from transformers import AutoConfig
+
+            self._config = AutoConfig.from_pretrained(
+                pretrained, trust_remote_code=trust_remote_code, revision=revision
+            )
+        self.tokenizer = get_tokenizer(
+            tokenizer if tokenizer else pretrained,
+            tokenizer_mode=tokenizer_mode,
+            trust_remote_code=trust_remote_code,
+            tokenizer_revision=tokenizer_revision,
+        )
+        self.add_bos_token = add_bos_token
+
+        self._max_gen_toks = max_gen_toks
+
+    @property
+    def eot_token_id(self):
+        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
+        return self.tokenizer.eos_token_id
+
+    @property
+    def max_length(self):
+        if self._max_length:  # if max length manually set, return it
+            return self._max_length
+        if self.data_parallel_size <= 1:
+            return self.model.llm_engine.model_config.max_model_len
+        else:
+            seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
+            for attr in seqlen_config_attrs:
+                if hasattr(self._config, attr):
+                    return getattr(self._config, attr)
+            if hasattr(self.tokenizer, "model_max_length"):
+                if self.tokenizer.model_max_length == 1000000000000000019884624838656:
+                    return self._DEFAULT_MAX_LENGTH
+                return self.tokenizer.model_max_length
+            return self._DEFAULT_MAX_LENGTH
+
+    @property
+    def max_gen_toks(self):
+        return self._max_gen_toks
+
+    def tok_encode(
+        self,
+        string: str,
+        left_truncate_len=None,
+        add_special_tokens=None,
+        truncation=False,
+    ):
+        """ """
+        if not add_special_tokens:
+            add_special_tokens = False or self.add_bos_token
+        encoding = self.tokenizer.encode(
+            string, add_special_tokens=add_special_tokens, truncation=truncation
+        )
+
+        # left-truncate the encoded context to be at most `left_truncate_len` tokens long
+        if left_truncate_len:
+            encoding = encoding[-left_truncate_len:]
+
+        return encoding
+
+    def _model_generate(
+        self,
+        requests: List[List[int]] = None,
+        generate: bool = False,
+        max_tokens: int = None,
+        stop: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        if generate:
+            kwargs = self.modify_gen_kwargs(kwargs)
+            sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
+        else:
+            sampling_params = SamplingParams(
+                temperature=0, prompt_logprobs=1, max_tokens=1
+            )
+        if self.data_parallel_size > 1:
+            # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
+            # also seems to only work with decorator and not with ray.remote() fn
+            # see https://github.com/vllm-project/vllm/issues/973
+            # note: this has changed on 0.3.3, and it only works now if num_gpus are set.
+            # but then tensor_parallel breaks
+            @ray.remote
+            def run_inference_one_model(
+                model_args: dict, sampling_params, requests: List[List[int]]
+            ):
+                llm = LLM(**model_args)
+                return llm.generate(
+                    prompt_token_ids=requests, sampling_params=sampling_params
+                )
+
+            # dispatch requests to all self.data_parallel_size workers, in interleaved fashion
+            # interleaved important to balance context lengths across workers
+            requests = [list(x) for x in distribute(self.data_parallel_size, requests)]
+            inputs = ((self.model_args, sampling_params, req) for req in requests)
+            object_refs = [run_inference_one_model.remote(*x) for x in inputs]
+            results = ray.get(object_refs)
+            # Invoke ray.shutdown() to prevent hang-ups if subsequent calls required.
+            ray.shutdown()
+            # flatten results
+            return undistribute(results)
+
+        outputs = self.model.generate(
+            prompt_token_ids=requests,
+            sampling_params=sampling_params,
+            use_tqdm=True if self.batch_size == "auto" else False,
+        )
+        return outputs
+
+    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
+        loglikelihoods = []
+
+        for (string,) in tqdm([req.args for req in requests]):
+            rolling_token_windows = list(
+                map(
+                    make_disjoint_window,
+                    get_rolling_token_windows(
+                        token_list=self.tok_encode(string),
+                        prefix_token=self.eot_token_id,
+                        max_seq_len=self.max_length - 1,
+                        context_len=1,
+                    ),
+                )
+            )
+
+            rolling_token_windows = [(None,) + x for x in rolling_token_windows]
+
+            string_nll = self._loglikelihood_tokens(
+                rolling_token_windows,
+            )
+
+            # discard is_greedy
+            string_nll = [x[0] for x in string_nll]
+
+            string_nll = sum(string_nll)
+            loglikelihoods.append(string_nll)
+        return loglikelihoods
+
+    def generate_until(self, requests: List[Instance]) -> List[str]:
+        res = []
+
+        # batch tokenize contexts
+        context, all_gen_kwargs = zip(*(req.args for req in requests))
+        context_encoding = self.tokenizer(context, add_special_tokens=False).input_ids
+        requests = [
+            ((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs)
+        ]
+
+        def _collate_gen(_requests):
+            # the negative sign on len(toks) sorts descending - this has a few advantages:
+            # - time estimates will always be over not underestimates, which is more useful for planning
+            # - to know the size of a batch when going through the list, you know the first one is always the batch
+            #   padded context length. this is useful to simplify the batching logic and more importantly to make
+            #   automatic adaptive batches much much easier to implement
+            # - any OOMs will happen right away rather than near the end
+            return -len(_requests[0][1]), _requests[0][0]
+
+        # we group requests by their generation_kwargs,
+        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
+        # in the same batch.
+        re_ords = Collator(requests, _collate_gen, group_by="gen_kwargs")
+        chunks = re_ords.get_batched(
+            n=int(self.batch_size) if self.batch_size != "auto" else 0, batch_fn=None
+        )
+
+        pbar = tqdm(
+            total=len(requests),
+            disable=(self.rank != 0),
+            desc="Running generate_until requests",
+        )
+        # for each different set of kwargs, we execute all requests, by batch.
+        for chunk in chunks:
+            context_and_encoding, all_gen_kwargs = zip(*chunk)
+            context, context_encoding = zip(*context_and_encoding)
+            # we assume all gen kwargs in the batch are the same
+            # this is safe to assume because the `grouper` object ensures it.
+            gen_kwargs = all_gen_kwargs[0]
+            # unpack our keyword arguments.
+            until = None
+            if isinstance(gen_kwargs, dict):
+                kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
+                if "until" in kwargs.keys():
+                    until = kwargs.pop("until")
+                    if isinstance(until, str):
+                        until = [until]
+                    elif not isinstance(until, list):
+                        raise ValueError(
+                            f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
+                        )
+            else:
+                raise ValueError(
+                    f"Expected `kwargs` to be of type `dict` but got {gen_kwargs}"
+                )
+            # add EOS token to stop sequences
+            eos = self.tokenizer.decode(self.eot_token_id)
+            if not until:
+                until = [eos]
+            else:
+                until.append(eos)
+            if "max_gen_toks" in kwargs.keys():
+                max_gen_toks = kwargs.pop("max_gen_toks")
+            else:
+                max_gen_toks = self.max_gen_toks
+
+            # set the max length in tokens of inputs ("context_enc")
+            # max len for inputs = max length, minus room to generate the max new tokens
+            max_ctx_len = self.max_length - max_gen_toks
+            context_encoding = [x[-max_ctx_len:] for x in context_encoding]
+
+            # perform batched generation
+            cont = self._model_generate(
+                requests=context_encoding,
+                generate=True,
+                max_tokens=max_gen_toks,
+                stop=until,
+                **kwargs,
+            )
+
+            # cache generations
+            for output, context in zip(cont, context):
+                generated_text = output.outputs[0].text
+                res.append(generated_text)
+                self.cache_hook.add_partial(
+                    "generate_until", (context, gen_kwargs), generated_text
+                )
+                pbar.update(1)
+
+        pbar.close()
+        # reorder all group of results back to original unsorted form
+        return re_ords.get_original(res)
+
+    def _loglikelihood_tokens(
+        self,
+        requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
+        disable_tqdm: bool = False,
+    ) -> List[Tuple[float, bool]]:
+        res = []
+
+        def _collate(x):
+            toks = x[1] + x[2]
+            return -len(toks), tuple(toks)
+
+        # Reorder requests by length and batch
+        re_ord = Collator(requests, sort_fn=_collate)
+        chunks = re_ord.get_batched(
+            n=int(self.batch_size) if self.batch_size != "auto" else 0, batch_fn=None
+        )
+
+        pbar = tqdm(
+            total=len(requests),
+            disable=disable_tqdm,
+            desc="Running loglikelihood requests",
+        )
+        for chunk in chunks:
+            inputs = []
+            ctxlens = []
+            for cache_key, context_enc, continuation_enc in chunk:
+                inp = (context_enc + continuation_enc)[-(self.max_length) :]
+                ctxlen = len(context_enc) - max(
+                    0, len(context_enc) + len(continuation_enc) - (self.max_length)
+                )
+
+                inputs.append(inp)
+                ctxlens.append(ctxlen)
+
+            outputs = self._model_generate(requests=inputs, generate=False)
+
+            for output, ctxlen, (cache_key, _, _), inp in zip(
+                outputs, ctxlens, chunk, inputs
+            ):
+                answer = self._parse_logprobs(
+                    tokens=inp,
+                    outputs=output,
+                    ctxlen=ctxlen,
+                )
+
+                res.append(answer)
+
+                # partial caching
+                if cache_key is not None:
+                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)
+                pbar.update(1)
+        pbar.close()
+        return re_ord.get_original(res)
+
+    @staticmethod
+    def _parse_logprobs(tokens: List, outputs, ctxlen: int) -> Tuple[float, bool]:
+        """Process logprobs and tokens.
+
+        :param tokens: list
+            Input tokens (potentially left-truncated)
+        :param outputs: RequestOutput
+            Contains prompt_logprobs
+        :param ctxlen: int
+            Length of context (so we can slice them away and only keep the predictions)
+        :return:
+            continuation_logprobs: float
+                Log probabilities of continuation tokens
+            is_greedy: bool
+                Whether argmax matches given continuation exactly
+        """
+
+        # The first entry of prompt_logprobs is None because the model has no previous tokens to condition on.
+        continuation_logprobs_dicts = outputs.prompt_logprobs
+
+        def coerce_logprob_to_num(logprob):
+            # vLLM changed the return type of logprobs from float
+            # to a Logprob object storing the float value + extra data
+            # (https://github.com/vllm-project/vllm/pull/3065).
+            # If we are dealing with vllm's Logprob object, return
+            # the logprob value stored as an attribute. Otherwise,
+            # return the object itself (which should be a float
+            # for older versions of vLLM).
+            return getattr(logprob, "logprob", logprob)
+
+        continuation_logprobs_dicts = [
+            {
+                token: coerce_logprob_to_num(logprob)
+                for token, logprob in logprob_dict.items()
+            }
+            if logprob_dict is not None
+            else None
+            for logprob_dict in continuation_logprobs_dicts
+        ]
+
+        # Calculate continuation_logprobs
+        # assume ctxlen always >= 1
+        continuation_logprobs = sum(
+            logprob_dict.get(token)
+            for token, logprob_dict in zip(
+                tokens[ctxlen:], continuation_logprobs_dicts[ctxlen:]
+            )
+        )
+
+        # Determine if is_greedy
+        is_greedy = True
+        for token, logprob_dict in zip(
+            tokens[ctxlen:], continuation_logprobs_dicts[ctxlen:]
+        ):
+            # Get the token with the maximum log probability from the logprob_dict
+            if logprob_dict:  # Ensure the logprob_dict is not None
+                top_token = max(logprob_dict, key=logprob_dict.get)
+                if top_token != token:
+                    is_greedy = False
+                    break
+
+        return continuation_logprobs, is_greedy
+
+    @staticmethod
+    def modify_gen_kwargs(kwargs: dict) -> dict:
+        # sampling_params
+        do_sample = kwargs.pop("do_sample", None)
+        if do_sample is False or "temperature" not in kwargs:
+            kwargs["temperature"] = 0.0
+        # hf defaults
+        kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
+        kwargs["spaces_between_special_tokens"] = kwargs.get(
+            "spaces_between_special_tokens", False
+        )
+        return kwargs
--- a/LM-Evaluation-Harness-240310/lm_eval/prompts/__init__.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/prompts/__init__.py
+import ast
+import os
+from typing import Dict
+
+from lm_eval import utils
+from lm_eval.utils import eval_logger
+
+
+# Prompt library.
+# Stores prompts in a dictionary indexed by 2 levels:
+# prompt category name, and prompt name.
+# This allows us to access prompts
+PROMPT_REGISTRY: Dict[str, Dict[str, str]] = {
+    "qa-basic": {
+        "question-newline-answer": "Question: {{question}}\nAnswer:",
+        "q-newline-a": "Q: {{question}}\nA:",
+    },
+}
+
+
+def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None):
+    # unpack prompt name
+    category_name, prompt_name = prompt_id.split(":")
+    if subset_name is None:
+        dataset_full_name = dataset_name
+    else:
+        dataset_full_name = f"{dataset_name}-{subset_name}"
+    eval_logger.info(f"Loading prompt from {category_name} for {dataset_full_name}")
+    if category_name == "promptsource":
+        try:
+            from promptsource.templates import DatasetTemplates
+        except ModuleNotFoundError:
+            raise Exception(
+                "Tried to load a Promptsource template, but promptsource is not installed ",
+                "please install promptsource via pip install lm-eval[promptsource] or pip install -e .[promptsource]",
+            )
+        try:
+            if subset_name is None:
+                prompts = DatasetTemplates(dataset_name=dataset_name)
+            else:
+                prompts = DatasetTemplates(
+                    dataset_name=dataset_name, subset_name=subset_name
+                )
+        except Exception:
+            raise ValueError(f"{dataset_name} and {subset_name} not found")
+        if prompt_name in prompts.all_template_names:
+            return prompts[prompt_name]
+        else:
+            raise ValueError(
+                f"{prompt_name} not in prompt list {prompts.all_template_names}"
+            )
+    elif ".yaml" in category_name:
+        import yaml
+
+        with open(category_name, "rb") as file:
+            prompt_yaml_file = yaml.full_load(file)
+
+        prompt_string = prompt_yaml_file["prompts"][prompt_name]
+        return PromptString(prompt_string)
+    else:
+        try:
+            return PROMPT_REGISTRY[category_name][prompt_name]
+        except Exception:
+            raise ValueError(
+                f"expected only a single `:` as separator between \
+                prompt category and name, but got `{prompt_id}` instead"
+            )
+
+
+def load_prompt_list(
+    use_prompt: str, dataset_name=None, subset_name=None, yaml_path=None, **kwargs
+):
+    category_name, prompt_name = use_prompt.split(":")
+
+    if category_name == "promptsource":
+        from promptsource.templates import DatasetTemplates
+
+        if subset_name is None:
+            prompts = DatasetTemplates(dataset_name=dataset_name)
+        else:
+            prompts = DatasetTemplates(
+                dataset_name=dataset_name, subset_name=subset_name
+            )
+
+        prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names)
+
+    elif ".yaml" in category_name:
+        import yaml
+
+        if yaml_path is not None:
+            category_name = os.path.realpath(os.path.join(yaml_path, category_name))
+
+        with open(category_name, "rb") as file:
+            prompt_yaml_file = yaml.full_load(file)
+
+        prompt_list = utils.pattern_match(
+            prompt_name, prompt_yaml_file["prompts"].keys()
+        )
+
+    # category_name, *prompt_name = use_prompt.split(":")
+    # TODO allow to multiple prompt naming
+    # if len(prompt_name) > 1:
+    #     prompt_list = []
+    #     for prompt in prompt_name:
+    #         prompt_list.append(utils.pattern_match(prompt_name, prompts.all_template_names))
+    # else:
+    #     prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names)
+    return [":".join([category_name, prompt]) for prompt in prompt_list]
+
+
+class PromptString:
+    def __init__(self, prompt_string):
+        self.prompt_string = prompt_string
+
+    def apply(self, doc):
+        doc_to_text = self.prompt_string["doc_to_text"]
+        doc_to_target = self.prompt_string["doc_to_target"]
+
+        # TODO need a way to process doc_to_choice
+        if "doc_to_choice" in self.prompt_string:
+            raise Exception("Not yet implemented to accept doc_to_choice")
+
+        text_string = utils.apply_template(doc_to_text, doc)
+        target_string = utils.apply_template(doc_to_target, doc)
+
+        return [text_string, target_string]
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/README.md
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/README.md
+# v1.0 Tasks
+This list keeps track of which tasks' implementations have been ported to YAML / v2.0 of the Eval Harness.
+
+Boxes should be checked iff tasks are implemented in the refactor and tested for regression. Tasks should be struck through if checked *against original introducing paper* implementation or popularizing implementation. (WIP) Denotes that there exists a PR or person working on this task already.
+
+- [x] Glue
+- [x] SuperGlue
+- [x] CoQA
+- [x] DROP
+- [x] ~~Lambada~~
+- [x] Lambada (Cloze variants)
+- [x] ~~Lambada (Multilingual)~~
+- [x] Wikitext
+- [x] PiQA
+- [x] PROST
+- [x] MCTACO
+- [x] Pubmed QA
+- [x] SciQ
+- [x] QASPER
+- [x] QA4MRE
+- [x] TriviaQA
+- [x] AI2 ARC
+- [x] LogiQA
+- [x] HellaSwag
+- [x] SWAG
+- [x] OpenBookQA
+- [ ] SQuADv2 (Lintang)
+- [x] RACE
+- [x] HeadQA
+- [x] MathQA
+- [x] WebQs
+- [x] WSC273
+- [x] Winogrande
+- [x] ANLI
+- [x] Hendrycks Ethics (missing some tasks/metrics, see PR 660: <https://github.com/EleutherAI/lm-evaluation-harness/pull/660> for more info)
+- [x] TruthfulQA (mc1)
+- [x] TruthfulQA (mc2)
+- [x] TruthfulQA (gen)
+- [x] MuTual
+- [ ] Hendrycks Math (Hailey)
+- [x] Asdiv
+- [ ] GSM8k
+- [x] Arithmetic
+- [ ] MMMLU (Hailey)
+- [x] Translation (WMT) suite
+- [x] Unscramble
+- [x] ~~Pile (perplexity)~~
+- [x] BLiMP
+- [x] ToxiGen
+- [x] StoryCloze
+- [ ] NaturalQs (Hailey)
+- [x] CrowS-Pairs
+- [x] XCopa
+- [ ] BIG-Bench (Hailey)
+- [x] XStoryCloze
+- [x] XWinograd
+- [x] PAWS-X
+- [x] XNLI
+- [x] MGSM
+- [ ] SCROLLS
+- [x] Babi
+- [x] Belebele
+
+# Novel Tasks
+Tasks added in the revamped harness that were not previously available. Again, a strikethrough denotes checking performed *against the original task's implementation or published results introducing the task*.
+
+# Task Wishlist
+
+- [ ] TheoremQA
+- [ ] Theorem Proving evaluations
+- [ ] Chain of Thought
+- [ ] Self-consistency ; Least-to-Most prompting, etc.
+- [ ] Summarization Tasks
+- [ ] Anthropic Model-Written Evals
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/__init__.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/__init__.py
+import collections
+import logging
+import os
+from functools import partial
+from typing import Dict, List, Mapping, Optional, Union
+
+from lm_eval import utils
+from lm_eval.api.task import ConfigurableTask, Task
+
+
+class TaskManager:
+    """TaskManager indexes all tasks from the default `lm_eval/tasks/`
+    and an optional directory if provided.
+
+    """
+
+    def __init__(self, verbosity="INFO", include_path: Optional[str] = None) -> None:
+        self.verbosity = verbosity
+        self.include_path = include_path
+        self.logger = utils.eval_logger
+        self.logger.setLevel(getattr(logging, f"{verbosity}"))
+
+        self._task_index = self.initialize_tasks(include_path=include_path)
+        self._all_tasks = sorted(list(self._task_index.keys()))
+
+        self.task_group_map = collections.defaultdict(list)
+
+    def initialize_tasks(self, include_path: Optional[str] = None):
+        """Creates a dictionary of tasks index.
+
+        :param include_path: str = None
+            An additional path to be searched for tasks
+
+        :return
+            Dictionary of task names as key and task metadata
+        """
+        all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
+        if include_path is not None:
+            if isinstance(include_path, str):
+                include_path = [include_path]
+            all_paths.extend(include_path)
+
+        task_index = {}
+        for task_dir in all_paths:
+            tasks = self._get_task_and_group(task_dir)
+            task_index = {**tasks, **task_index}
+
+        return task_index
+
+    @property
+    def all_tasks(self):
+        return self._all_tasks
+
+    @property
+    def task_index(self):
+        return self._task_index
+
+    def match_tasks(self, task_list):
+        return utils.pattern_match(task_list, self.all_tasks)
+
+    def _name_is_registered(self, name) -> bool:
+        if name in self.all_tasks:
+            return True
+        return False
+
+    def _name_is_task(self, name) -> bool:
+        if self._name_is_registered(name) and ("task" in self.task_index[name]["type"]):
+            return True
+        return False
+
+    def _name_is_group(self, name) -> bool:
+        if self._name_is_registered(name) and (
+            self.task_index[name]["type"] == "group"
+        ):
+            return True
+        return False
+
+    def _name_is_python_task(self, name):
+        if self._name_is_registered(name) and (
+            self.task_index[name]["type"] == "python_task"
+        ):
+            return True
+        return False
+
+    def _config_is_task(self, config) -> bool:
+        if ("task" in config) and isinstance(config["task"], str):
+            return True
+        return False
+
+    def _config_is_group(self, config) -> bool:
+        if ("task" in config) and isinstance(config["task"], list):
+            return True
+        return False
+
+    def _config_is_python_task(self, config) -> bool:
+        if "class" in config:
+            return True
+        return False
+
+    def _get_yaml_path(self, name):
+        if name not in self.task_index:
+            raise ValueError
+        return self.task_index[name]["yaml_path"]
+
+    def _get_config(self, name):
+        if name not in self.task_index:
+            raise ValueError
+        yaml_path = self._get_yaml_path(name)
+        if yaml_path == -1:
+            return {}
+        else:
+            return utils.load_yaml_config(yaml_path, mode="full")
+
+    def _get_tasklist(self, name):
+        if self._name_is_task(name):
+            raise ValueError
+        return self.task_index[name]["task"]
+
+    def _process_alias(self, config, group=None):
+        # If the group is not the same as the original
+        # group which the group alias was intended for,
+        # Set the group_alias to None instead.
+        if ("group_alias" in config) and ("group" in config) and group is not None:
+            if config["group"] != group:
+                config["group_alias"] = None
+        return config
+
+    def _load_individual_task_or_group(
+        self,
+        name_or_config: Optional[Union[str, dict]] = None,
+        parent_name: Optional[str] = None,
+        update_config: Optional[dict] = None,
+        yaml_path: Optional[str] = None,
+    ) -> Mapping:
+        def load_task(config, task, group=None, yaml_path=None):
+            if "include" in config:
+                if yaml_path is None:
+                    raise ValueError
+                config.update(
+                    utils.load_yaml_config(
+                        yaml_path,
+                        yaml_config={"include": config.pop("include")},
+                        mode="full",
+                    )
+                )
+            if self._config_is_python_task(config):
+                task_object = config["class"]()
+            else:
+                config = self._process_alias(config, group=group)
+                task_object = ConfigurableTask(config=config)
+            if group is not None:
+                task_object = (group, task_object)
+            return {task: task_object}
+
+        if isinstance(name_or_config, str):
+            if update_config is not None:
+                # Process name_or_config as a dict instead
+                name_or_config = {"task": name_or_config, **update_config}
+            elif self._name_is_task(name_or_config):
+                task_config = self._get_config(name_or_config)
+                return load_task(task_config, task=name_or_config, group=parent_name)
+            else:
+                group_name = name_or_config
+                subtask_list = self._get_tasklist(name_or_config)
+                if subtask_list == -1:
+                    group_config = self._get_config(name_or_config)
+                    subtask_list = group_config["task"]
+
+                # This checks if we're at the root.
+                if parent_name is None:
+                    group_config = self._get_config(name_or_config)
+                    if set(group_config.keys()) > {"task", "group"}:
+                        update_config = {
+                            k: v
+                            for k, v in group_config.items()
+                            if k not in ["task", "group"]
+                        }
+                    yaml_path = self._get_yaml_path(group_name)
+
+                    if (update_config is not None) and ("group_alias" in update_config):
+                        group_name = update_config["group_alias"]
+                        update_config.pop("group_alias")
+
+        if isinstance(name_or_config, dict):
+            if update_config is not None:
+                name_or_config = {
+                    **name_or_config,
+                    **update_config,
+                }
+
+            if self._config_is_task(name_or_config):
+                name = name_or_config["task"]
+                # If the name is registered as a group
+                # if self._name_is_task(name) is False:
+                if self._name_is_group(name):
+                    group_name = name
+                    update_config = {
+                        k: v for k, v in name_or_config.items() if k != "task"
+                    }
+                    subtask_list = self._get_tasklist(name)
+                    if subtask_list == -1:
+                        subtask_list = self._get_config(name)["task"]
+                else:
+                    if self._name_is_registered(name):
+                        base_task_config = self._get_config(name)
+
+                        # Check if this is a duplicate.
+                        if parent_name is not None:
+                            name_or_config["group"] = parent_name
+                            num_duplicate = len(
+                                list(
+                                    filter(
+                                        lambda x: x.startswith(name),
+                                        self.task_group_map[parent_name],
+                                    )
+                                )
+                            )
+                            if num_duplicate > 0:
+                                name = f"{name}-{num_duplicate}"
+                            self.task_group_map[parent_name].append(name)
+
+                        task_config = {
+                            **base_task_config,
+                            **name_or_config,
+                        }
+                    else:
+                        task_config = name_or_config
+                    return load_task(
+                        task_config, task=name, group=parent_name, yaml_path=yaml_path
+                    )
+            else:
+                group_name = name_or_config["group"]
+                subtask_list = name_or_config["task"]
+                if set(name_or_config.keys()) > {"task", "group"}:
+                    update_config = {
+                        k: v
+                        for k, v in name_or_config.items()
+                        if k not in ["task", "group"]
+                    }
+
+        all_subtasks = {}
+        if parent_name is not None:
+            all_subtasks = {group_name: (parent_name, None)}
+
+        fn = partial(
+            self._load_individual_task_or_group,
+            parent_name=group_name,
+            update_config=update_config,
+            yaml_path=yaml_path,
+        )
+        all_subtasks = {
+            **all_subtasks,
+            **dict(collections.ChainMap(*map(fn, subtask_list))),
+        }
+        return all_subtasks
+
+    def load_task_or_group(self, task_list: Optional[Union[str, list]] = None) -> dict:
+        """Loads a dictionary of task objects from a list
+
+        :param task_list: Union[str, list] = None
+            Single string or list of string of task names to be loaded
+
+        :return
+            Dictionary of task objects
+        """
+        if isinstance(task_list, str):
+            task_list = [task_list]
+
+        all_loaded_tasks = dict(
+            collections.ChainMap(*map(self._load_individual_task_or_group, task_list))
+        )
+        return all_loaded_tasks
+
+    def load_config(self, config: Dict):
+        return self._load_individual_task_or_group(config)
+
+    def _get_task_and_group(self, task_dir: str):
+        """Creates a dictionary of tasks index with the following metadata,
+        - `type`, that can be either `task`, `python_task`, or `group`.
+            `task` refer to regular task configs, `python_task` are special
+            yaml files that only consists of `task` and `class` parameters.
+            `group` are group configs.
+        - `yaml_path`, path to the yaml file. If the entry is a `group` that
+            was configured through a task config, the yaml_path will be -1
+            and all subtasks will be listed in `task` (see below)
+        - `task`, reserved for entries with `type` as `group`. This will list
+            all subtasks. When a group config is created (as opposed to task
+            config having `group` parameter set), this will be set to -1 to
+            avoid recursive indexing. The whole list of subtasks will be loaded
+            at evaluation.
+
+        :param task_dir: str
+            A directory to check for tasks
+
+        :return
+            Dictionary of task names as key and task metadata
+        """
+        tasks_and_groups = collections.defaultdict()
+        for root, _, file_list in os.walk(task_dir):
+            for f in file_list:
+                if f.endswith(".yaml"):
+                    yaml_path = os.path.join(root, f)
+                    config = utils.load_yaml_config(yaml_path, mode="simple")
+                    if self._config_is_python_task(config):
+                        # This is a python class config
+                        tasks_and_groups[config["task"]] = {
+                            "type": "python_task",
+                            "yaml_path": yaml_path,
+                        }
+                    elif self._config_is_group(config):
+                        # This is a group config
+                        tasks_and_groups[config["group"]] = {
+                            "type": "group",
+                            "task": -1,  # This signals that
+                            # we don't need to know
+                            # the task list for indexing
+                            # as it can be loaded
+                            # when called.
+                            "yaml_path": yaml_path,
+                        }
+
+                        # # Registered the level 1 tasks from a group config
+                        # for config in config["task"]:
+                        #     if isinstance(config, dict) and self._config_is_task(config):
+                        #         task = config["task"]
+                        #         tasks_and_groups[task] = {
+                        #             "type": "task",
+                        #             "yaml_path": yaml_path,
+                        #             }
+
+                    elif self._config_is_task(config):
+                        # This is a task config
+                        task = config["task"]
+                        tasks_and_groups[task] = {
+                            "type": "task",
+                            "yaml_path": yaml_path,
+                        }
+
+                        if "group" in config:
+                            groups = config["group"]
+                            if isinstance(config["group"], str):
+                                groups = [groups]
+
+                            for group in groups:
+                                if group not in tasks_and_groups:
+                                    tasks_and_groups[group] = {
+                                        "type": "group",
+                                        "task": [task],
+                                        "yaml_path": -1,
+                                    }
+                                else:
+                                    tasks_and_groups[group]["task"].append(task)
+                    else:
+                        self.logger.debug(f"File {f} in {root} could not be loaded")
+
+        return tasks_and_groups
+
+
+def include_path(task_dir):
+    logger = utils.eval_logger
+    logger.setLevel(getattr(logging, "INFO"))
+    logger.info(
+        "To still use tasks loaded from args.include_path,"
+        "see an example of the new TaskManager API in "
+        "https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
+    )
+    return 0
+
+
+def initialize_tasks(verbosity="INFO"):
+    logger = utils.eval_logger
+    logger.setLevel(getattr(logging, f"{verbosity}"))
+    logger.info(
+        "lm_eval.tasks.initialize_tasks() is deprecated and no longer necessary. "
+        "It will be removed in v0.4.2 release. "
+        "TaskManager will instead be used."
+    )
+    return 0
+
+
+def get_task_name_from_config(task_config: Dict[str, str]) -> str:
+    if "task" in task_config:
+        return task_config["task"]
+    if "dataset_name" in task_config:
+        return "{dataset_path}_{dataset_name}".format(**task_config)
+    else:
+        return "{dataset_path}".format(**task_config)
+
+
+def get_task_name_from_object(task_object):
+    if hasattr(task_object, "config"):
+        return task_object._config["task"]
+
+    # TODO: scrap this
+    # this gives a mechanism for non-registered tasks to have a custom name anyways when reporting
+    return (
+        task_object.EVAL_HARNESS_NAME
+        if hasattr(task_object, "EVAL_HARNESS_NAME")
+        else type(task_object).__name__
+    )
+
+
+def get_task_dict(
+    task_name_list: List[Union[str, Dict, Task]],
+    task_manager: Optional[TaskManager] = None,
+):
+    """Creates a dictionary of task objects from either a name of task, config, or prepared Task object.
+
+    :param task_name_list: List[Union[str, Dict, Task]]
+        Name of model or LM object, see lm_eval.models.get_model
+    :param task_manager: TaskManager = None
+        A TaskManager object that stores indexed tasks. If not set,
+        task_manager will load one. This should be set by the user
+        if there are additional paths that want to be included
+        via `include_path`
+
+    :return
+        Dictionary of task objects
+    """
+    task_name_from_string_dict = {}
+    task_name_from_config_dict = {}
+    task_name_from_object_dict = {}
+
+    if isinstance(task_name_list, str):
+        task_name_list = [task_name_list]
+
+    string_task_name_list = [task for task in task_name_list if isinstance(task, str)]
+    others_task_name_list = [task for task in task_name_list if ~isinstance(task, str)]
+    if len(string_task_name_list) > 0:
+        if task_manager is None:
+            task_manager = TaskManager()
+
+        task_name_from_string_dict = task_manager.load_task_or_group(
+            string_task_name_list
+        )
+
+    for task_element in others_task_name_list:
+        if isinstance(task_element, dict):
+            task_name_from_config_dict = {
+                **task_name_from_config_dict,
+                **task_manager.load_config(config=task_element),
+            }
+
+        elif isinstance(task_element, Task):
+            task_name_from_object_dict = {
+                **task_name_from_object_dict,
+                get_task_name_from_object(task_element): task_element,
+            }
+
+    if not set(task_name_from_string_dict.keys()).isdisjoint(
+        set(task_name_from_object_dict.keys())
+    ):
+        raise ValueError
+
+    return {
+        **task_name_from_string_dict,
+        **task_name_from_config_dict,
+        **task_name_from_object_dict,
+    }
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/README.md
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/README.md
+# ArabicMMLU
+
+### Paper
+
+ArabicMMLU: Measuring massive multitask language understanding in Arabic
+This dataset has been translated from the original MMLU with the help of GPT-4.
+
+The original data [MMLU](https://arxiv.org/pdf/2009.03300v3.pdf)
+
+The translation has been done with AceGPT researchers [AceGPT](https://arxiv.org/abs/2309.12053)
+
+ArabicMMLU is a comprehensive evaluation benchmark specifically designed to evaluate the knowledge and reasoning abilities of LLMs within the context of Arabic language and culture.
+ArabicMMLU covers a wide range of subjects, comprising 57 topics that span from elementary to advanced professional levels.
+
+Homepage: [AceGPT Homepage](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic)
+
+### Citation
+
+
+### Groups and Tasks
+
+#### Groups
+
+- `ammlu`: All 57 subjects of the ArabicMMLU dataset, evaluated following the methodology in MMLU's original implementation.
+
+#### Tasks
+
+
+The following tasks evaluate subjects in the ArabicMMLU dataset using loglikelihood-based multiple-choice scoring:
+- `ammlu_{subject_english}`
+
+### Checklist
+
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [x] If yes, does the original paper provide a reference implementation?
+    * [x] Yes, original implementation contributed by author of the benchmark
+
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/_default_template_yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/_default_template_yaml
+group: ammlu
+dataset_path: Hennara/ammlu
+test_split: test
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+output_type: multiple_choice
+doc_to_text: "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nالجواب："
+doc_to_choice: ["A", "B", "C", "D"]
+doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer)}}"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 0.0
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/_generate_configs.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/ammlu/_generate_configs.py
+"""
+Take in a YAML, and output all other splits with this YAML
+"""
+import argparse
+import os
+
+import yaml
+from tqdm import tqdm
+
+
+SUBJECTS = {
+    "abstract_algebra": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "anatomy": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "astronomy": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "business_ethics": "علوم أخرى",
+    "clinical_knowledge": "علوم أخرى",
+    "college_biology": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "college_chemistry": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "college_computer_science": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "college_mathematics": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "college_medicine": "علوم أخرى",
+    "college_physics": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "computer_security": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "conceptual_physics": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "econometrics": "العلوم الإجتماعية",
+    "electrical_engineering": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "elementary_mathematics": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "formal_logic": "العلوم الانسانية",
+    "global_facts": "علوم أخرى",
+    "high_school_biology": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "high_school_chemistry": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "high_school_computer_science": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "high_school_european_history": "العلوم الانسانية",
+    "high_school_geography": "العلوم الإجتماعية",
+    "high_school_government_and_politics": "العلوم الإجتماعية",
+    "high_school_macroeconomics": "العلوم الإجتماعية",
+    "high_school_mathematics": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "high_school_microeconomics": "العلوم الإجتماعية",
+    "high_school_physics": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "high_school_psychology": "العلوم الإجتماعية",
+    "high_school_statistics": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "high_school_us_history": "العلوم الانسانية",
+    "high_school_world_history": "العلوم الانسانية",
+    "human_aging": "علوم أخرى",
+    "human_sexuality": "العلوم الإجتماعية",
+    "international_law": "العلوم الانسانية",
+    "jurisprudence": "العلوم الانسانية",
+    "logical_fallacies": "العلوم الانسانية",
+    "machine_learning": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "management": "علوم أخرى",
+    "marketing": "علوم أخرى",
+    "medical_genetics": "علوم أخرى",
+    "miscellaneous": "علوم أخرى",
+    "moral_disputes": "العلوم الانسانية",
+    "moral_scenarios": "العلوم الانسانية",
+    "nutrition": "علوم أخرى",
+    "philosophy": "العلوم الانسانية",
+    "prehistory": "العلوم الانسانية",
+    "professional_accounting": "علوم أخرى",
+    "professional_law": "العلوم الانسانية",
+    "professional_medicine": "علوم أخرى",
+    "professional_psychology": "العلوم الإجتماعية",
+    "public_relations": "العلوم الإجتماعية",
+    "security_studies": "العلوم الإجتماعية",
+    "sociology": "العلوم الإجتماعية",
+    "us_foreign_policy": "العلوم الإجتماعية",
+    "virology": "علوم أخرى",
+    "world_religions": "العلوم الانسانية",
+}
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base_yaml_path", required=True)
+    parser.add_argument("--save_prefix_path", default="ammlu")
+    parser.add_argument("--cot_prompt_path", default=None)
+    parser.add_argument("--task_prefix", default="")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+
+    # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
+    base_yaml_name = os.path.split(args.base_yaml_path)[-1]
+    with open(args.base_yaml_path, encoding="utf-8") as f:
+        base_yaml = yaml.full_load(f)
+
+    if args.cot_prompt_path is not None:
+        import json
+
+        with open(args.cot_prompt_path, encoding="utf-8") as f:
+            cot_file = json.load(f)
+
+    for subject_eng, category in tqdm(SUBJECTS.items()):
+        if args.cot_prompt_path is not None:
+            description = cot_file[subject_eng]
+        else:
+            description = f"فم بعملية التقييم في مجال {category} \n\n"
+
+        yaml_dict = {
+            "include": base_yaml_name,
+            "task": f"ammlu_{args.task_prefix}_{subject_eng}"
+            if args.task_prefix != ""
+            else f"ammlu_{subject_eng}",
+            "dataset_name": subject_eng,
+            "description": description,
+        }
+
+        file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml"
+        print(f"Saving yaml for subset {subject_eng} to {file_save_path}")
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
+            yaml.dump(
+                yaml_dict,
+                yaml_file,
+                width=float("inf"),
+                allow_unicode=True,
+                default_style='"',
+            )