Commit 60c9c170 authored by haileyschoelkopf

Merge branch 'main' into inverse-scaling-tasks

parents 4b2d565b b4cd85d4
from .evaluation_tracker import EvaluationTracker
from .wandb_logger import WandbLogger
import json
import re
import time
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from huggingface_hub import HfApi
from lm_eval.utils import (
eval_logger,
handle_non_serializable,
hash_string,
)
@dataclass(init=False)
class GeneralConfigTracker:
"""
Tracker for the evaluation parameters.
Attributes:
model_source (str): Source of the model (e.g. Hugging Face, GGUF, etc.)
model_name (str): Name of the model.
model_name_sanitized (str): Sanitized model name for directory creation.
start_time (float): Start time of the experiment. Logged at class init.
end_time (float): End time of the experiment. Logged when calling [`GeneralConfigTracker.log_end_time`].
total_evaluation_time_seconds (str): Inferred total evaluation time in seconds (from the start and end times).
"""
model_source: str = None
model_name: str = None
model_name_sanitized: str = None
start_time: float = None
end_time: float = None
total_evaluation_time_seconds: str = None
def __init__(self) -> None:
"""Starts the evaluation timer."""
self.start_time = time.perf_counter()
@staticmethod
def _get_model_name(model_args: str) -> str:
"""Extracts the model name from the model arguments."""
def extract_model_name(model_args: str, key: str) -> str:
"""Extracts the model name from the model arguments using a key."""
args_after_key = model_args.split(key)[1]
return args_after_key.split(",")[0]
# order does matter, e.g. peft and delta are provided together with pretrained
prefixes = ["peft=", "delta=", "pretrained=", "model=", "path=", "engine="]
for prefix in prefixes:
if prefix in model_args:
return extract_model_name(model_args, prefix)
return ""
def log_experiment_args(
self,
model_source: str,
model_args: str,
) -> None:
"""Logs model parameters and job ID."""
self.model_source = model_source
self.model_name = GeneralConfigTracker._get_model_name(model_args)
self.model_name_sanitized = re.sub(
r"[\"<>:/\|\\?\*\[\]]+", "__", self.model_name
)
def log_end_time(self) -> None:
"""Logs the end time of the evaluation and calculates the total evaluation time."""
self.end_time = time.perf_counter()
self.total_evaluation_time_seconds = str(self.end_time - self.start_time)
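# Illustrative sketch (not part of the library): how GeneralConfigTracker is driven
# over the life of a run. The `model_args` string is a hypothetical example of the
# comma-separated argument format parsed by `_get_model_name`.
def _example_general_config_tracker() -> GeneralConfigTracker:
    tracker = GeneralConfigTracker()  # starts the timer
    tracker.log_experiment_args(
        model_source="hf",
        model_args="pretrained=EleutherAI/pythia-70m,dtype=float32",
    )
    # model_name is the value after the first matching prefix, so here
    # tracker.model_name == "EleutherAI/pythia-70m" and
    # tracker.model_name_sanitized == "EleutherAI__pythia-70m".
    tracker.log_end_time()  # fills end_time and total_evaluation_time_seconds
    return tracker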
class EvaluationTracker:
"""
Keeps track of and saves relevant information about the evaluation process.
Compiles the data from trackers and writes it to files, which can be published to the Hugging Face hub if requested.
"""
def __init__(
self,
output_path: str = None,
hub_results_org: str = "",
hub_repo_name: str = "",
push_results_to_hub: bool = False,
push_samples_to_hub: bool = False,
public_repo: bool = False,
token: str = "",
) -> None:
"""
Creates all the necessary loggers for evaluation tracking.
Args:
output_path (str): Path to save the results. If not provided, the results won't be saved.
hub_results_org (str): The Hugging Face organisation to push the results to. If not provided, the results won't be pushed.
hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
public_repo (bool): Whether to push the results to a public or private repository.
token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
"""
self.general_config_tracker = GeneralConfigTracker()
self.output_path = output_path
self.hub_results_org = hub_results_org
hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results"
self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}"
self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private"
self.push_results_to_hub = push_results_to_hub
self.push_samples_to_hub = push_samples_to_hub
self.public_repo = public_repo
self.api = HfApi(token=token) if token else None
def save_results_aggregated(
self,
results: dict,
samples: dict,
) -> None:
"""
Saves the aggregated results and samples to the output path and pushes them to the Hugging Face hub if requested.
Args:
results (dict): The aggregated results to save.
samples (dict): The samples results to save.
"""
self.general_config_tracker.log_end_time()
if self.output_path:
try:
eval_logger.info("Saving results aggregated")
# calculate cumulative hash for each task - only if samples are provided
task_hashes = {}
if samples:
for task_name, task_samples in samples.items():
sample_hashes = [
s["doc_hash"] + s["prompt_hash"] + s["target_hash"]
for s in task_samples
]
task_hashes[task_name] = hash_string("".join(sample_hashes))
# update initial results dict
results.update({"task_hashes": task_hashes})
results.update(asdict(self.general_config_tracker))
dumped = json.dumps(
results,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
path = Path(self.output_path if self.output_path else Path.cwd())
path = path.joinpath(self.general_config_tracker.model_name_sanitized)
path.mkdir(parents=True, exist_ok=True)
self.date_id = datetime.now().isoformat().replace(":", "-")
file_results_aggregated = path.joinpath(f"results_{self.date_id}.json")
file_results_aggregated.open("w", encoding="utf-8").write(dumped)
if self.api and self.push_results_to_hub:
self.api.create_repo(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_folder(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
repo_type="dataset",
commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
)
except Exception as e:
eval_logger.warning("Could not save results aggregated")
eval_logger.info(repr(e))
else:
eval_logger.info(
"Output path not provided, skipping saving results aggregated"
)
def save_results_samples(
self,
task_name: str,
samples: dict,
) -> None:
"""
Saves the samples results to the output path and pushes them to the Hugging Face hub if requested.
Args:
task_name (str): The task name to save the samples for.
samples (dict): The samples results to save.
"""
if self.output_path:
try:
eval_logger.info("Saving samples results")
samples_dumped = json.dumps(
samples,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
path = Path(self.output_path if self.output_path else Path.cwd())
path = path.joinpath(self.general_config_tracker.model_name_sanitized)
path.mkdir(parents=True, exist_ok=True)
file_results_samples = path.joinpath(
f"samples_{task_name}_{self.date_id}.json"
)
file_results_samples.write_text(samples_dumped, encoding="utf-8")
if self.api and self.push_samples_to_hub:
self.api.create_repo(
self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_folder(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
repo_type="dataset",
commit_message=f"Adding samples results for {task_name} to {self.general_config_tracker.model_name}",
)
except Exception as e:
eval_logger.warning("Could not save sample results")
eval_logger.info(repr(e))
else:
eval_logger.info("Output path not provided, skipping saving sample results")
import logging
import os
import re
import subprocess
from pathlib import Path
from typing import Any, Dict, Optional, Tuple, Union
import numpy as np
from torch.utils.collect_env import get_pretty_env_info
from transformers import __version__ as trans_version
logger = logging.getLogger(__name__)
def remove_none_pattern(input_string: str) -> Tuple[str, bool]:
"""Remove the ',none' substring from the input_string if it exists at the end.
Args:
input_string (str): The input string from which to remove the ',none' substring.
Returns:
Tuple[str, bool]: A tuple containing the modified input_string with the ',none' substring removed
and a boolean indicating whether the modification was made (True) or not (False).
"""
# Define the pattern to match ',none' at the end of the string
pattern = re.compile(r",none$")
# Use sub() to replace ',none' with an empty string
result = re.sub(pattern, "", input_string)
# check if the input_string changed
removed = result != input_string
return result, removed
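# Quick illustration (not part of the library) of the ",none" suffix stripping.
def _example_remove_none_pattern() -> None:
    assert remove_none_pattern("acc,none") == ("acc", True)
    assert remove_none_pattern("acc_norm") == ("acc_norm", False)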
def _handle_non_serializable(o: Any) -> Union[int, str, list]:
"""Handle non-serializable objects by converting them to serializable types.
Args:
o (Any): The object to be handled.
Returns:
Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32,
it will be converted to int. If the object is of type set, it will be converted
to a list. Otherwise, it will be converted to str.
"""
if isinstance(o, np.int64) or isinstance(o, np.int32):
return int(o)
elif isinstance(o, set):
return list(o)
else:
return str(o)
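# Quick illustration (not part of the library): the helper is meant to be passed
# as the `default` hook of json.dumps so numpy integers and sets do not raise.
def _example_handle_non_serializable() -> str:
    import json

    payload = {"count": np.int64(3), "labels": {"b", "a"}}
    return json.dumps(payload, default=_handle_non_serializable, sort_keys=True)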
def get_commit_from_path(repo_path: Union[Path, str]) -> Optional[str]:
try:
git_folder = Path(repo_path, ".git")
if git_folder.is_file():
git_folder = Path(
git_folder.parent,
git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1],
)
if Path(git_folder, "HEAD").exists():
head_name = (
Path(git_folder, "HEAD")
.read_text(encoding="utf-8")
.split("\n")[0]
.split(" ")[-1]
)
head_ref = Path(git_folder, head_name)
git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "")
else:
git_hash = None
except Exception as err:
logger.debug(
f"Failed to retrieve a Git commit hash from path: {str(repo_path)}. Error: {err}"
)
return None
return git_hash
def get_git_commit_hash():
"""
Gets the git commit hash of your current repo (if it exists).
Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
"""
try:
git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
git_hash = git_hash.decode()
except (subprocess.CalledProcessError, FileNotFoundError):
# FileNotFoundError occurs when git not installed on system
git_hash = get_commit_from_path(os.getcwd()) # git hash of repo if exists
return git_hash
def add_env_info(storage: Dict[str, Any]):
try:
pretty_env_info = get_pretty_env_info()
except Exception as err:
pretty_env_info = str(err)
transformers_version = trans_version
upper_dir_commit = get_commit_from_path(
Path(os.getcwd(), "..")
) # git hash of upper repo if exists
added_info = {
"pretty_env_info": pretty_env_info,
"transformers_version": transformers_version,
"upper_git_hash": upper_dir_commit, # in case this repo is submodule
}
storage.update(added_info)
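# Illustrative sketch (not part of the library): combining the helpers above to
# stamp an arbitrary results dict with reproducibility metadata.
def _example_env_metadata() -> Dict[str, Any]:
    results: Dict[str, Any] = {"git_hash": get_git_commit_hash()}
    # adds pretty_env_info, transformers_version and upper_git_hash in place
    add_env_info(results)
    return results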
import copy
import json
import logging
import os
import re
import subprocess
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
from typing import Any, Dict, List, Literal, Tuple
import numpy as np
import pandas as pd
from packaging.version import Version
from torch.utils.collect_env import get_pretty_env_info
from transformers import __version__ as trans_version
from lm_eval.loggers.utils import _handle_non_serializable, remove_none_pattern
logger = logging.getLogger(__name__)
def remove_none_pattern(input_string: str) -> Tuple[str, bool]:
"""Remove the ',none' substring from the input_string if it exists at the end.
Args:
input_string (str): The input string from which to remove the ',none' substring.
Returns:
Tuple[str, bool]: A tuple containing the modified input_string with the ',none' substring removed
and a boolean indicating whether the modification was made (True) or not (False).
"""
# Define the pattern to match ',none' at the end of the string
pattern = re.compile(r",none$")
# Use sub() to replace ',none' with an empty string
result = re.sub(pattern, "", input_string)
# check if the input_string changed
removed = result != input_string
return result, removed
def _handle_non_serializable(o: Any) -> Union[int, str, list]:
"""Handle non-serializable objects by converting them to serializable types.
Args:
o (Any): The object to be handled.
Returns:
Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32,
it will be converted to int. If the object is of type set, it will be converted
to a list. Otherwise, it will be converted to str.
"""
if isinstance(o, np.int64) or isinstance(o, np.int32):
return int(o)
elif isinstance(o, set):
return list(o)
else:
return str(o)
def get_wandb_printer() -> Literal["Printer"]:
...@@ -395,55 +350,3 @@ class WandbLogger:
self._log_samples_as_artifact(eval_preds, task_name)
self.run.log({f"{group}_eval_results": grouped_df})
def get_commit_from_path(repo_path: Path) -> Optional[str]:
git_folder = Path(repo_path, ".git")
if git_folder.is_file():
git_folder = Path(
git_folder.parent,
git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1],
)
if Path(git_folder, "HEAD").exists():
head_name = (
Path(git_folder, "HEAD")
.read_text(encoding="utf-8")
.split("\n")[0]
.split(" ")[-1]
)
head_ref = Path(git_folder, head_name)
git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "")
else:
git_hash = None
return git_hash
def get_git_commit_hash():
"""
Gets the git commit hash of your current repo (if it exists).
Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
"""
try:
git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
git_hash = git_hash.decode()
except (subprocess.CalledProcessError, FileNotFoundError):
# FileNotFoundError occurs when git not installed on system
git_hash = get_commit_from_path(os.getcwd()) # git hash of repo if exists
return git_hash
def add_env_info(storage: Dict[str, Any]):
try:
pretty_env_info = get_pretty_env_info()
except Exception as err:
pretty_env_info = str(err)
transformers_version = trans_version
upper_dir_commit = get_commit_from_path(
Path(os.getcwd(), "..")
) # git hash of upper repo if exists
added_info = {
"pretty_env_info": pretty_env_info,
"transformers_version": transformers_version,
"upper_git_hash": upper_dir_commit, # in case this repo is submodule
}
storage.update(added_info)
...@@ -4,6 +4,8 @@ from . import (
gguf,
huggingface,
mamba_lm,
nemo_lm,
neuralmagic,
neuron_optimum,
openai_completions,
optimum_lm,
......
...@@ -45,7 +45,7 @@ def anthropic_completion(
except ModuleNotFoundError:
raise Exception(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e .[anthropic]`",
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
def _exception_callback(e: Exception, sleep_time: float) -> None:
...@@ -74,6 +74,70 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e
return completion()
def anthropic_chat(
client, #: anthropic.Anthropic,
model: str,
prompt: str,
max_tokens: int,
temperature: float,
stop: List[str],
**kwargs: Any,
) -> str:
"""Wrapper function around the Anthropic completion API client with exponential back-off
in case of RateLimitError.
params:
client: anthropic.Anthropic
Anthropic API client
model: str
Anthropic model e.g. 'claude-3-opus-20240229', 'claude-3-sonnet-20240229'
prompt: str
Prompt to feed to the model
max_tokens: int
Maximum number of tokens to sample from the model
temperature: float
Sampling temperature
stop: List[str]
List of stop sequences
kwargs: Any
Additional model_args to pass to the API client
"""
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
def _exception_callback(e: Exception, sleep_time: float) -> None:
eval_logger.warning(
f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds"
)
@retry_on_specific_exceptions(
on_exceptions=[
anthropic.RateLimitError,
anthropic.APIConnectionError,
anthropic.APIStatusError,
],
max_retries=None, # retry forever, consider changing
on_exception_callback=_exception_callback,
)
def messages():
response = client.messages.create(
model=model,
max_tokens=max_tokens,
temperature=temperature,
messages=[{"role": "user", "content": f"{prompt}"}],
**kwargs,
)
return response.content[0].text
return messages()
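# Illustrative sketch (not part of this diff): calling the helper above directly.
# It needs the `anthropic` package plus ANTHROPIC_API_KEY in the environment and
# performs a live API call; the model name is only an example.
def _example_anthropic_chat() -> str:
    import anthropic

    client = anthropic.Anthropic()  # picks up ANTHROPIC_API_KEY
    return anthropic_chat(
        client=client,
        model="claude-3-sonnet-20240229",
        prompt="Say hello in one word.",
        max_tokens=16,
        temperature=0.0,
        stop=["\n\n"],
    )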
@register_model("anthropic") @register_model("anthropic")
class AnthropicLM(LM): class AnthropicLM(LM):
REQ_CHUNK_SIZE = 20 # TODO: not used REQ_CHUNK_SIZE = 20 # TODO: not used
...@@ -104,7 +168,7 @@ class AnthropicLM(LM): ...@@ -104,7 +168,7 @@ class AnthropicLM(LM):
except ModuleNotFoundError: except ModuleNotFoundError:
raise Exception( raise Exception(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e .[anthropic]`", please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
) )
self.model = model self.model = model
...@@ -153,7 +217,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e ...@@ -153,7 +217,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
except ModuleNotFoundError: except ModuleNotFoundError:
raise Exception( raise Exception(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e .[anthropic]`", please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
) )
if not requests: if not requests:
...@@ -204,3 +268,93 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e ...@@ -204,3 +268,93 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
def loglikelihood_rolling(self, requests, disable_tqdm: bool = False): def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.") raise NotImplementedError("No support for logits.")
@register_model("anthropic-chat", "anthropic-chat-completions")
class AnthropicChatLM(AnthropicLM):
REQ_CHUNK_SIZE = 20 # TODO: not used
def __init__(
self,
model: str,
batch_size: int = 1,
max_tokens: int = 256,
temperature: float = 0, # defaults to 1
**kwargs, # top_p, top_k, etc.
) -> None:
"""Anthropic API wrapper.
:param model: str
Anthropic model e.g. 'claude-3-opus-20240229', 'claude-3-sonnet-20240229'
:param max_tokens: int
Maximum number of tokens to sample from the model
:param temperature: float
Sampling temperature
:param kwargs: Any
Additional model_args to pass to the API client
"""
super().__init__()
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
self.model = model
# defaults to os.environ.get("ANTHROPIC_API_KEY")
self.client = anthropic.Anthropic()
self.temperature = temperature
self.max_tokens = max_tokens
self.tokenizer = self.client.get_tokenizer()
self.kwargs = kwargs
@property
def max_gen_toks(self) -> int:
return self.max_tokens
def generate_until(self, requests) -> List[str]:
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
if not requests:
return []
_requests: List[Tuple[str, dict]] = [req.args for req in requests]
res = []
for request in tqdm(_requests):
try:
inp = request[0]
request_args = request[1]
# generation_kwargs
until = request_args.get("until")
max_tokens = request_args.get("max_gen_toks", self.max_length)
temperature = request_args.get("temperature", self.temperature)
response = anthropic_chat(
client=self.client,
model=self.model,
prompt=inp,
max_tokens=max_tokens,
temperature=temperature, # TODO: implement non-greedy sampling for Anthropic
stop=until, # type: ignore
**self.kwargs,
)
res.append(response)
self.cache_hook.add_partial("generate_until", request, response)
except anthropic.APIConnectionError as e: # type: ignore # noqa: F821
eval_logger.critical(f"Server unreachable: {e.__cause__}")
break
except anthropic.APIStatusError as e: # type: ignore # noqa: F821
eval_logger.critical(f"API error {e.status_code}: {e.message}")
break
return res
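# Illustrative sketch (not part of this diff): constructing the chat wrapper
# directly. In the harness it is normally selected via the registered names
# "anthropic-chat" / "anthropic-chat-completions"; the kwargs are examples only.
def _example_anthropic_chat_lm() -> "AnthropicChatLM":
    return AnthropicChatLM(
        model="claude-3-opus-20240229",
        max_tokens=256,
        temperature=0,
    )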
...@@ -13,6 +13,7 @@ from accelerate import ( ...@@ -13,6 +13,7 @@ from accelerate import (
InitProcessGroupKwargs, InitProcessGroupKwargs,
find_executable_batch_size, find_executable_batch_size,
) )
from huggingface_hub import HfApi
from packaging import version from packaging import version
from peft import PeftModel from peft import PeftModel
from peft import __version__ as PEFT_VERSION from peft import __version__ as PEFT_VERSION
...@@ -43,13 +44,13 @@ def _get_accelerate_args( ...@@ -43,13 +44,13 @@ def _get_accelerate_args(
max_memory_per_gpu: Optional[Union[int, str]] = None, max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None, max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload", offload_folder: Optional[str] = "./offload",
gpus: Optional[int] = None,
) -> dict: ) -> dict:
"""Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`.""" """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
max_memory = {} max_memory = {}
if max_memory_per_gpu is not None: if max_memory_per_gpu is not None:
max_memory_per_gpu_map = {
device_idx: max_memory_per_gpu
for device_idx in range(torch.cuda.device_count())
}
max_memory_per_gpu_map = {
device_idx: max_memory_per_gpu for device_idx in range(gpus)
}
max_memory.update(max_memory_per_gpu_map)
if max_cpu_memory is not None: if max_cpu_memory is not None:
...@@ -77,7 +78,7 @@ class HFLM(TemplateLM): ...@@ -77,7 +78,7 @@ class HFLM(TemplateLM):
def __init__( def __init__(
self, self,
pretrained: Optional[Union[str, transformers.PreTrainedModel]] = "gpt2",
pretrained: Union[str, transformers.PreTrainedModel],
backend: Optional[Literal["default", "causal", "seq2seq"]] = "default", backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
# override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq) # override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
revision: Optional[str] = "main", revision: Optional[str] = "main",
...@@ -99,6 +100,7 @@ class HFLM(TemplateLM): ...@@ -99,6 +100,7 @@ class HFLM(TemplateLM):
trust_remote_code: Optional[bool] = False, trust_remote_code: Optional[bool] = False,
use_fast_tokenizer: Optional[bool] = True, use_fast_tokenizer: Optional[bool] = True,
add_bos_token: Optional[bool] = False, add_bos_token: Optional[bool] = False,
prefix_token_id: Optional[int] = None,
# arguments used for splitting a model across GPUs naively. # arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`. # only used if `parallelize=True`.
parallelize: Optional[bool] = False, parallelize: Optional[bool] = False,
...@@ -106,8 +108,9 @@ class HFLM(TemplateLM): ...@@ -106,8 +108,9 @@ class HFLM(TemplateLM):
max_memory_per_gpu: Optional[Union[int, str]] = None, max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None, max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[Union[str, os.PathLike]] = "./offload", offload_folder: Optional[Union[str, os.PathLike]] = "./offload",
# PEFT and quantization options
# PEFT, delta weights and quantization options
peft: Optional[str] = None, peft: Optional[str] = None,
delta: Optional[str] = None,
autogptq: Optional[Union[bool, str]] = False, autogptq: Optional[Union[bool, str]] = False,
**kwargs, **kwargs,
) -> None: ) -> None:
...@@ -154,7 +157,7 @@ class HFLM(TemplateLM): ...@@ -154,7 +157,7 @@ class HFLM(TemplateLM):
# use user-passed device # use user-passed device
device_list = set( device_list = set(
["cuda", "cpu"] ["cuda", "cpu"]
+ [f"cuda:{i}" for i in range(torch.cuda.device_count())] + [f"cuda:{i}" for i in range(gpus)]
+ ["mps", "mps:0"] + ["mps", "mps:0"]
) )
if device and device in device_list: if device and device in device_list:
...@@ -196,6 +199,15 @@ class HFLM(TemplateLM): ...@@ -196,6 +199,15 @@ class HFLM(TemplateLM):
config=self.config, backend=backend, trust_remote_code=trust_remote_code config=self.config, backend=backend, trust_remote_code=trust_remote_code
) )
# load tokenizer so we know tokenizer vocabulary size before loading model and PEFT
self._create_tokenizer(
pretrained,
tokenizer,
revision=revision,
trust_remote_code=trust_remote_code,
use_fast_tokenizer=use_fast_tokenizer,
)
# if we passed `pretrained` as a string, initialize our model now # if we passed `pretrained` as a string, initialize our model now
if isinstance(pretrained, str): if isinstance(pretrained, str):
self._create_model( self._create_model(
...@@ -204,11 +216,13 @@ class HFLM(TemplateLM): ...@@ -204,11 +216,13 @@ class HFLM(TemplateLM):
dtype=dtype, dtype=dtype,
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
parallelize=parallelize, parallelize=parallelize,
gpus=gpus,
device_map_option=device_map_option, device_map_option=device_map_option,
max_memory_per_gpu=max_memory_per_gpu, max_memory_per_gpu=max_memory_per_gpu,
max_cpu_memory=max_cpu_memory, max_cpu_memory=max_cpu_memory,
offload_folder=offload_folder, offload_folder=offload_folder,
peft=peft, peft=peft,
delta=delta,
autogptq=autogptq, autogptq=autogptq,
**kwargs, **kwargs,
) )
...@@ -231,14 +245,6 @@ class HFLM(TemplateLM): ...@@ -231,14 +245,6 @@ class HFLM(TemplateLM):
"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore." "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
) )
self._create_tokenizer(
pretrained,
tokenizer,
revision=revision,
trust_remote_code=trust_remote_code,
use_fast_tokenizer=use_fast_tokenizer,
)
self.truncation = truncation self.truncation = truncation
self.logits_cache = logits_cache self.logits_cache = logits_cache
self.vocab_size = self.tokenizer.vocab_size self.vocab_size = self.tokenizer.vocab_size
...@@ -275,7 +281,10 @@ class HFLM(TemplateLM): ...@@ -275,7 +281,10 @@ class HFLM(TemplateLM):
) )
self._max_length = max_length self._max_length = max_length
self.pretrained = pretrained
self.delta = delta
self.peft = peft
self.revision = revision
self.batch_schedule = 1 self.batch_schedule = 1
self.batch_sizes = {} self.batch_sizes = {}
self.max_batch_size = max_batch_size self.max_batch_size = max_batch_size
...@@ -322,9 +331,7 @@ class HFLM(TemplateLM): ...@@ -322,9 +331,7 @@ class HFLM(TemplateLM):
self._model = accelerator.prepare_model( self._model = accelerator.prepare_model(
self.model, evaluation_mode=True self.model, evaluation_mode=True
) )
self._device = torch.device(
f"cuda:{accelerator.local_process_index}"
)
self._device = torch.device(f"{accelerator.device}")
self.accelerator = accelerator self.accelerator = accelerator
if self.accelerator.is_local_main_process: if self.accelerator.is_local_main_process:
...@@ -340,6 +347,12 @@ class HFLM(TemplateLM): ...@@ -340,6 +347,12 @@ class HFLM(TemplateLM):
self._rank = 0 self._rank = 0
self._world_size = 1 self._world_size = 1
self.custom_prefix_token_id = prefix_token_id
if prefix_token_id is not None:
eval_logger.info(
f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
)
@property @property
def config(self): def config(self):
# return the associated transformers.AutoConfig for the given pretrained model. # return the associated transformers.AutoConfig for the given pretrained model.
...@@ -358,6 +371,15 @@ class HFLM(TemplateLM): ...@@ -358,6 +371,15 @@ class HFLM(TemplateLM):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id return self.tokenizer.eos_token_id
@property
def prefix_token_id(self):
# it is used as prefix for loglikelihood
if self.custom_prefix_token_id is not None:
return self.custom_prefix_token_id
if self.tokenizer.bos_token_id is not None:
return self.tokenizer.bos_token_id
return self.tokenizer.eos_token_id
@property @property
def max_length(self): def max_length(self):
if self._max_length: # if max length manually set, return it if self._max_length: # if max length manually set, return it
...@@ -466,12 +488,14 @@ class HFLM(TemplateLM): ...@@ -466,12 +488,14 @@ class HFLM(TemplateLM):
# only used if `parallelize=True`. # only used if `parallelize=True`.
# (accelerate naive PP (device_map) options) # (accelerate naive PP (device_map) options)
parallelize: Optional[bool] = False, parallelize: Optional[bool] = False,
gpus: Optional[int] = None,
device_map_option: Optional[str] = "auto", device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None, max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None, max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload", offload_folder: Optional[str] = "./offload",
# PEFT and quantization options
# PEFT, delta weights and quantization options
peft: Optional[str] = None, peft: Optional[str] = None,
delta: Optional[str] = None,
autogptq: Optional[Union[bool, str]] = False, autogptq: Optional[Union[bool, str]] = False,
**kwargs, **kwargs,
) -> None: ) -> None:
...@@ -496,6 +520,7 @@ class HFLM(TemplateLM): ...@@ -496,6 +520,7 @@ class HFLM(TemplateLM):
max_memory_per_gpu, max_memory_per_gpu,
max_cpu_memory, max_cpu_memory,
offload_folder, offload_folder,
gpus,
) )
) )
elif "device_map" not in model_kwargs: elif "device_map" not in model_kwargs:
...@@ -504,9 +529,7 @@ class HFLM(TemplateLM): ...@@ -504,9 +529,7 @@ class HFLM(TemplateLM):
# for quantized models now seems to be device_map="auto" # for quantized models now seems to be device_map="auto"
# which breaks data-parallel mode. # which breaks data-parallel mode.
if hasattr(self, "accelerator"): if hasattr(self, "accelerator"):
model_kwargs.update(
{"device_map": {"": f"cuda:{self.accelerator.local_process_index}"}}
)
model_kwargs.update({"device_map": {"": f"{self.accelerator.device}"}})
else: else:
model_kwargs.update({"device_map": {"": str(self.device)}}) model_kwargs.update({"device_map": {"": str(self.device)}})
...@@ -547,12 +570,47 @@ class HFLM(TemplateLM): ...@@ -547,12 +570,47 @@ class HFLM(TemplateLM):
**model_kwargs, **model_kwargs,
) )
if peft and delta:
raise ValueError(
"Cannot use both 'peft' and 'delta' options at the same time."
)
if peft: if peft:
if model_kwargs.get("load_in_4bit", None): if model_kwargs.get("load_in_4bit", None):
assert PEFT_VERSION >= "0.4.0", "load_in_4bit requires peft >= 0.4.0"
if version.parse(PEFT_VERSION) < version.parse("0.4.0"):
raise AssertionError("load_in_4bit requires peft >= 0.4.0")
if self._model.config.vocab_size != len(self.tokenizer):
# resize model for LoRAs with added tokens
self._model.resize_token_embeddings(len(self.tokenizer))
eval_logger.info(
f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
)
self._model = PeftModel.from_pretrained( self._model = PeftModel.from_pretrained(
self._model, peft, revision=revision self._model, peft, revision=revision
) )
elif delta:
if autogptq:
eval_logger.warning(
"Delta weights might trigger unexpected behavior when used with AutoGPTQ."
)
_model_delta = self.AUTO_MODEL_CLASS.from_pretrained(
delta,
revision=revision,
torch_dtype=get_dtype(dtype),
trust_remote_code=trust_remote_code,
**model_kwargs,
)
for name, param in self._model.state_dict().items():
try:
param.data += _model_delta.state_dict()[name]
except KeyError:
raise KeyError(f"Delta model is missing weights for layer: {name}")
except Exception as e:
raise RuntimeError(
f"Failed to add delta weights to layer {name}. Error: {e}"
)
del _model_delta
return None return None
...@@ -615,6 +673,8 @@ class HFLM(TemplateLM): ...@@ -615,6 +673,8 @@ class HFLM(TemplateLM):
max_cont_enc = len(continuation_enc[-(self.max_length + 1) :]) max_cont_enc = len(continuation_enc[-(self.max_length + 1) :])
else: else:
max_length = self.max_length max_length = self.max_length
max_context_enc = max_length
max_cont_enc = max_length
# if OOM, then halves batch_size and tries again # if OOM, then halves batch_size and tries again
@find_executable_batch_size(starting_batch_size=self.max_batch_size) @find_executable_batch_size(starting_batch_size=self.max_batch_size)
...@@ -664,14 +724,21 @@ class HFLM(TemplateLM): ...@@ -664,14 +724,21 @@ class HFLM(TemplateLM):
self, string: str, left_truncate_len=None, add_special_tokens=None
) -> List[int]:
""" """
if add_special_tokens is None:
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
add_special_tokens = False or self.add_bos_token
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
# TODO: investigate best practices for enc-dec models + special tokens
add_special_tokens = True
encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens)
# default for None - empty dict, use predefined tokenizer param
# used for all models except for CausalLM or predefined value
special_tokens_kwargs = {}
# by default for CausalLM - false or self.add_bos_token is set
if add_special_tokens is None:
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
special_tokens_kwargs = {
"add_special_tokens": False or self.add_bos_token
}
# otherwise the method explicitly defines the value
else:
special_tokens_kwargs = {"add_special_tokens": add_special_tokens}
encoding = self.tokenizer.encode(string, **special_tokens_kwargs)
# left-truncate the encoded context to be at most `left_truncate_len` tokens long # left-truncate the encoded context to be at most `left_truncate_len` tokens long
if left_truncate_len: if left_truncate_len:
...@@ -690,17 +757,16 @@ class HFLM(TemplateLM): ...@@ -690,17 +757,16 @@ class HFLM(TemplateLM):
old_padding_side = self.tokenizer.padding_side old_padding_side = self.tokenizer.padding_side
self.tokenizer.padding_side = padding_side self.tokenizer.padding_side = padding_side
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
add_special_tokens = False or self.add_bos_token
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
add_special_tokens = True
encoding = self.tokenizer(
strings,
truncation=truncation,
padding="longest",
return_tensors="pt",
add_special_tokens=add_special_tokens,
)
add_special_tokens = {}
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
encoding = self.tokenizer(
strings,
truncation=truncation,
padding="longest",
return_tensors="pt",
**add_special_tokens,
)
if left_truncate_len: if left_truncate_len:
encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:] encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
...@@ -711,11 +777,8 @@ class HFLM(TemplateLM): ...@@ -711,11 +777,8 @@ class HFLM(TemplateLM):
return encoding["input_ids"], encoding["attention_mask"] return encoding["input_ids"], encoding["attention_mask"]
def tok_decode(self, tokens):
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
return self.tokenizer.decode(tokens)
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
return self.tokenizer.decode(tokens, skip_special_tokens=True)
def tok_decode(self, tokens, skip_special_tokens=True):
return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
def _model_call(self, inps, attn_mask=None, labels=None): def _model_call(self, inps, attn_mask=None, labels=None):
""" """
...@@ -811,7 +874,7 @@ class HFLM(TemplateLM): ...@@ -811,7 +874,7 @@ class HFLM(TemplateLM):
utils.make_disjoint_window, utils.make_disjoint_window,
utils.get_rolling_token_windows( utils.get_rolling_token_windows(
token_list=self.tok_encode(string), token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
prefix_token=self.prefix_token_id,
max_seq_len=self.max_length, max_seq_len=self.max_length,
context_len=1, context_len=1,
), ),
...@@ -1148,7 +1211,7 @@ class HFLM(TemplateLM): ...@@ -1148,7 +1211,7 @@ class HFLM(TemplateLM):
if "until" in kwargs.keys(): if "until" in kwargs.keys():
until = kwargs.pop("until") until = kwargs.pop("until")
if isinstance(until, str): if isinstance(until, str):
until = [kwargs]
until = [until]
elif not isinstance(until, list): elif not isinstance(until, list):
raise ValueError( raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
...@@ -1158,7 +1221,7 @@ class HFLM(TemplateLM): ...@@ -1158,7 +1221,7 @@ class HFLM(TemplateLM):
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}" f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
) )
# add EOS token to stop sequences # add EOS token to stop sequences
eos = self.tok_decode(self.eot_token_id)
eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
if not until: if not until:
until = [eos] until = [eos]
else: else:
...@@ -1221,3 +1284,44 @@ class HFLM(TemplateLM): ...@@ -1221,3 +1284,44 @@ class HFLM(TemplateLM):
pbar.close() pbar.close()
return res return res
def get_model_info(self) -> dict:
"""
Method to get Hugging Face model information for experiment reproducibility.
"""
def get_model_num_params(model) -> int:
if hasattr(model, "num_parameters"):
return model.num_parameters()
if hasattr(model, "parameters"):
return sum(p.numel() for p in model.parameters())
else:
return -1
def get_model_dtype(model) -> str:
if hasattr(model, "dtype"):
return model.dtype
else:
return ""
def get_model_sha(pretrained: str, revision: str) -> str:
try:
model_info = HfApi().model_info(repo_id=pretrained, revision=revision)
return model_info.sha
except Exception as e:
eval_logger.warning(
f"Failed to get model SHA for {pretrained} at revision {revision}. Error: {e}"
)
return ""
model_info = {
"model_num_parameters": get_model_num_params(self._model),
"model_dtype": get_model_dtype(self._model),
"model_revision": self.revision,
"model_sha": get_model_sha(self.pretrained, self.revision),
}
if self.peft:
model_info["peft_sha"] = get_model_sha(self.peft, self.revision)
if self.delta:
model_info["delta_sha"] = get_model_sha(self.delta, self.revision)
return model_info
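# Illustrative sketch (not part of this diff): the new `delta` and
# `prefix_token_id` options are plain constructor kwargs, and get_model_info()
# summarises what was loaded. The model name is an example; running this will
# actually download and load the model.
def _example_hflm_model_info() -> dict:
    lm = HFLM(
        pretrained="EleutherAI/pythia-70m",
        delta=None,            # e.g. a repo with delta weights to add onto the base model
        prefix_token_id=None,  # None -> BOS if available, else EOS (see prefix_token_id)
    )
    return lm.get_model_info()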
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import pathlib
from copy import deepcopy
from typing import List, Literal
import filelock
import numpy as np
import torch
from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator
from lm_eval.utils import (
eval_logger,
get_rolling_token_windows,
make_disjoint_window,
simple_parse_args_string,
)
def _patch_pretrained_cfg(
pretrained_cfg, trainer, tensor_model_parallel_size, pipeline_model_parallel_size
):
try:
import omegaconf
except ModuleNotFoundError:
raise Exception(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.",
)
omegaconf.OmegaConf.set_struct(pretrained_cfg, True)
with omegaconf.open_dict(pretrained_cfg):
attributes_to_update = {
"sequence_parallel": False,
"activations_checkpoint_granularity": None,
"activations_checkpoint_method": None,
"precision": trainer.precision,
"global_batch_size": None,
"tensor_model_parallel_size": tensor_model_parallel_size,
"pipeline_model_parallel_size": pipeline_model_parallel_size,
"apply_rope_fusion": False,
}
for name, value in attributes_to_update.items():
if hasattr(pretrained_cfg, name):
pretrained_cfg[name] = value
return pretrained_cfg
def _get_target_from_class(target_class) -> str:
return f"{target_class.__module__}.{target_class.__name__}"
def load_model(
model_path: str,
trainer,
tensor_model_parallel_size: int,
pipeline_model_parallel_size: int,
) -> torch.nn.Module:
try:
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import (
MegatronGPTModel,
)
from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector
except ModuleNotFoundError:
raise Exception(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.",
)
model_path = pathlib.Path(model_path)
save_restore_connector = NLPSaveRestoreConnector()
if model_path.is_dir():
save_restore_connector.model_extracted_dir = model_path.as_posix()
pretrained_cfg = save_restore_connector.restore_from(
None, model_path.as_posix(), return_config=True, trainer=trainer
)
if not hasattr(pretrained_cfg, "target"):
pretrained_cfg["target"] = _get_target_from_class(MegatronGPTModel)
pretrained_cfg = _patch_pretrained_cfg(
pretrained_cfg,
trainer,
tensor_model_parallel_size=tensor_model_parallel_size,
pipeline_model_parallel_size=pipeline_model_parallel_size,
)
model_to_load_path = model_path
override_config = pretrained_cfg
module_name, class_name = override_config.target.rsplit(".", 1)
model_class = getattr(importlib.import_module(module_name), class_name)
# monkeypatch _build_tokenizer method to be process-safe
tokenizer_lock = filelock.FileLock(f"/tmp/{model_path.name}.tokenizer.lock")
def _synced_build_tokenizer(self):
with tokenizer_lock:
self._original_build_tokenizer()
model_class._original_build_tokenizer = model_class._build_tokenizer
model_class._build_tokenizer = _synced_build_tokenizer
model = model_class.restore_from(
restore_path=model_to_load_path.as_posix(),
trainer=trainer,
override_config_path=override_config,
save_restore_connector=save_restore_connector,
map_location=f"cuda:{trainer.local_rank}",
)
model.freeze()
model.training = False
try:
# Have to turn off activations_checkpoint_method for inference
model.model.language_model.encoder.activations_checkpoint_method = None
except AttributeError:
pass
return model
def setup_distributed_environment(trainer):
try:
from nemo.utils.app_state import AppState
except ModuleNotFoundError:
raise Exception(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.",
)
def dummy():
return
if trainer.strategy.launcher is not None:
trainer.strategy.launcher.launch(dummy, trainer=trainer)
trainer.strategy.setup_environment()
app_state = AppState()
return app_state
@register_model("nemo_lm")
class NeMoLM(LM):
def __init__(
self,
path: str,
max_length: int = 4096,
batch_size: int = 1,
max_gen_toks: int = 256,
devices: int = 1,
num_nodes: int = 1,
tensor_model_parallel_size: int = 1,
pipeline_model_parallel_size: int = 1,
precision: Literal[
"16-mixed",
"bf16-mixed",
"32-true",
"64-true",
64,
32,
16,
"64",
"32",
"16",
"bf16",
] = "bf16",
**kwargs,
):
try:
from nemo.collections.nlp.modules.common.text_generation_utils import (
generate,
)
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
from pytorch_lightning.trainer.trainer import Trainer
self.generate = generate
except ModuleNotFoundError:
raise Exception(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.",
)
super().__init__()
if (
tensor_model_parallel_size == 1
and pipeline_model_parallel_size == 1
and devices > 1
):
eval_logger.info(
f"The number of data replicas for evaluation is {devices}."
)
eval_logger.info(f"The total number of devices is {devices}.")
eval_logger.info(
"No tensor parallelism or pipeline parallelism is applied."
)
elif tensor_model_parallel_size * pipeline_model_parallel_size == devices:
eval_logger.info(
f"Setting tensor parallelism to {tensor_model_parallel_size} and pipeline parallelism to {pipeline_model_parallel_size}."
)
eval_logger.info(f"The total number of devices is {devices}.")
eval_logger.info("No data parallelism is applied.")
else:
raise ValueError(
"Please set the product of tensor_model_parallel_size and pipeline_model_parallel_size"
"equal to the specified number of devices."
)
if num_nodes > 1:
raise ValueError(
"A number of nodes greater than 1 is not supported yet. Please set num_nodes as 1."
)
trainer = Trainer(
strategy=NLPDDPStrategy(),
devices=devices,
accelerator="gpu",
num_nodes=num_nodes,
precision=precision,
logger=False,
enable_checkpointing=False,
use_distributed_sampler=False,
)
# Modify the following flags only for data replication
if (
tensor_model_parallel_size == 1
and pipeline_model_parallel_size == 1
and devices > 1
):
self._device = torch.device(f"cuda:{trainer.global_rank}")
self._rank = trainer.global_rank
self._world_size = trainer.world_size
self.model = load_model(
path,
trainer,
tensor_model_parallel_size=tensor_model_parallel_size,
pipeline_model_parallel_size=pipeline_model_parallel_size,
).cuda()
self.tokenizer = self.model.tokenizer
self.app_state = setup_distributed_environment(trainer)
self._max_length = max_length
self._batch_size = int(batch_size)
self._max_gen_toks = max_gen_toks
@classmethod
def create_from_arg_string(cls, arg_string, additional_config=None):
args = simple_parse_args_string(arg_string)
if additional_config:
args["batch_size"] = additional_config.get("batch_size", 1)
return cls(**args)
@property
def eot_token_id(self):
try:
return self.tokenizer.eos_id
except AttributeError:
return None
@property
def max_length(self):
return self._max_length
@property
def max_gen_toks(self):
return self._max_gen_toks
@property
def batch_size(self):
return self._batch_size
@property
def device(self):
return self._device
@property
def rank(self):
return self._rank
@property
def world_size(self):
return self._world_size
@property
def accelerator(self):
return self._Accelerator(self.world_size)
class _Accelerator:
def __init__(self, world_size):
self.world_size = world_size
def wait_for_everyone(self):
torch.distributed.barrier()
def gather(self, local_tensor):
gathered_tensors = [
torch.zeros(1, dtype=local_tensor.dtype).cuda()
for _ in range(self.world_size)
]
torch.distributed.all_gather(gathered_tensors, local_tensor)
return torch.cat(gathered_tensors)
def tok_encode(self, string: str):
return self.tokenizer.text_to_ids(string)
def tok_decode(self, tokens):
return self.tokenizer.ids_to_text(tokens)
def _encode_pair(self, context, continuation):
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in [req.args for req in requests]:
if context == "":
# end of text as context
context_enc, continuation_enc = (
[self.eot_token_id],
self.tok_encode(continuation),
)
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[float]:
loglikelihoods = []
for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
rolling_token_windows = list(
map(
make_disjoint_window,
get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length - 1,
context_len=1,
),
)
)
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
string_nll = self._loglikelihood_tokens(
rolling_token_windows,
)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
res = []
def _collate(x):
toks = x[1] + x[2]
return -len(toks), tuple(toks)
re_ord = Collator(requests, sort_fn=_collate)
chunks = re_ord.get_batched(n=self.batch_size, batch_fn=None)
pbar = tqdm(
total=len(requests),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running loglikelihood requests",
)
for chunk in chunks:
inps = []
ctxlens = []
contlens = []
for _, context_enc, continuation_enc in chunk:
# Leave one token for generation. Tokens_to_generate = 0 breaks NeMo.
inp = (context_enc + continuation_enc)[-(self.max_length - 1) :]
ctxlen = len(context_enc) - max(
0, len(context_enc) + len(continuation_enc) - (self.max_length - 1)
)
ctxlens.append(ctxlen)
contlens.append(len(continuation_enc))
inps.append(self.tok_decode(inp))
output = self.generate(
self.model,
inputs=inps,
tokens_to_generate=1,
min_tokens_to_generate=1,
compute_logprob=True,
all_probs=True,
)
batch_token_ids = np.asarray(output["token_ids"])[:, :-1]
batch_logprobs = output["logprob"][:, :-1]
batch_full_logprob = output["full_logprob"][:, :-1, :]
# Compute greedy tokens for entire batch rather than calling it with proper ctxlen for each sample.
# Additional tokens for each sample will be trimmed later.
min_ctxlen = min(ctxlens)
# Use min_ctxlen-1 instead of min_ctxlen since full_logprobs are not returned for the first token.
batch_greedy_tokens = (
torch.argmax(batch_full_logprob[:, min_ctxlen - 1 :, :], -1)
.cpu()
.numpy()
)
for token_ids, greedy_tokens, logprobs, ctxlen, contlen, (
cache_key,
_,
_,
) in zip(
batch_token_ids,
batch_greedy_tokens,
batch_logprobs,
ctxlens,
contlens,
chunk,
):
# Trim at contlen since shorter contexts in a batch will have more than one token generated.
# Use ctxlen-1 instead of ctxlen same as for full_logprob in batch_greedy_tokens calculation
logprobs = (logprobs[ctxlen - 1 :])[:contlen]
logprob = sum(logprobs).tolist()
continuation_tokens = (token_ids[ctxlen:])[:contlen]
len_diff = ctxlen - min_ctxlen
is_greedy = continuation_tokens == (greedy_tokens[len_diff:])[:contlen]
if not isinstance(is_greedy, bool):
is_greedy = is_greedy.all()
answer = (logprob, is_greedy)
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
pbar.update(1)
pbar.close()
return re_ord.get_original(res)
def generate_until(self, requests):
if not requests:
return []
res = []
def get_until(req_args):
until = req_args.get("until", [])
until = deepcopy(until) # prevent from modifying req_args for cache_key
if self.tokenizer.ids_to_tokens([self.eot_token_id])[0] not in until:
until.append(self.tokenizer.ids_to_tokens([self.eot_token_id])[0])
return until
def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]
re_ords = Collator(
[reg.args for reg in requests], sort_fn=_collate, group_by="gen_kwargs"
)
chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
for chunk in chunks:
contexts, all_gen_kwargs = zip(*chunk)
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
req_args = all_gen_kwargs[0]
# unpack our keyword arguments.
until = get_until(req_args)
max_gen_toks = req_args.get("max_gen_toks", self.max_gen_toks)
remaining_length = self.max_length - max_gen_toks
contexts = []
for context, _ in chunk:
encoded_context = self.tok_encode(context)
encoded_context = encoded_context[-remaining_length:]
contexts.append(self.tok_decode(encoded_context))
output = self.generate(
self.model,
inputs=contexts,
tokens_to_generate=max_gen_toks,
end_strings=until,
greedy=True,
)
answers = output["sentences"]
continuations = []
for context, answer in zip(contexts, answers):
continuations.append(answer[len(context) :])
for term in until:
continuations = [answer.split(term)[0] for answer in continuations]
for request, answer in zip(chunk, continuations):
self.cache_hook.add_partial("greedy_until", request, answer)
res.append(answer)
return re_ords.get_original(res)
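# Illustrative sketch (not part of the library): constructing the NeMo wrapper
# directly. The checkpoint path is a hypothetical placeholder; as enforced in
# __init__, tensor_model_parallel_size * pipeline_model_parallel_size must equal
# `devices` unless both are 1 (pure data replication).
def _example_nemo_lm() -> "NeMoLM":
    return NeMoLM(
        path="/checkpoints/megatron_gpt.nemo",
        devices=1,
        tensor_model_parallel_size=1,
        pipeline_model_parallel_size=1,
        precision="bf16",
    )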
import copy
from typing import List, Optional, Tuple, Union
import numpy
import transformers
from tqdm import tqdm
import lm_eval.models.utils
from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
eval_logger = utils.eval_logger
@register_model("sparseml")
class SparseMLLM(HFLM):
"""
SparseML is an open-source model optimization toolkit that enables you to create
inference-optimized sparse models using pruning, quantization, and distillation
algorithms. Models optimized with SparseML can then be exported to the ONNX format and
deployed with DeepSparse for GPU-class performance on CPU hardware.
This class is a wrapper around the HuggingFace LM class to enable SparseML
integration with the lm-evaluation-harness.
"""
def _create_model(
self,
pretrained: str,
revision: Optional[str] = "main",
dtype: Optional[str] = "auto",
trust_remote_code: Optional[bool] = False,
**kwargs,
) -> None:
try:
from sparseml.transformers import SparseAutoModelForCausalLM
except ModuleNotFoundError:
raise Exception(
"Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`"
)
model_kwargs = kwargs if kwargs else {}
if "device_map" not in model_kwargs:
# set a device_map to initialize model on the right GPU.
# this is needed because it seems that the default behavior
# for quantized models now seems to be device_map="auto"
# which breaks data-parallel mode.
if hasattr(self, "accelerator"):
model_kwargs.update(
{"device_map": {"": f"cuda:{self.accelerator.local_process_index}"}}
)
else:
model_kwargs.update({"device_map": {"": str(self.device)}})
relevant_kwarg_names = [
"offload_folder",
"device_map",
]
relevant_kwargs = {
k: v for k, v in model_kwargs.items() if k in relevant_kwarg_names
}
# Log the difference between model_kwargs and relevant_kwargs so we can see
# what is being ignored
ignored_kwargs = {}
for k, v in model_kwargs.items():
if k not in relevant_kwargs.keys():
ignored_kwargs[k] = v
        eval_logger.warning(
            f"The sparseml integration is ignoring the following specified kwargs: {ignored_kwargs}"
        )
model = SparseAutoModelForCausalLM.from_pretrained(
pretrained,
revision=revision,
torch_dtype=lm_eval.models.utils.get_dtype(dtype),
trust_remote_code=trust_remote_code,
**relevant_kwargs,
)
self._model = model
def _get_config(self, pretrained: str, **kwargs) -> None:
try:
from sparseml.transformers import SparseAutoConfig
except ModuleNotFoundError:
raise Exception(
"Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`"
)
self._config = SparseAutoConfig.from_pretrained(
pretrained_model_name_or_path=pretrained, **kwargs
)
def _create_tokenizer(
self,
pretrained: Union[str, transformers.PreTrainedModel],
tokenizer: Optional[
Union[
str,
transformers.PreTrainedTokenizer,
transformers.PreTrainedTokenizerFast,
]
],
**kwargs,
) -> None:
try:
from sparseml.transformers import SparseAutoTokenizer
except ModuleNotFoundError:
raise Exception(
"Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`"
)
if tokenizer:
if isinstance(tokenizer, str):
self.tokenizer = SparseAutoTokenizer.from_pretrained(
tokenizer,
**kwargs,
)
else:
assert isinstance(
tokenizer, transformers.PreTrainedTokenizer
) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
self.tokenizer = tokenizer
else:
# Get tokenizer based on 'pretrained'
if isinstance(pretrained, str):
model_name = pretrained
else:
# get the HF hub name via accessor on model
model_name = self.model.name_or_path
self.tokenizer = SparseAutoTokenizer.from_pretrained(
model_name,
**kwargs,
)
return None
@register_model("deepsparse")
class DeepSparseLM(LM):
"""
Wrapper around DeepSparse, a sparsity-aware deep learning
inference runtime for CPUs, to make it compatible with the
lm-evaluation-harness.
"""
_DEFAULT_MAX_LENGTH = 2048
def __init__(
self,
pretrained: str,
tokenizer: Optional[
Union[
str,
transformers.PreTrainedTokenizer,
transformers.PreTrainedTokenizerFast,
]
] = None,
batch_size: Optional[Union[int, str]] = 1,
max_gen_toks: Optional[int] = 256,
max_length: Optional[int] = None,
):
super().__init__()
try:
import deepsparse
except ModuleNotFoundError:
raise Exception(
"Package `deepsparse` is not installed. "
"Please install it via `pip install deepsparse[transformers]`"
)
if isinstance(batch_size, str) and not batch_size.isdigit():
eval_logger.warning(
f"batch_size={batch_size} is not valid for deepsparse because it is not an integer. "
"Ignoring and using the default of 1."
)
batch_size = 1
self.batch_size = int(batch_size)
self._max_length = max_length if max_length else self._DEFAULT_MAX_LENGTH
self._max_gen_toks = max_gen_toks
self.batch_sizes = {}
# Initialize new model and tokenizer instances
self.model = deepsparse.TextGeneration(
model_path=pretrained,
sequence_length=self._max_length,
batch_size=batch_size,
)
self.tokenizer = tokenizer if tokenizer else self.model.tokenizer
self.config = self.model.config
def tok_encode(self, string: str) -> List[int]:
return self.tokenizer.encode(string)
def tok_decode(self, tokens: List[int]) -> str:
return self.tokenizer.decode(tokens)
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def prefix_token_id(self):
# it is used as prefix for loglikelihood
if self.tokenizer.bos_token_id is not None:
return self.tokenizer.bos_token_id
return self.tokenizer.eos_token_id
@property
def max_length(self) -> int:
return self._max_length
@property
def max_gen_toks(self) -> int:
return self._max_gen_toks
def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
"""
Copied directly from
https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py
"""
new_reqs = []
for context, continuation in [req.args for req in requests]:
if context == "":
                raise NotImplementedError(
                    "Empty context is not supported yet"
                )
context_enc, continuation_enc = self._encode_pair(context, continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def _loglikelihood_tokens(
self,
requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
disable_tqdm: bool = False,
) -> List[Tuple[float, bool]]:
"""
The function to compute the loglikelihood of the continuation
tokens given the context tokens.
This function is an adapted version of the original function from
https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py
"""
res = []
def _collate(x):
"""Defines the key for the sorted method"""
toks = x[1] + x[2]
return -len(toks), tuple(toks)
re_ord = utils.Reorderer(requests, _collate)
for chunk in tqdm(
list(lm_eval.models.utils.chunks(re_ord.get_reordered(), self.batch_size)),
disable=disable_tqdm,
):
batch_inp = []
batch_cache_key = []
batch_continuation_enc = []
# len(chunk) is the batch_size
for cache_key, context_enc, continuation_enc in chunk:
                # how this all works (illustrated on a causal decoder-only setup):
                #          CTX      CONT
                # inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by inp[:, :-1]
                # model  \               \
                # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
                # cont_toks    4 5 6 7 8 9      [:, -len(continuation_enc):, :self.vocab_size] slice  # noqa: E501
inp = (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1]
batch_inp.append(self.tokenizer.decode(inp))
batch_cache_key.append(cache_key)
batch_continuation_enc.append(continuation_enc)
response = self.model(
prompt=batch_inp,
max_new_tokens=0,
output_scores=True,
include_prompt_logits=True,
)
for resp, continuation_enc, cache_key in zip(
response.generations, batch_continuation_enc, batch_cache_key
):
# (seq_len, vocab_size)
multi_scores = resp.score
from deepsparse.utils.data import numpy_log_softmax
                # (seq_len, vocab_size) with log-softmax applied
multi_logits = numpy_log_softmax(multi_scores, axis=1)
# toss out the context half of the sequence
# (cont_len, vocab_size)
continuation_multi_logits = multi_logits[-len(continuation_enc) :]
# pick out the logits for the continuation tokens
# (cont_len,)
continuation_logits = continuation_multi_logits[
numpy.arange(len(continuation_enc)), continuation_enc
]
                # check if the tokens generated greedily are the same
                # as the expected continuation
greedy_tokens = continuation_multi_logits.argmax(axis=1)
max_equal = greedy_tokens.tolist() == continuation_enc
# Answer: (log prob, is-exact-match)
answer = (float(continuation_logits.sum()), bool(max_equal))
res.append(answer)
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res)
def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
        raise NotImplementedError(
            "This method is not required by any of our current task integrations."
        )
def generate_until(self, requests: List[Instance]) -> List[str]:
"""
The function to generate a certain number of new tokens
given a context.
This function is an adapted version of the original function from
https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py
"""
if not requests:
return []
res = []
requests = [req.args for req in requests]
def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]
re_ord = utils.Reorderer(requests, _collate)
def sameuntil_chunks(xs, size):
ret = []
lastuntil = xs[0][1]
for x in xs:
if len(ret) >= size or x[1] != lastuntil:
yield ret, lastuntil
ret = []
lastuntil = x[1]
ret.append(x)
if ret:
yield ret, lastuntil
pbar = tqdm(total=len(requests))
for chunk, request_args in tqdm(
list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size))
):
inps = []
# make a deepcopy since we are changing arguments
request_args = copy.deepcopy(request_args)
self._max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks)
for context, _ in chunk:
# add context (prompts) to the list
inps.append(context)
until = request_args.pop("until", ["<|endoftext|>"])
request_args.pop("do_sample", None)
request_args["temperature"] = request_args.get("temperature", 0)
# run inference (generate max_gen_toks tokens)
out = self.model(
sequences=inps,
max_new_tokens=self.max_gen_toks - 1,
stop=until,
**request_args,
)
for resp, (context, args_) in zip(out.generations, chunk):
text = resp.text
until_ = until
# split the text at the first occurrence of any of the until tokens
for term in until_:
if len(term) > 0:
text = text.split(term)[0]
res.append(text)
self.cache_hook.add_partial(
"generate_until", (context, {"until": until_}), text
)
pbar.update(1)
pbar.close()
return re_ord.get_original(res)
def _encode_pair(
self, context: str, continuation: str
) -> Tuple[List[int], List[int]]:
"""
Copied directly from
https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py
"""
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc
@@ -305,6 +305,11 @@ class NEURON_HF(TemplateLM):
         # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
         return self.tokenizer.eos_token_id
 
+    @property
+    def prefix_token_id(self):
+        # it is used as prefix for loglikelihood
+        return self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
+
     @property
     def max_length(self):
         if self._max_length:  # if max length manually set, return it
@@ -460,7 +465,7 @@ class NEURON_HF(TemplateLM):
                     utils.make_disjoint_window,
                     utils.get_rolling_token_windows(
                         token_list=self.tok_encode(string),
-                        prefix_token=self.eot_token_id,
+                        prefix_token=self.prefix_token_id,
                         max_seq_len=self.max_length,
                         context_len=1,
                     ),
@@ -659,7 +664,7 @@ class NEURON_HF(TemplateLM):
         if "until" in kwargs.keys():
             until = kwargs.pop("until")
             if isinstance(until, str):
-                until = [kwargs]
+                until = [until]
             elif not isinstance(until, list):
                 raise ValueError(
                     f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
...
@@ -14,13 +14,11 @@ from lm_eval.models.utils import retry_on_specific_exceptions
 from lm_eval.utils import eval_logger
 
 
-def get_result(response, ctxlen: int) -> Tuple[float, bool]:
+def get_result(response) -> Tuple[float, bool]:
     """Process results from OpenAI API response.
 
     :param response: dict
         OpenAI API Response
-    :param ctxlen: int
-        Length of context (so we can slice them away and only keep the predictions)
     :return:
         continuation_logprobs: np.array
             Log probabilities of continuation tokens
@@ -29,9 +27,9 @@ def get_result(response, ctxlen: int) -> Tuple[float, bool]:
     """
     is_greedy = True
     logprobs = response.logprobs.token_logprobs
-    continuation_logprobs = sum(logprobs[ctxlen:])
+    continuation_logprobs = sum(logprobs)
 
-    for i in range(ctxlen, len(response.logprobs.token_logprobs)):
+    for i in range(len(response.logprobs.token_logprobs)):
         token = response.logprobs.token_logprobs[i]
         top_tokens = response.logprobs.top_logprobs[i]
         top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
@@ -111,7 +109,7 @@ class OpenaiCompletionsLM(TemplateLM):
         self.base_url = base_url
         self.tokenizer_backend = tokenizer_backend
         self.truncate = truncate
-        self._batch_size = batch_size
+        self._batch_size = int(batch_size)
         self._max_gen_toks = max_gen_toks
         self._max_length = max_length
@@ -212,7 +210,6 @@ class OpenaiCompletionsLM(TemplateLM):
                 client=self.client,
                 model=self.model,
                 prompt=inps,
-                echo=True,
                 max_tokens=0,
                 temperature=0.0,
                 logprobs=10,
@@ -222,7 +219,7 @@ class OpenaiCompletionsLM(TemplateLM):
             for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(
                 response.choices, ctxlens, chunk
             ):
-                answer = get_result(resp, ctxlen)
+                answer = get_result(resp)
 
                 res.append(answer)
@@ -281,7 +278,7 @@ class OpenaiCompletionsLM(TemplateLM):
                 **{
                     k: v
                     for k, v in request_args.items()
-                    if k not in ["do_sample", "max_gen_toks"]
+                    if k not in {"do_sample", "max_gen_toks", "until"}
                 },
             )
             for resp, (context, args_) in zip(response.choices, chunk):
@@ -433,7 +430,7 @@ class OpenaiChatCompletionsLM(LM):
         if "until" in kwargs.keys():
             until = kwargs.pop("until")
             if isinstance(until, str):
-                until = [kwargs]
+                until = [until]
             elif not isinstance(until, list):
                 raise ValueError(
                     f"Expected repr(kwargs['until']) to be of type Union[str, list] but got {until}"
...
+import json
 from importlib.util import find_spec
 from pathlib import Path
 
+from lm_eval import utils
 from lm_eval.api.registry import register_model
 from lm_eval.models.huggingface import HFLM
 
+eval_logger = utils.eval_logger
+
 
 @register_model("openvino")
 class OptimumLM(HFLM):
     """
     Optimum Intel provides a simple interface to optimize Transformer models and convert them to \
     OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \
     Intel® architectures using OpenVINO™ runtime.
+
+    To use an OpenVINO config, use `--model_args ov_config` to point to a json file with an OpenVINO config:
+    `lm_eval --model openvino --model_args pretrained=gpt2,ov_config=config.json --task lambada_openai`
+
+    Example json file contents: {"INFERENCE_PRECISION_HINT": "f32", "CACHE_DIR": "model_cache"}
     """
 
     def __init__(
@@ -48,16 +57,25 @@ class OptimumLM(HFLM):
         from optimum.intel.openvino import OVModelForCausalLM
 
         model_kwargs = kwargs if kwargs else {}
+        if "ov_config" in model_kwargs:
+            if not Path(model_kwargs["ov_config"]).exists():
+                raise ValueError(
+                    "ov_config should point to a .json file containing an OpenVINO config"
+                )
+            with open(model_kwargs["ov_config"]) as f:
+                model_kwargs["ov_config"] = json.load(f)
+            eval_logger.info(
+                f"Using custom OpenVINO config: {model_kwargs['ov_config']}"
+            )
+        else:
+            model_kwargs["ov_config"] = {}
+        model_kwargs["ov_config"].setdefault("CACHE_DIR", "")
+
         model_file = Path(pretrained) / "openvino_model.xml"
         if model_file.exists():
             export = False
         else:
             export = True
-            kwargs["ov_config"] = {
-                "PERFORMANCE_HINT": "LATENCY",
-                "NUM_STREAMS": "1",
-                "CACHE_DIR": "",
-            }
         self._model = OVModelForCausalLM.from_pretrained(
             pretrained,
...
@@ -21,10 +21,12 @@ from lm_eval.utils import (
 try:
     import ray
     from vllm import LLM, SamplingParams
+    from vllm.lora.request import LoRARequest
     from vllm.transformers_utils.tokenizer import get_tokenizer
 except ModuleNotFoundError:
     pass
 
 eval_logger = eval_logger
@@ -34,7 +36,7 @@ class VLLM(TemplateLM):
     def __init__(
         self,
-        pretrained="gpt2",
+        pretrained: str,
         dtype: Literal["float16", "bfloat16", "float32", "auto"] = "auto",
         revision: Optional[str] = None,
         trust_remote_code: Optional[bool] = False,
@@ -42,6 +44,7 @@ class VLLM(TemplateLM):
         tokenizer_mode: Literal["auto", "slow"] = "auto",
         tokenizer_revision: Optional[str] = None,
         add_bos_token: Optional[bool] = False,
+        prefix_token_id: Optional[int] = None,
         tensor_parallel_size: int = 1,
         quantization: Optional[str] = None,
         max_gen_toks: int = 256,
@@ -54,6 +57,7 @@ class VLLM(TemplateLM):
         gpu_memory_utilization: float = 0.9,
         device: str = "cuda",
         data_parallel_size: int = 1,
+        lora_local_path: str = None,
         **kwargs,
     ):
         super().__init__()
@@ -96,9 +100,6 @@ class VLLM(TemplateLM):
         if self.data_parallel_size <= 1:
             self.model = LLM(**self.model_args)
         else:
-            assert parse_version(version("vllm")) < parse_version(
-                "0.3.3"
-            ), "data_parallel is only compatible with vllm < v0.3.3."
             eval_logger.warning(
                 "You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
             )
@@ -118,14 +119,36 @@ class VLLM(TemplateLM):
                 tokenizer_revision=tokenizer_revision,
             )
         self.add_bos_token = add_bos_token
+        self.custom_prefix_token_id = prefix_token_id
+        if prefix_token_id is not None:
+            eval_logger.info(
+                f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
+            )
         self._max_gen_toks = max_gen_toks
 
+        if lora_local_path is not None:
+            assert parse_version(version("vllm")) > parse_version(
+                "0.3.0"
+            ), "lora adapters only compatible with vllm > v0.3.0."
+            self.lora_request = LoRARequest("finetuned", 1, lora_local_path)
+        else:
+            self.lora_request = None
+
     @property
     def eot_token_id(self):
         # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
         return self.tokenizer.eos_token_id
 
+    @property
+    def prefix_token_id(self):
+        # it is used as prefix for loglikelihood
+        if self.custom_prefix_token_id is not None:
+            return self.custom_prefix_token_id
+        if self.tokenizer.bos_token_id is not None:
+            return self.tokenizer.bos_token_id
+        return self.tokenizer.eos_token_id
+
     @property
     def max_length(self):
         if self._max_length:  # if max length manually set, return it
@@ -208,6 +231,14 @@ class VLLM(TemplateLM):
             # flatten results
             return undistribute(results)
 
+        if self.lora_request is not None:
+            outputs = self.model.generate(
+                prompt_token_ids=requests,
+                sampling_params=sampling_params,
+                use_tqdm=True if self.batch_size == "auto" else False,
+                lora_request=self.lora_request,
+            )
+        else:
             outputs = self.model.generate(
                 prompt_token_ids=requests,
                 sampling_params=sampling_params,
...
# v1.0 Tasks
This list keeps track of which tasks' implementations have been ported to YAML / v2.0 of the Eval Harness.
Boxes should be checked if and only if the task is implemented in the refactor and tested for regression. A task is struck through once it has been checked *against the implementation from the paper that introduced or popularized it*. (WIP) denotes that a PR or person is already working on this task.
- [x] Glue
- [x] SuperGlue
- [x] CoQA
- [x] DROP
- [x] ~~Lambada~~
- [x] Lambada (Cloze variants)
- [x] ~~Lambada (Multilingual)~~
- [x] Wikitext
- [x] PiQA
- [x] PROST
- [x] MCTACO
- [x] Pubmed QA
- [x] SciQ
- [x] QASPER
- [x] QA4MRE
- [x] TriviaQA
- [x] AI2 ARC
- [x] LogiQA
- [x] HellaSwag
- [x] SWAG
- [x] OpenBookQA
- [ ] SQuADv2 (Lintang)
- [x] RACE
- [x] HeadQA
- [x] MathQA
- [x] WebQs
- [x] WSC273
- [x] Winogrande
- [x] ANLI
- [x] Hendrycks Ethics (missing some tasks/metrics, see PR 660: <https://github.com/EleutherAI/lm-evaluation-harness/pull/660> for more info)
- [x] TruthfulQA (mc1)
- [x] TruthfulQA (mc2)
- [x] TruthfulQA (gen)
- [x] MuTual
- [ ] Hendrycks Math (Hailey)
- [x] Asdiv
- [ ] GSM8k
- [x] Arithmetic
- [ ] MMMLU (Hailey)
- [x] Translation (WMT) suite
- [x] Unscramble
- [x] ~~Pile (perplexity)~~
- [x] BLiMP
- [x] ToxiGen
- [x] StoryCloze
- [ ] NaturalQs (Hailey)
- [x] CrowS-Pairs
- [x] XCopa
- [ ] BIG-Bench (Hailey)
- [x] XStoryCloze
- [x] XWinograd
- [x] PAWS-X
- [x] XNLI
- [x] MGSM
- [ ] SCROLLS
- [x] Babi
- [x] Belebele
# Novel Tasks
Tasks added in the revamped harness that were not previously available. Again, a strikethrough denotes checking performed *against the original task's implementation or published results introducing the task*.
# Task Wishlist
- [ ] TheoremQA
- [ ] Theorem Proving evaluations
- [ ] Chain of Thought
- [ ] Self-consistency ; Least-to-Most prompting, etc.
- [ ] Summarization Tasks
- [ ] Anthropic Model-Written Evals
@@ -136,13 +136,14 @@ class TaskManager:
             if "include" in config:
                 if yaml_path is None:
                     raise ValueError
-                config.update(
-                    utils.load_yaml_config(
+                config = {
+                    **utils.load_yaml_config(
                         yaml_path,
                         yaml_config={"include": config.pop("include")},
                         mode="full",
-                    )
-                )
+                    ),
+                    **config,
+                }
             if self._config_is_python_task(config):
                 task_object = config["class"]()
             else:
@@ -356,28 +357,6 @@ class TaskManager:
         return tasks_and_groups
 
 
-def include_path(task_dir):
-    logger = utils.eval_logger
-    logger.setLevel(getattr(logging, "INFO"))
-    logger.info(
-        "To still use tasks loaded from args.include_path,"
-        "see an example of the new TaskManager API in "
-        "https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
-    )
-    return 0
-
-
-def initialize_tasks(verbosity="INFO"):
-    logger = utils.eval_logger
-    logger.setLevel(getattr(logging, f"{verbosity}"))
-    logger.info(
-        "lm_eval.tasks.initialize_tasks() is deprecated and no longer necessary. "
-        "It will be removed in v0.4.2 release. "
-        "TaskManager will instead be used."
-    )
-    return 0
-
-
 def get_task_name_from_config(task_config: Dict[str, str]) -> str:
     if "task" in task_config:
         return task_config["task"]
@@ -401,7 +380,7 @@ def get_task_name_from_object(task_object):
 def get_task_dict(
-    task_name_list: List[Union[str, Dict, Task]],
+    task_name_list: Union[str, List[Union[str, Dict, Task]]],
     task_manager: Optional[TaskManager] = None,
 ):
     """Creates a dictionary of task objects from either a name of task, config, or prepared Task object.
@@ -423,9 +402,20 @@ def get_task_dict(
     if isinstance(task_name_list, str):
         task_name_list = [task_name_list]
+    elif isinstance(task_name_list, list):
+        if not all([isinstance(task, (str, dict, Task)) for task in task_name_list]):
+            raise TypeError(
+                "Expected all list items to be of types 'str', 'dict', or 'Task', but at least one entry did not match."
+            )
+    else:
+        raise TypeError(
+            f"Expected a 'str' or 'list' but received {type(task_name_list)}."
+        )
 
     string_task_name_list = [task for task in task_name_list if isinstance(task, str)]
-    others_task_name_list = [task for task in task_name_list if ~isinstance(task, str)]
+    others_task_name_list = [
+        task for task in task_name_list if not isinstance(task, str)
+    ]
     if len(string_task_name_list) > 0:
         if task_manager is None:
             task_manager = TaskManager()
...
# ACLUE
### Paper
Can Large Language Model Comprehend Ancient Chinese? A Preliminary Test on ACLUE
https://arxiv.org/abs/2310.09550
The Ancient Chinese Language Understanding Evaluation (ACLUE) is an evaluation benchmark focused on ancient Chinese language comprehension. It aims to assess the performance of large-scale language models on understanding ancient Chinese. The benchmark comprises 15 tasks spanning various domains, including lexical, syntactic, semantic, inference, and knowledge. ACLUE's tasks are derived from a combination of manually curated questions from publicly available resources, and automatically
generated questions from classical Chinese language corpora. The questions span from the Xia dynasty (2070 BCE) to the Ming dynasty (1368 CE). ACLUE adopts a multiple-choice question format for all tasks.
Homepage: https://github.com/isen-zhang/ACLUE
### Citation
```bibtex
@inproceedings{zhang-li-2023-large,
title = "Can Large Langauge Model Comprehend {A}ncient {C}hinese? A Preliminary Test on {ACLUE}",
author = "Zhang, Yixuan and Li, Haonan",
booktitle = "Proceedings of the Ancient Language Processing Workshop",
month = sep,
year = "2023",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2023.alp-1.9",
pages = "80--87"
}
```
### Groups and Tasks
#### Groups
- `aclue`: All 15 subjects of the ACLUE dataset, evaluated following the methodology in CMMLU's original implementation.
#### Tasks
The following tasks evaluate subjects in the ACLUE dataset using loglikelihood-based multiple-choice scoring:
- `aclue_{subject_english}`
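
For reference, a hypothetical invocation via the harness CLI is sketched below; `pretrained=gpt2` is only a placeholder model and the 5-shot setting is an assumption rather than a prescribed setup:

```bash
lm_eval --model hf \
    --model_args pretrained=gpt2 \
    --tasks aclue \
    --num_fewshot 5
```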
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation?
* [x] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
group: aclue
dataset_path: tyouisen/aclue
test_split: test
fewshot_split: dev
fewshot_config:
  sampler: first_n
output_type: multiple_choice
doc_to_text: "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer)}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
"""
Take in a base YAML template and generate one task YAML per ACLUE subject that includes it.
"""
import argparse
import os
import yaml
from tqdm import tqdm
from lm_eval.utils import eval_logger
SUBJECTS = {
"古文单字多义": "polysemy_resolution",
"诗词情感分类": "poetry_sentiment_analysis",
"古汉语命名体识别": "named_entity_recognition",
"古汉语知识": "basic_ancient_chinese",
"古诗词上下句预测": "poetry_context_prediction",
"古文断句": "sentence_segmentation",
"对联": "couplet_prediction",
"古诗词曲鉴赏": "poetry_appreciate",
"国学常识": "ancient_chinese_culture",
"古音学": "ancient_phonetics",
"通假字": "homographic_character_resolution",
"古代文学知识": "ancient_literature",
"医古文": "ancient_medical",
"古诗词质量评估": "poetry_quality_assessment",
"古文阅读理解": "reading_comprehension",
}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", required=True)
parser.add_argument("--save_prefix_path", default="aclue")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path, encoding="utf-8") as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path, encoding="utf-8") as f:
cot_file = json.load(f)
for subject_zh, subject_eng in tqdm(SUBJECTS.items()):
if args.cot_prompt_path is not None:
description = cot_file[subject_eng]
else:
description = (
f"以下是关于{subject_zh}的单项选择题,请直接给出正确答案的选项。\n\n"
)
yaml_dict = {
"include": base_yaml_name,
"task": f"aclue_{args.task_prefix}_{subject_eng}"
if args.task_prefix != ""
else f"aclue_{subject_eng}",
"dataset_name": subject_eng,
"description": description,
}
file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml"
eval_logger.info(f"Saving yaml for subset {subject_eng} to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
width=float("inf"),
allow_unicode=True,
default_style='"',
)
"dataset_name": "ancient_chinese_culture"
"description": "以下是关于国学常识的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_ancient_chinese_culture"
"dataset_name": "ancient_literature"
"description": "以下是关于古代文学知识的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "aclue_ancient_literature"