update version

a6b358ca · Rayyyyy · ed53d51c · ed53d51c · a6b358ca · a6b358ca
Commit a6b358ca authored May 24, 2024 by Rayyyyy
10 changed files
--- a/lm_eval/metrics.py
+++ b/lm_eval/metrics.py
-import math
-from collections.abc import Iterable
-
-import numpy as np
-import sacrebleu
-import sklearn.metrics
-import random
-
-
-def mean(arr):
-    return sum(arr) / len(arr)
-
-
-def pop_stddev(arr):
-    mu = mean(arr)
-    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr))
-
-
-def sample_stddev(arr):
-    mu = mean(arr)
-    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))
-
-
-def mean_stderr(arr):
-    return sample_stddev(arr) / math.sqrt(len(arr))
-
-
-def median(arr):
-    return arr[len(arr) // 2]
-
-
-def matthews_corrcoef(items):
-    unzipped_list = list(zip(*items))
-    golds = unzipped_list[0]
-    preds = unzipped_list[1]
-    return sklearn.metrics.matthews_corrcoef(golds, preds)
-
-
-def f1_score(items):
-    unzipped_list = list(zip(*items))
-    golds = unzipped_list[0]
-    preds = unzipped_list[1]
-    fscore = sklearn.metrics.f1_score(golds, preds)
-
-    return np.max(fscore)
-
-
-def acc_all(items):
-    # Only count as correct if all answers are labeled correctly for each question
-    question_scoring_dict = {}
-    preds = list(zip(*items))[0]
-    docs = list(zip(*items))[1]
-
-    for doc, pred in zip(docs, preds):
-        paragraph_id = doc["idx"]["paragraph"]
-        question_id = doc["idx"]["question"]
-        if (paragraph_id, question_id) not in question_scoring_dict:
-            question_scoring_dict[(paragraph_id, question_id)] = []
-
-        gold_label = doc["label"] == 1
-
-        question_scoring_dict[(paragraph_id, question_id)].append(gold_label == pred)
-    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
-    return acc
-
-
-def acc_all_stderr(items):
-    # Only count as correct if all answers are labeled correctly for each question
-    question_scoring_dict = {}
-    preds = list(zip(*items))[0]
-    docs = list(zip(*items))[1]
-
-    for doc, pred in zip(docs, preds):
-        question_id = doc["idx"]["question"]
-        if question_id not in question_scoring_dict:
-            question_scoring_dict[question_id] = []
-
-        gold_label = doc["label"] == 1
-        question_scoring_dict[question_id].append(gold_label == pred)
-
-    acc = mean_stderr([int(all(x)) for x in question_scoring_dict.values()])
-    return acc
-
-
-def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
-    """Compute max metric between prediction and each ground truth."""
-    scores_for_ground_truths = []
-    for ground_truth in ground_truths:
-        score = metric_fn(prediction, ground_truth)
-        scores_for_ground_truths.append(score)
-    return max(scores_for_ground_truths)
-
-
-def perplexity(items):
-    return math.exp(-mean(items))
-
-
-def weighted_mean(items):
-    a, b = zip(*items)
-    return sum(a) / sum(b)
-
-
-def weighted_perplexity(items):
-    return math.exp(-weighted_mean(items))
-
-
-def bits_per_byte(items):
-    return -weighted_mean(items) / math.log(2)
-
-
-def bleu(items):
-    """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
-    for evaluating a generated sentence to a reference sentence. It counts matching
-    n-grams in the candidate translation to n-grams in the reference text, where
-    1-gram or unigram would be each token and a bigram comparison would be each
-    word pair. The comparison is made regardless of word order
-    Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
-    Paper: https://www.aclweb.org/anthology/P02-1040/
-
-    Higher is better
-    """
-    refs = list(zip(*items))[0]
-    preds = list(zip(*items))[1]
-    refs, preds = _sacreformat(refs, preds)
-    return sacrebleu.corpus_bleu(preds, refs).score
-
-
-def chrf(items):
-    """chrF++ is a tool for automatic evaluation of machine translation output
-    based on character n-gram precision and recall enhanced with word n-grams.
-    Source: https://github.com/m-popovic/chrF
-    Paper: https://www.aclweb.org/anthology/W15-3049.pdf
-
-    Higher is better  # TODO I think
-    """
-    refs = list(zip(*items))[0]
-    preds = list(zip(*items))[1]
-    refs, preds = _sacreformat(refs, preds)
-    return sacrebleu.corpus_chrf(preds, refs).score
-
-
-def ter(items):
-    """Translation Error Rate is an error metric for machine translation that
-    measures the number of edits required to change a system output into one
-    of the references
-    Source: http://www.cs.umd.edu/~snover/tercom/
-    Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
-
-    Lower is better
-    """
-    refs = list(zip(*items))[0]
-    preds = list(zip(*items))[1]
-    refs, preds = _sacreformat(refs, preds)
-    return sacrebleu.corpus_ter(preds, refs).score
-
-
-def is_non_str_iterable(obj):
-    return isinstance(obj, Iterable) and not isinstance(obj, str)
-
-
-def _sacreformat(refs, preds):
-    """Format refs and preds for sacrebleu corpus calculation. It is very particular"""
-    # Sacrebleu expects (List[str], List[List[str]])
-    #   e.g. sacrebleu.corpus_bleu([pred_t], [[ref1_stream], [ref2_stream], ...])
-
-    # Note [ref1_stream] is the first reference for each pred.
-    # So lists are size N and (M, N) for N preds and M possible refs for each pred
-    # This is a different order of dimensions than I would expect
-
-    # We expect refs to be List[str] or List[List[str]], the outer list corresponding to preds
-    # Must become List[List[str]] with the inner list corresponding to preds
-    if not is_non_str_iterable(refs):
-        refs = list(refs)
-    if not is_non_str_iterable(refs[0]):
-        refs = [[ref] for ref in refs]
-    refs = list(zip(*refs))
-    # Note the number of refs in each ref list must match the number of preds
-
-    # We expect preds to be List[str] or List[List[str]]. Must become List[str]
-    if not is_non_str_iterable(preds):
-        preds = list(preds)
-    if is_non_str_iterable(preds[0]):
-        assert len(preds[0]) == 1, f"Pred must be a str, was {preds[0]}"
-        preds = [pred[0] for pred in preds]
-
-    return refs, preds
-
-
-# stderr stuff
-
-
-class _bootstrap_internal:
-    def __init__(self, f, n):
-        self.f = f
-        self.n = n
-
-    def __call__(self, v):
-        i, xs = v
-        rnd = random.Random()
-        rnd.seed(i)
-        res = []
-        for _ in range(self.n):
-            res.append(self.f(rnd.choices(xs, k=len(xs))))
-        return res
-
-
-def bootstrap_stderr(f, xs, iters):
-    import multiprocessing as mp
-
-    pool = mp.Pool(mp.cpu_count())
-    # This gives a biased estimate of the stderr (i.e. with the mean, it gives something
-    # equivalent to stderr calculated without Bessel's correction in the stddev).
-    # Unfortunately, I haven't been able to figure out what the right correction is
-    # to make the bootstrap unbiased - I considered multiplying by sqrt(n/(n-1)), but
-    # that would be ad hoc and I can't prove that it would actually be an unbiased estimator.
-    # Thankfully, it shouldn't matter because our samples are usually pretty big anyway.
-    res = []
-    chunk_size = min(1000, iters)
-    from tqdm import tqdm
-
-    print("bootstrapping for stddev:", f.__name__)
-    for bootstrap in tqdm(
-        pool.imap(
-            _bootstrap_internal(f, chunk_size),
-            [(i, xs) for i in range(iters // chunk_size)],
-        ),
-        total=iters // chunk_size,
-    ):
-        # sample w replacement
-        res.extend(bootstrap)
-
-    pool.close()
-    return sample_stddev(res)
-
-
-def stderr_for_metric(metric, bootstrap_iters):
-    bootstrappable = [
-        median,
-        matthews_corrcoef,
-        f1_score,
-        perplexity,
-        bleu,
-        chrf,
-        ter,
-    ]
-
-    if metric in bootstrappable:
-        return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters)
-
-    stderr = {mean: mean_stderr, acc_all: acc_all_stderr}
-
-    return stderr.get(metric, None)
-
-
-def yesno(x):
-    if x:
-        return "yes"
-    else:
-        return "no"
--- a/lm_eval/models/__init__.py
+++ b/lm_eval/models/__init__.py
-from . import gpt2
-from . import gpt3
-from . import anthropic_llms
-from . import huggingface
-from . import textsynth
-from . import dummy
-from . import gguf
+from . import (
+    anthropic_llms,
+    dummy,
+    gguf,
+    huggingface,
+    mamba_lm,
+    nemo_lm,
+    neuralmagic,
+    neuron_optimum,
+    openai_completions,
+    optimum_lm,
+    textsynth,
+    vllm_causallms,
+)

-MODEL_REGISTRY = {
-    "hf": gpt2.HFLM,
-    "hf-causal": gpt2.HFLM,
-    "hf-causal-experimental": huggingface.AutoCausalLM,
-    "hf-seq2seq": huggingface.AutoSeq2SeqLM,
-    "gpt2": gpt2.GPT2LM,
-    "gpt3": gpt3.GPT3LM,
-    "anthropic": anthropic_llms.AnthropicLM,
-    "textsynth": textsynth.TextSynthLM,
-    "dummy": dummy.DummyLM,
-    "gguf": gguf.GGUFLM
-}

+# TODO: implement __all__

-def get_model(model_name):
-    return MODEL_REGISTRY[model_name]
+
+try:
+    # enable hf hub transfer if available
+    import hf_transfer  # type: ignore # noqa
+    import huggingface_hub.constants  # type: ignore
+
+    huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
+except ImportError:
+    pass
--- a/lm_eval/models/anthropic_llms.py
+++ b/lm_eval/models/anthropic_llms.py
-import os
-from lm_eval.base import BaseLM
+from typing import Any, List, Tuple
+
 from tqdm import tqdm
-import time

+from lm_eval import utils
+from lm_eval.api.model import LM
+from lm_eval.api.registry import register_model
+from lm_eval.models.utils import retry_on_specific_exceptions
+
+
+eval_logger = utils.eval_logger

-def anthropic_completion(client, model, prompt, max_tokens_to_sample, temperature, stop):
-    """Query Anthropic API for completion.

-    Retry with back-off until they respond
+def anthropic_completion(
+    client,  #: anthropic.Anthropic,
+    model: str,
+    prompt: str,
+    max_tokens_to_sample: int,
+    temperature: float,
+    stop: List[str],
+    **kwargs: Any,
+) -> str:
+    """Wrapper function around the Anthropic completion API client with exponential back-off
+    in case of RateLimitError.
+
+    params:
+        client: anthropic.Anthropic
+            Anthropic API client
+        model: str
+            Anthropic model e.g. 'claude-instant-v1', 'claude-2'
+        prompt: str
+            Prompt to feed to the model
+        max_tokens_to_sample: int
+            Maximum number of tokens to sample from the model
+        temperature: float
+            Sampling temperature
+        stop: List[str]
+            List of stop sequences
+        kwargs: Any
+            Additional model_args to pass to the API client
    """
-    import anthropic

-    backoff_time = 3
-    while True:
-        try:
-            response = client.completion(
-                prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}",
-                model=model,
-                # NOTE: Claude really likes to do CoT, and overly aggressive stop sequences
-                #       (e.g. gsm8k's ":") may truncate a lot of the input.
-                stop_sequences=[anthropic.HUMAN_PROMPT] + stop,
-                max_tokens_to_sample=max_tokens_to_sample,
-                temperature=temperature,
-            )
-            print(response)
-            return response["completion"]
-        except RuntimeError:
-            # TODO: I don't actually know what error Anthropic raises when it times out
-            #       So err update this error when we find out.
-            import traceback
+    try:
+        import anthropic
+    except ModuleNotFoundError:
+        raise Exception(
+            "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
+please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
+        )

-            traceback.print_exc()
-            time.sleep(backoff_time)
-            backoff_time *= 1.5
+    def _exception_callback(e: Exception, sleep_time: float) -> None:
+        eval_logger.warning(
+            f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds"
+        )

+    @retry_on_specific_exceptions(
+        on_exceptions=[anthropic.RateLimitError],
+        max_retries=None,  # retry forever, consider changing
+        on_exception_callback=_exception_callback,
+    )
+    def completion():
+        response = client.completions.create(
+            prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}",
+            model=model,
+            # NOTE: Claude really likes to do CoT, and overly aggressive stop sequences
+            #       (e.g. gsm8k's ":") may truncate a lot of the input.
+            stop_sequences=[anthropic.HUMAN_PROMPT] + stop,
+            max_tokens_to_sample=max_tokens_to_sample,
+            temperature=temperature,
+            **kwargs,
+        )
+        return response.completion

-class AnthropicLM(BaseLM):
-    REQ_CHUNK_SIZE = 20
+    return completion()
+
+
+def anthropic_chat(
+    client,  #: anthropic.Anthropic,
+    model: str,
+    prompt: str,
+    max_tokens: int,
+    temperature: float,
+    stop: List[str],
+    **kwargs: Any,
+) -> str:
+    """Wrapper function around the Anthropic messages (chat) API client with exponential back-off
+    in case of RateLimitError, APIConnectionError, or APIStatusError.
+
+    params:
+        client: anthropic.Anthropic
+            Anthropic API client
+        model: str
+            Anthropic model e.g. 'claude-3-opus-20240229', 'claude-3-sonnet-20240229'
+        prompt: str
+            Prompt to feed to the model
+        max_tokens: int
+            Maximum number of tokens to sample from the model
+        temperature: float
+            Sampling temperature
+        stop: List[str]
+            List of stop sequences
+        kwargs: Any
+            Additional model_args to pass to the API client
+    """
+
+    try:
+        import anthropic
+    except ModuleNotFoundError:
+        raise Exception(
+            "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
+please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
+        )
+
+    def _exception_callback(e: Exception, sleep_time: float) -> None:
+        eval_logger.warning(
+            f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds"
+        )
+
+    @retry_on_specific_exceptions(
+        on_exceptions=[
+            anthropic.RateLimitError,
+            anthropic.APIConnectionError,
+            anthropic.APIStatusError,
+        ],
+        max_retries=None,  # retry forever, consider changing
+        on_exception_callback=_exception_callback,
+    )
+    def messages():
+        response = client.messages.create(
+            model=model,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            messages=[{"role": "user", "content": f"{prompt}"}],
+            **kwargs,
+        )
+        return response.content[0].text
+
+    return messages()

-    def __init__(self, model):
-        """
+
+@register_model("anthropic")
+class AnthropicLM(LM):
+    REQ_CHUNK_SIZE = 20  # TODO: not used
+
+    def __init__(
+        self,
+        batch_size: int = 1,
+        model: str = "claude-2.0",
+        max_tokens_to_sample: int = 256,
+        temperature: float = 0,  # defaults to 1
+        **kwargs,  # top_p, top_k, etc.
+    ) -> None:
+        """Anthropic API wrapper.

        :param model: str
-            Anthropic model e.g. claude-instant-v1
+            Anthropic model e.g. 'claude-instant-v1', 'claude-2'
+        :param max_tokens_to_sample: int
+            Maximum number of tokens to sample from the model
+        :param temperature: float
+            Sampling temperature
+        :param kwargs: Any
+            Additional model_args to pass to the API client
        """
        super().__init__()
-        import anthropic
+
+        try:
+            import anthropic
+        except ModuleNotFoundError:
+            raise Exception(
+                "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
+please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
+            )
+
        self.model = model
-        self.client = anthropic.Client(os.environ['ANTHROPIC_API_KEY'])
+        # defaults to os.environ.get("ANTHROPIC_API_KEY")
+        self.client = anthropic.Anthropic()
+        self.temperature = temperature
+        self.max_tokens_to_sample = max_tokens_to_sample
+        self.tokenizer = self.client.get_tokenizer()
+        self.kwargs = kwargs

    @property
    def eot_token_id(self):
+        # Not sure but anthropic.HUMAN_PROMPT ?
        raise NotImplementedError("No idea about anthropic tokenization.")

    @property
-    def max_length(self):
+    def max_length(self) -> int:
        return 2048

    @property
-    def max_gen_toks(self):
-        return 256
+    def max_gen_toks(self) -> int:
+        return self.max_tokens_to_sample

    @property
    def batch_size(self):
        # Isn't used because we override _loglikelihood_tokens
-        raise NotImplementedError()
+        raise NotImplementedError("No support for logits.")

    @property
    def device(self):
        # Isn't used because we override _loglikelihood_tokens
-        raise NotImplementedError()
+        raise NotImplementedError("No support for logits.")

-    def tok_encode(self, string: str):
-        raise NotImplementedError("No idea about anthropic tokenization.")
+    def tok_encode(self, string: str) -> List[int]:
+        return self.tokenizer.encode(string).ids

-    def tok_decode(self, tokens):
-        raise NotImplementedError("No idea about anthropic tokenization.")
+    def tok_decode(self, tokens: List[int]) -> str:
+        return self.tokenizer.decode(tokens)

-    def _loglikelihood_tokens(self, requests, disable_tqdm=False):
+    def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
        raise NotImplementedError("No support for logits.")

-    def greedy_until(self, requests):
+    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
+        try:
+            import anthropic
+        except ModuleNotFoundError:
+            raise Exception(
+                "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
+please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
+            )
+
        if not requests:
            return []

+        _requests: List[Tuple[str, dict]] = [req.args for req in requests]
+
        res = []
-        for request in tqdm(requests):
-            inp = request[0]
-            request_args = request[1]
-            until = request_args["until"]
-            response = anthropic_completion(
-                client=self.client,
-                model=self.model,
-                prompt=inp,
-                max_tokens_to_sample=self.max_gen_toks,
-                temperature=0.0,
-                stop=until,
-            )
-            res.append(response)
+        for request in tqdm(_requests, disable=disable_tqdm):
+            try:
+                inp = request[0]
+                request_args = request[1]
+                # generation_kwargs
+                until = request_args.get("until")
+                max_gen_toks = request_args.get("max_gen_toks", self.max_length)
+                temperature = request_args.get("temperature", self.temperature)
+                response = anthropic_completion(
+                    client=self.client,
+                    model=self.model,
+                    prompt=inp,
+                    max_tokens_to_sample=max_gen_toks,
+                    temperature=temperature,  # TODO: implement non-greedy sampling for Anthropic
+                    stop=until,  # type: ignore
+                    **self.kwargs,
+                )
+                res.append(response)
+
+                self.cache_hook.add_partial("generate_until", request, response)
+            except anthropic.APIConnectionError as e:  # type: ignore # noqa: F821
+                eval_logger.critical(f"Server unreachable: {e.__cause__}")
+                break
+            except anthropic.APIStatusError as e:  # type: ignore # noqa: F821
+                eval_logger.critical(f"API error {e.status_code}: {e.message}")
+                break
+
        return res

    def _model_call(self, inps):
@@ -105,5 +260,101 @@ class AnthropicLM(BaseLM):
        raise NotImplementedError()

    def _model_generate(self, context, max_length, eos_token_id):
-        # Isn't used because we override greedy_until
+        # Isn't used because we override generate_until
        raise NotImplementedError()
+
+    def loglikelihood(self, requests, disable_tqdm: bool = False):
+        raise NotImplementedError("No support for logits.")
+
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
+        raise NotImplementedError("No support for logits.")
+
+
+@register_model("anthropic-chat", "anthropic-chat-completions")
+class AnthropicChatLM(AnthropicLM):
+    REQ_CHUNK_SIZE = 20  # TODO: not used
+
+    def __init__(
+        self,
+        model: str,
+        batch_size: int = 1,
+        max_tokens: int = 256,
+        temperature: float = 0,  # defaults to 1
+        **kwargs,  # top_p, top_k, etc.
+    ) -> None:
+        """Anthropic API wrapper.
+
+        :param model: str
+            Anthropic model e.g. 'claude-3-opus-20240229', 'claude-3-sonnet-20240229'
+        :param max_tokens: int
+            Maximum number of tokens to sample from the model
+        :param temperature: float
+            Sampling temperature
+        :param kwargs: Any
+            Additional model_args to pass to the API client
+        """
+        super().__init__()
+
+        try:
+            import anthropic
+        except ModuleNotFoundError:
+            raise Exception(
+                "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
+please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
+            )
+
+        self.model = model
+        # defaults to os.environ.get("ANTHROPIC_API_KEY")
+        self.client = anthropic.Anthropic()
+        self.temperature = temperature
+        self.max_token = max_tokens
+        self.tokenizer = self.client.get_tokenizer()
+        self.kwargs = kwargs
+
+    @property
+    def max_gen_toks(self) -> int:
+        return self.max_tokens
+
+    def generate_until(self, requests) -> List[str]:
+        try:
+            import anthropic
+        except ModuleNotFoundError:
+            raise Exception(
+                "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
+please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
+            )
+
+        if not requests:
+            return []
+
+        _requests: List[Tuple[str, dict]] = [req.args for req in requests]
+
+        res = []
+        for request in tqdm(_requests):
+            try:
+                inp = request[0]
+                request_args = request[1]
+                # generation_kwargs
+                until = request_args.get("until")
+                max_tokens = request_args.get("max_gen_toks", self.max_length)
+                temperature = request_args.get("temperature", self.temperature)
+                response = anthropic_chat(
+                    client=self.client,
+                    model=self.model,
+                    prompt=inp,
+                    max_tokens=max_tokens,
+                    temperature=temperature,  # TODO: implement non-greedy sampling for Anthropic
+                    stop=until,  # type: ignore
+                    **self.kwargs,
+                )
+                res.append(response)
+
+                self.cache_hook.add_partial("generate_until", request, response)
+            except anthropic.APIConnectionError as e:  # type: ignore # noqa: F821
+                eval_logger.critical(f"Server unreachable: {e.__cause__}")
+                break
+            except anthropic.APIStatusError as e:  # type: ignore # noqa: F821
+                eval_logger.critical(f"API error {e.status_code}: {e.message}")
+                break
+
+        return res
--- a/lm_eval/models/dummy.py
+++ b/lm_eval/models/dummy.py
 import random
-from lm_eval.base import LM

+from tqdm import tqdm

+from lm_eval.api.model import LM
+from lm_eval.api.registry import register_model
+
+
+@register_model("dummy")
 class DummyLM(LM):
-    def __init__(self):
-        pass
+    def __init__(self) -> None:
+        super().__init__()

    @classmethod
    def create_from_arg_string(cls, arg_string, additional_config=None):
        return cls()

-    def loglikelihood(self, requests):
+    def loglikelihood(self, requests, disable_tqdm: bool = False):
        res = []

-        for _ in requests:
+        for _ in tqdm(requests, disable=disable_tqdm):
            res.append((-random.random(), False))

        return res

-    def greedy_until(self, requests):
+    def generate_until(self, requests, disable_tqdm: bool = False):
        res = []

-        for ctx, _ in requests:
+        for ctx, _ in tqdm(requests, disable=disable_tqdm):
            res.append("lol")
            assert ctx.strip() != ""

        return res

-    def loglikelihood_rolling(self, requests):
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
        res = []

-        for _ in requests:
+        for _ in tqdm(requests, disable=disable_tqdm):
            res.append(-random.random())

        return res
--- a/lm_eval/models/gguf.py
+++ b/lm_eval/models/gguf.py
-import requests
 import logging
 import time
-from tqdm import tqdm
+
+import requests
 from requests.exceptions import RequestException
-import transformers
-from lm_eval.utils import Reorderer
-from lm_eval.base import BaseLM
+from tqdm import tqdm
+
+from lm_eval.api.model import LM
+from lm_eval.api.registry import register_model
+

 logger = logging.getLogger(__name__)


 def get_result(logprobs, context_length):
    is_greedy = True
-    offsets = logprobs['text_offset']
-    tokens = logprobs['tokens']
-    tokens_logprobs = logprobs['token_logprobs']
+    offsets = logprobs["text_offset"]
+    tokens = logprobs["tokens"]
+    tokens_logprobs = logprobs["token_logprobs"]

    idx = 0
    while offsets[idx] < context_length:
@@ -31,26 +33,35 @@ def get_result(logprobs, context_length):
    return continuation_logprobs, is_greedy


-class GGUFLM(BaseLM):
-    def __init__(self, base_url, max_length=2048):
+@register_model("gguf", "ggml")
+class GGUFLM(LM):
+    def __init__(self, base_url=None, max_length=2048, **kwargs):
        super().__init__()
        self.base_url = base_url
+        assert self.base_url, "must pass `base_url` to use GGUF LM!"
        self.logprobs = 10
        self.temperature = 0.0
        self.max_length = max_length

-    def gguf_completion(self, context, continuation=None, stop=None, retries=3, delay=5, **kwargs):
+    def gguf_completion(
+        self, context, continuation=None, stop=None, retries=3, delay=5, **kwargs
+    ):
        for _ in range(retries):
            try:
                prompt = context
-                request = {'prompt': prompt, 'logprobs': self.logprobs,
-                           'temperature': self.temperature}
+                request = {
+                    "prompt": prompt,
+                    "logprobs": self.logprobs,
+                    "temperature": self.temperature,
+                }
                if continuation:
                    prompt += continuation
-                    request.update({'prompt': prompt, 'max_tokens': 1, 'echo': True})
+                    request.update({"prompt": prompt, "max_tokens": 1, "echo": True})
                if stop is not None:
-                    request['stop'] = stop
-                response = requests.post(f"{self.base_url}/v1/completions", json=request)
+                    request["stop"] = stop
+                response = requests.post(
+                    f"{self.base_url}/v1/completions", json=request
+                )
                response.raise_for_status()
                return response.json()
            except RequestException as e:
@@ -59,34 +70,44 @@ class GGUFLM(BaseLM):
        else:
            raise Exception(f"Failed to get a valid response after {retries} retries.")

-    def loglikelihood(self, requests):
+    def loglikelihood(self, requests, disable_tqdm: bool = False):
        if not requests:
            return []
        res = []
-        for context, continuation in tqdm(requests):
+        for context, continuation in tqdm(
+            [req.args for req in requests], disable=disable_tqdm
+        ):
            response = self.gguf_completion(context=context, continuation=continuation)
            if response and "choices" in response and response["choices"]:
                choice = response["choices"][0]
                logprobs = choice.get("logprobs")
-                if logprobs and "token_logprobs" in logprobs and logprobs["token_logprobs"]:
+                if (
+                    logprobs
+                    and "token_logprobs" in logprobs
+                    and logprobs["token_logprobs"]
+                ):
                    logprob, is_greedy = get_result(logprobs, len(context))
                    res.append((logprob, is_greedy))
                else:
-                    logger.warning("Invalid logprobs data. Expected 'logprobs' to contain 'token_logprobs' list.")
+                    logger.warning(
+                        "Invalid logprobs data. Expected 'logprobs' to contain 'token_logprobs' list."
+                    )
            else:
-                logger.error(f"Invalid response for loglikelihood. Response: {response}")
+                logger.error(
+                    f"Invalid response for loglikelihood. Response: {response}"
+                )
                assert False
        return res

-    def greedy_until(self, requests):
+    def generate_until(self, requests, disable_tqdm: bool = False):
        if not requests:
            return []

        res = []
-        for request in tqdm(requests):
+        for request in tqdm([req.args for req in requests], disable=disable_tqdm):
            inp = request[0]
            request_args = request[1]
-            until = request_args["until"]
+            until = request_args.get("until", ["</s>"])
            response = self.gguf_completion(context=inp, stop=until)
            if response and "choices" in response and response["choices"]:
                choice = response["choices"][0]
@@ -94,49 +115,16 @@ class GGUFLM(BaseLM):
                    generated_text = choice["text"].strip()
                    res.append(generated_text)
                else:
-                    logger.error(f"Invalid response for greedy_until. Response: {response}")
+                    logger.error(
+                        f"Invalid response for greedy_until. Response: {response}"
+                    )
                    res.append(None)  # Add default value in case of error
            else:
                logger.error(f"Invalid response for greedy_until. Response: {response}")
                res.append(None)  # Add default value in case of error
        return res

-    def loglikelihood_rolling(self, requests):
-        raise NotImplementedError("loglikelihood_rolling not yet supported for GGUF models")
-
-    def _model_call(self, inps):
-        # Placeholder implementation
-        raise NotImplementedError()
-
-    def _model_generate(self, context, max_length, eos_token_id):
-        # Placeholder implementation
-        raise NotImplementedError()
-
-    def tok_encode(self, string: str):
-        raise NotImplementedError()
-
-    def tok_decode(self, tokens):
-        raise NotImplementedError()
-
-    @property
-    def batch_size(self):
-        # Placeholder implementation
-        raise NotImplementedError()
-
-    @property
-    def device(self):
-        # Placeholder implementation
-        raise NotImplementedError()
-
-    @property
-    def eot_token_id(self):
-        # Placeholder implementation
-        raise NotImplementedError()
-
-    def max_length(self):
-        return self.max_length
-
-    @property
-    def max_gen_toks(self):
-        # Placeholder implementation
-        raise NotImplementedError()
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
+        raise NotImplementedError(
+            "loglikelihood_rolling not yet supported for GGUF models"
+        )
--- a/lm_eval/models/gpt2.py
+++ b/lm_eval/models/gpt2.py
-import torch
-import transformers
-from typing import Optional, Union
-from lm_eval.base import BaseLM
-
-
-def _get_dtype(
-    dtype: Union[str, torch.dtype]
-) -> torch.dtype:
-    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
-    if isinstance(dtype, str) and dtype != "auto":
-        # Convert `str` args torch dtype: `float16` -> `torch.float16`
-        _torch_dtype = getattr(torch, dtype)
-    else:
-        _torch_dtype = dtype
-    return _torch_dtype
-
-
-class HFLM(BaseLM):
-
-    _DEFAULT_MAX_LENGTH = 2048
-
-    def __init__(
-        self,
-        device="cuda",
-        pretrained="gpt2",
-        revision="main",
-        low_cpu_mem_usage=None,
-        subfolder=None,
-        tokenizer=None,
-        batch_size=1,
-	max_length=None,
-        load_in_8bit: Optional[bool] = False,
-        trust_remote_code: Optional[bool] = False,
-        dtype: Optional[Union[str, torch.dtype]]="auto",
-    ):
-        super().__init__()
-
-        assert isinstance(device, str)
-        assert isinstance(pretrained, str)
-        assert isinstance(batch_size, (int, str))
-
-        device_list = set(
-            ["cuda", "cpu"] + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
-        )
-        if device and device in device_list:
-            self._device = torch.device(device)
-            print(f"Using device '{device}'")
-        else:
-            print("Device not specified")
-            print(f"Cuda Available? {torch.cuda.is_available()}")
-            self._device = (
-                torch.device("cuda")
-                if torch.cuda.is_available()
-                else torch.device("cpu")
-            )
-
-        # TODO: update this to be less of a hack once subfolder is fixed in HF
-        revision = revision + ("/" + subfolder if subfolder is not None else "")
-
-        self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
-            pretrained,
-            load_in_8bit=load_in_8bit,
-            low_cpu_mem_usage=low_cpu_mem_usage,
-            revision=revision,
-            torch_dtype=_get_dtype(dtype),
-            trust_remote_code=trust_remote_code,
-        ).eval()
-        if not load_in_8bit:
-            try:
-                self.gpt2.to(self.device)
-            except:
-                print("Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore.")
-        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-            pretrained if tokenizer is None else tokenizer,
-            revision=revision,
-            trust_remote_code=trust_remote_code,
-        )
-
-        self.vocab_size = self.tokenizer.vocab_size
-
-        # setup for automatic batch size detection
-        if batch_size == "auto":
-            self.batch_size_per_gpu = batch_size
-        else:
-            self.batch_size_per_gpu = int(batch_size)
-
-        self._max_length = max_length
-
-    @property
-    def eot_token_id(self):
-        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
-        return self.tokenizer.eos_token_id
-
-    @property
-    def max_length(self):
-        if self._max_length: # if max length manually set, return it
-            return self._max_length
-        seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
-        for attr in seqlen_config_attrs:
-            if hasattr(self.gpt2.config, attr):
-                return getattr(self.gpt2.config, attr)
-        if hasattr(self.tokenizer, "model_max_length"):
-            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
-                return self._DEFAULT_MAX_LENGTH
-            return self.tokenizer.model_max_length
-        return self._DEFAULT_MAX_LENGTH
-
-
-    @property
-    def max_gen_toks(self):
-        return 256
-
-    @property
-    def batch_size(self):
-        # TODO: fix multi-gpu
-        return self.batch_size_per_gpu  # * gpus
-
-    @property
-    def device(self):
-        # TODO: fix multi-gpu
-        return self._device
-
-    def tok_encode(self, string: str):
-        return self.tokenizer.encode(string, add_special_tokens=False)
-
-    def tok_decode(self, tokens):
-        return self.tokenizer.decode(tokens)
-
-    def _model_call(self, inps):
-        """
-        inps: a torch tensor of shape [batch, sequence]
-        the size of sequence may vary from call to call
-
-        returns: a torch tensor of shape [batch, sequence, vocab] with the
-        logits returned from the model
-        """
-        with torch.no_grad():
-            return self.gpt2(inps)[0]
-
-    def _model_generate(self, context, max_length, eos_token_id):
-        generation_kwargs = {"do_sample": False, "max_length": max_length}
-        if eos_token_id is not None:
-            generation_kwargs['eos_token_id'] = eos_token_id
-            generation_kwargs['pad_token_id'] = eos_token_id # setting eos_token_id as pad token
-        return self.gpt2.generate(context, **generation_kwargs)
-
-
-# for backwards compatibility
-GPT2LM = HFLM
--- a/lm_eval/models/gpt3.py
+++ b/lm_eval/models/gpt3.py
-import os
-import numpy as np
-import transformers
-from lm_eval.base import BaseLM
-from lm_eval import utils
-from tqdm import tqdm
-import time
-
-
-def get_result(response, ctxlen):
-    """Process results from OpenAI API response.
-
-    :param response: dict
-        OpenAI API Response
-    :param ctxlen: int
-        Length of context (so we can slice them away and only keep the predictions)
-    :return:
-        continuation_logprobs: np.array
-            Log probabilities of continuation tokens
-        is_greedy: bool
-            whether argmax matches given continuation exactly
-    """
-    is_greedy = True
-    logprobs = response["logprobs"]["token_logprobs"]
-    continuation_logprobs = sum(logprobs[ctxlen:])
-
-    for i in range(ctxlen, len(response["logprobs"]["tokens"])):
-        token = response["logprobs"]["tokens"][i]
-        top_tokens = response["logprobs"]["top_logprobs"][i]
-        top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
-        if top_token != token:
-            is_greedy = False
-            break
-
-    return continuation_logprobs, is_greedy
-
-
-def oa_completion(**kwargs):
-    """Query OpenAI API for completion.
-
-    Retry with back-off until they respond
-    """
-    import openai
-
-    backoff_time = 3
-    while True:
-        try:
-            return openai.Completion.create(**kwargs)
-        except openai.error.OpenAIError:
-            import traceback
-
-            traceback.print_exc()
-            time.sleep(backoff_time)
-            backoff_time *= 1.5
-
-
-class GPT3LM(BaseLM):
-    REQ_CHUNK_SIZE = 20
-
-    def __init__(self, engine, truncate=False):
-        """
-
-        :param engine: str
-            OpenAI API engine (e.g. davinci)
-        :param truncate: bool
-            Truncate input if too long (if False and input is too long, throw error)
-        """
-        super().__init__()
-
-        import openai
-
-        self.engine = engine
-        self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2")
-
-        self.vocab_size = self.tokenizer.vocab_size
-
-        # to make the annoying "Using pad_token, but it is not set yet." error go away
-        self.tokenizer.pad_token = "<|endoftext|>"
-        assert self.tokenizer.encode("hello\n\nhello") == [31373, 198, 198, 31373]
-        self.truncate = truncate
-        self.end_of_text_token_id = self.tokenizer.convert_tokens_to_ids(
-            ["<|endoftext|>"]
-        )[0]
-
-        # Read from environment variable OPENAI_API_SECRET_KEY
-        openai.api_key = os.environ["OPENAI_API_SECRET_KEY"]
-
-    @property
-    def eot_token_id(self):
-        return self.tokenizer.eos_token_id
-
-    @property
-    def max_length(self):
-        # Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
-        return 2048
-
-    @property
-    def max_gen_toks(self):
-        return 256
-
-    @property
-    def batch_size(self):
-        # Isn't used because we override _loglikelihood_tokens
-        raise NotImplementedError()
-
-    @property
-    def device(self):
-        # Isn't used because we override _loglikelihood_tokens
-        raise NotImplementedError()
-
-    def tok_encode(self, string: str):
-        return self.tokenizer.encode(string, add_special_tokens=False)
-
-    def tok_decode(self, tokens):
-        return self.tokenizer.decode(tokens)
-
-    def _loglikelihood_tokens(self, requests, disable_tqdm=False):
-        res = []
-
-        def _collate(x):
-            # this doesn't efficiently handle last-token differences yet, but those are kinda annoying because
-            # it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations
-            # we care about and so we need some kind of backup for when it isn't
-            toks = x[1] + x[2]
-            return -len(toks), tuple(toks)
-
-        re_ord = utils.Reorderer(requests, _collate)
-
-        for chunk in tqdm(
-            list(utils.chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE)),
-            disable=disable_tqdm,
-        ):
-            inps = []
-            ctxlens = []
-            for cache_key, context_enc, continuation_enc in chunk:
-                # max_length+1 because the API takes up to 2049 tokens, including the first context token
-                inp = (context_enc + continuation_enc)[-(self.max_length + 1) :]
-                # TODO: the logic is much simpler if we just look at the length of continuation tokens
-                ctxlen = len(context_enc) - max(
-                    0, len(context_enc) + len(continuation_enc) - (self.max_length + 1)
-                )
-
-                inps.append(inp)
-                ctxlens.append(ctxlen)
-
-            response = oa_completion(
-                engine=self.engine,
-                prompt=inps,
-                echo=True,
-                max_tokens=0,
-                temperature=0.0,
-                logprobs=10,
-            )
-
-            for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(
-                response.choices, ctxlens, chunk
-            ):
-                answer = get_result(resp, ctxlen)
-
-                res.append(answer)
-
-                # partial caching
-                if cache_key is not None:
-                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)
-
-        return re_ord.get_original(res)
-
-    def greedy_until(self, requests):
-        if not requests:
-            return []
-        res = []
-
-        def _collate(x):
-            toks = self.tok_encode(x[0])
-            return len(toks), x[0]
-
-        re_ord = utils.Reorderer(requests, _collate)
-
-        def sameuntil_chunks(xs, size):
-            ret = []
-            lastuntil = xs[0][1]
-            for x in xs:
-                if len(ret) >= size or x[1] != lastuntil:
-                    yield ret, lastuntil
-                    ret = []
-                    lastuntil = x[1]
-                ret.append(x)
-
-            if ret:
-                yield ret, lastuntil
-
-        # todo: more intelligent batching for heterogeneous `until`
-        for chunk, until in tqdm(
-            list(sameuntil_chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE))
-        ):
-            inps = []
-            for context, _ in chunk:
-                context_enc = self.tok_encode(context)
-                inp = context_enc[-(self.max_length - self.max_gen_toks) :]
-                inps.append(inp)
-
-            response = oa_completion(
-                engine=self.engine,
-                prompt=inps,
-                max_tokens=self.max_gen_toks,
-                temperature=0.0,
-                logprobs=10,
-                stop=until,
-            )
-
-            for resp, (context, until_) in zip(response.choices, chunk):
-                s = resp["text"]
-
-                for term in until_:
-                    s = s.split(term)[0]
-
-                # partial caching
-                self.cache_hook.add_partial("greedy_until", (context, until_), s)
-
-                res.append(s)
-
-        return re_ord.get_original(res)
-
-    def _model_call(self, inps):
-        # Isn't used because we override _loglikelihood_tokens
-        raise NotImplementedError()
-
-    def _model_generate(self, context, max_length, eos_token_id):
-        # Isn't used because we override greedy_until
-        raise NotImplementedError()
--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
-import math
+import copy
+import os
+from datetime import timedelta
+from pathlib import Path
+from typing import List, Literal, Optional, Tuple, Union
+
 import torch
 import torch.nn.functional as F
 import transformers
-import peft
+from accelerate import (
+    Accelerator,
+    DistributedType,
+    InitProcessGroupKwargs,
+    find_executable_batch_size,
+)
+from huggingface_hub import HfApi
+from packaging import version
+from peft import PeftModel
 from peft import __version__ as PEFT_VERSION
-from pathlib import Path
-from typing import List, Mapping, NewType, Optional, Tuple, Union
 from tqdm import tqdm
-
-from transformers import BatchEncoding
+from transformers.models.auto.modeling_auto import (
+    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
+    MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
+)

 from lm_eval import utils
-from lm_eval.base import BaseLM
+from lm_eval.api.instance import Instance
+from lm_eval.api.model import TemplateLM
+from lm_eval.api.registry import register_model
+from lm_eval.models.utils import (
+    Collator,
+    clear_torch_cache,
+    get_dtype,
+    pad_and_concat,
+    stop_sequences_criteria,
+)

-TokenSequence = Union[List[int], torch.LongTensor, torch.Tensor, BatchEncoding]

-_DeviceMapping = NewType("DeviceMapping", Mapping[str, Union[int, str, torch.device]])
+eval_logger = utils.eval_logger


 def _get_accelerate_args(
@@ -43,352 +64,331 @@ def _get_accelerate_args(
    return args


-def _get_dtype(
-    dtype: Union[str, torch.dtype], config: Optional[transformers.AutoConfig] = None
-) -> torch.dtype:
-    """Converts `dtype` from `str` to torch.dtype when possible."""
-    if dtype is None and config is not None:
-        _torch_dtype = config.torch_dtype
-    elif isinstance(dtype, str) and dtype != "auto":
-        # Convert `str` args torch dtype: `float16` -> `torch.float16`
-        _torch_dtype = getattr(torch, dtype)
-    else:
-        _torch_dtype = dtype
-    return _torch_dtype
-
+@register_model("hf-auto", "hf", "huggingface")
+class HFLM(TemplateLM):
+    """
+    An abstracted Huggingface model class. Enables usage with both models of
+    `transformers.AutoModelForCausalLM` and `transformers.AutoModelForSeq2SeqLM` classes.

-class HuggingFaceAutoLM(BaseLM):
-    AUTO_CONFIG_CLASS: transformers.AutoConfig = transformers.AutoConfig
-    AUTO_TOKENIZER_CLASS: transformers.AutoTokenizer = transformers.AutoTokenizer
-    AUTO_MODEL_CLASS: transformers.AutoModel = None
-    AUTO_PEFT_CLASS: peft.PeftModel = None
+    Supports data-parallel multi-GPU with HF Accelerate.
+    """

-    # Default max sequence length setting for when no `max_length` is provided
-    # or no max length config setting is found in the model or tokenizer.
-    _DEFAULT_MAX_LENGTH: int = 2048
+    AUTO_MODEL_CLASS = None
+    _DEFAULT_MAX_LENGTH = 2048

    def __init__(
        self,
-        pretrained: str,
-        quantized: Optional[Union[bool, str]] = False,
-        tokenizer: Optional[str] = None,
-        subfolder: Optional[str] = None,
+        pretrained: Union[str, transformers.PreTrainedModel],
+        backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
+        # override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
        revision: Optional[str] = "main",
-        batch_size: Optional[Union[int, str]] = 1,
-        max_batch_size: Optional[int] = 512,
-        max_gen_toks: Optional[int] = 256,
+        subfolder: Optional[str] = None,
+        tokenizer: Optional[
+            Union[
+                str,
+                transformers.PreTrainedTokenizer,
+                transformers.PreTrainedTokenizerFast,
+            ]
+        ] = None,
+        truncation: Optional[bool] = False,
+        logits_cache: bool = True,
        max_length: Optional[int] = None,
-        add_special_tokens: Optional[bool] = None,
-        use_accelerate: Optional[bool] = False,
+        device: Optional[str] = "cuda",
+        dtype: Optional[Union[str, torch.dtype]] = "auto",
+        batch_size: Optional[Union[int, str]] = 1,
+        max_batch_size: Optional[int] = 64,
+        trust_remote_code: Optional[bool] = False,
+        use_fast_tokenizer: Optional[bool] = True,
+        add_bos_token: Optional[bool] = False,
+        prefix_token_id: Optional[int] = None,
+        # arguments used for splitting a model across GPUs naively.
+        # only used if `parallelize=True`.
+        parallelize: Optional[bool] = False,
        device_map_option: Optional[str] = "auto",
        max_memory_per_gpu: Optional[Union[int, str]] = None,
        max_cpu_memory: Optional[Union[int, str]] = None,
-        offload_folder: Optional[str] = "./offload",
-        dtype: Optional[Union[str, torch.dtype]] = None,
-        device: Optional[Union[int, str]] = "cuda",
-        peft: str = None,
-        load_in_8bit: Optional[bool] = False,
-        load_in_4bit: Optional[bool] = False,
-        trust_remote_code: Optional[bool] = False,
-        gptq_use_triton: Optional[bool] = False,
-        bnb_4bit_quant_type: Optional[str] = None,
-        bnb_4bit_compute_dtype: Optional[Union[str, torch.dtype]] = None,
-    ):
-        """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.
-        Args:
-            pretrained (str):
-                The HuggingFace Hub model ID name or the path to a pre-trained
-                model to load. This is effectively the `pretrained_model_name_or_path`
-                argument of `from_pretrained` in the HuggingFace `transformers` API.
-            quantized (str or bool, optional, defaults to False):
-                File name of a GPTQ quantized model to load. Set to `True` to use the
-                default name of the quantized model.
-            add_special_tokens (bool, optional, defaults to True):
-                Whether to add special tokens to the input sequences. If `None`, the
-                default value will be set to `True` for seq2seq models (e.g. T5) and
-                `False` for causal models.
-                WARNING: Evaluating causal models with `add_special_tokens=True` is
-                currently __not__ supported.
-            > Large model loading `accelerate` arguments
-            use_accelerate (bool, optional, defaults to False):
-                If True, uses the `accelerate` library to load a large model across
-                multiple devices.
-            device_map_option (str, optional, defaults to "auto"):
-                The device map option to use when loading the model with
-                `accelerate`.
-                Options:
-                    "auto", "balanced", "balanced_low_0", "sequential"
-                See the `accelerate` docs for more details on these options:
-                https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.device_map
-            max_memory_per_gpu (Union[int, str], optional, defaults to None):
-                The maximum memory available for each GPU in bytes as `int` or in
-                the format f"{significand}{unit_symbol}" where {unit_symbol} is
-                any of ["GB", "MB", "GIB", "MIB"]. Refer to the `max_memory` arg in
-                the "Parameters for big model inference" section of the following
-                docs:
-                https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.max_memory
-            max_cpu_memory (Union[int, str], optional, defaults to None):
-                The maximum available CPU RAM in bytes as `int` or in the format
-                f"{significand}{unit_symbol}" where {unit_symbol} is any of
-                ["GB", "MB", "GIB", "MIB"]. Refer to the `max_memory` arg in the
-                "Parameters for big model inference" section of the following docs:
-                https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.max_memory
-            offload_folder (str, optional, defaults to "./offload"):
-                The folder to offload weights into if `device_map` contains any
-                "disk" value.
-            dtype (Union[str, torch.dtype], optional, defaults to None):):
-                Converts the model weights to `dtype`, if specified. Strings get
-                converted to `torch.dtype` objects (e.g. `float16` -> `torch.float16`).
-                Use `dtype="auto"` to derive the type from the model’s weights.
-            peft (str, optional, defaults to None):
-                Path of the adapter weights to load from Huggingface. This will usually
-                include a directory that includes the files `adapter_config.json` and
-                `adapter_model.bin`. Compatible with [PEFT](https://github.com/huggingface/peft)
-            load_in_8bit (bool, optional, defaults to False):
-                If True, will convert the loaded model into mixed-8bit quantized model. See:
-                https://huggingface.co/docs/transformers/main/en/main_classes/quantization#load-a-large-model-in-8bit
-            load_in_4bit (bool, optional, defaults to False):
-                If True, will convert the loaded model into mixed-4bit quantized model. See:
-                https://huggingface.co/docs/transformers/main/en/main_classes/quantization#load-a-large-model-in-4bit
-            trust_remote_code (bool, optional, defaults to False):
-                If True, will trust the remote code when loading the model.
-            gptq_use_triton (bool, optional, defaults to False):
-                Use Triton for GPTQ inference.
-            bnb_4bit_quant_type (str, optional, defaults to None): 
-                The quantization type to use for BnB 4bit quantization. See:
-                https://github.com/huggingface/transformers/blob/main/src/transformers/utils/quantization_config.py#L77
-            bnb_4bit_compute_dtype (Union[str, torch.dtype], optional, defaults to None):
-                The compute dtype to use for BnB 4bit quantization. See:
-                https://github.com/huggingface/transformers/blob/main/src/transformers/utils/quantization_config.py#L74
-
-        """
+        offload_folder: Optional[Union[str, os.PathLike]] = "./offload",
+        # PEFT, delta weights and quantization options
+        peft: Optional[str] = None,
+        delta: Optional[str] = None,
+        autogptq: Optional[Union[bool, str]] = False,
+        **kwargs,
+    ) -> None:
        super().__init__()

-        assert isinstance(pretrained, str)
-        assert isinstance(device, str)
-        assert isinstance(batch_size, (int, str))
-        if (
-            add_special_tokens is not None
-            and self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM
-        ):
-            # TODO: Support evaluating causal models with special tokens. Currently,
-            # this is not possible because the `_loglikelihood_tokens()` method for
-            # causal LMs makes a no-special-tokens assumption given that contexts
-            # and labels/continuations are tokenized separately without special
-            # tokens, concatenated, and then processed as inputs.
-            assert (
-                not add_special_tokens
-            ), "Evaluating causal models with `add_special_tokens=True` is currently not supported."
+        # optionally: take in an already-initialized transformers.PreTrainedModel
+        if not isinstance(pretrained, str):
+            eval_logger.warning(
+                "`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way."
+            )
+            assert not parallelize, "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`"
+            self._model = pretrained
+            self._device = self._model.device
+            self._config = self._model.config
+            gpus = 0
+
+            if tokenizer:
+                assert isinstance(
+                    tokenizer, transformers.PreTrainedTokenizer
+                ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
+                self.tokenizer = tokenizer
+            else:
+                # Get tokenizer
+                model_name = self._model.name_or_path
+                self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                    model_name,
+                    revision=revision,
+                    trust_remote_code=trust_remote_code,
+                    use_fast=use_fast_tokenizer,
+                )

-        # setup for automatic batch size detection
-        if str(batch_size).startswith("auto"):
-            batch_size = batch_size.split(":")
-            self._batch_size = batch_size[0]
-            self.batch_schedule = float(batch_size[1]) if len(batch_size) > 1 else 1
        else:
-            self._batch_size = int(batch_size)
-        self.max_batch_size = max_batch_size
+            assert isinstance(device, str)
+            assert isinstance(pretrained, str)
+            assert isinstance(batch_size, (int, str))
+
+            gpus = torch.cuda.device_count()
+            accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
+            accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
+            if accelerator.num_processes > 1:
+                self.accelerator = accelerator
+
+            if not (parallelize or accelerator.num_processes > 1):
+                # use user-passed device
+                device_list = set(
+                    ["cuda", "cpu"]
+                    + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
+                    + ["mps", "mps:0"]
+                )
+                if device and device in device_list:
+                    self._device = torch.device(device)
+                    eval_logger.info(f"Using device '{device}'")
+                    if device in ("mps", "mps:0") and version.parse(
+                        torch.__version__
+                    ) < version.parse("2.1"):
+                        raise RuntimeError(
+                            f"mps requires torch >= 2.1. You have {torch.__version__}"
+                        )
+                else:
+                    eval_logger.info("Device not specified")
+                    eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}")
+                    self._device = (
+                        torch.device("cuda")
+                        if torch.cuda.is_available()
+                        else torch.device("cpu")
+                    )
+            else:
+                if device != "cuda":
+                    eval_logger.info(
+                        f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
+                    )
+                # TODO: include in warning that `load_in_8bit` etc. affect this too
+                self._device = torch.device(device)

-        self._max_gen_toks = max_gen_toks
-        self._max_length = max_length
-        self._config = self.AUTO_CONFIG_CLASS.from_pretrained(
-            pretrained,
-            trust_remote_code=trust_remote_code,
-            revision=revision + ("/" + subfolder if subfolder is not None else ""),
+            # TODO: update this to be less of a hack once subfolder is fixed in HF
+            revision = revision + ("/" + subfolder if subfolder is not None else "")
+
+            self._get_config(
+                pretrained,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+            )
+
+        # determine which of 'causal' and 'seq2seq' backends to use
+        self._get_backend(
+            config=self.config, backend=backend, trust_remote_code=trust_remote_code
        )

-        self._add_special_tokens = add_special_tokens
-        self.tokenizer = self._create_auto_tokenizer(
-            pretrained=pretrained,
+        # load tokenizer so we know tokenizer vocabulary size before loading model and PEFT
+        self._create_tokenizer(
+            pretrained,
+            tokenizer,
            revision=revision,
-            subfolder=subfolder,
-            tokenizer=tokenizer,
-        )
-        self.tokenizer.model_max_length = self.max_length
-
-        model_kwargs = {}
-        if use_accelerate:
-            model_kwargs = _get_accelerate_args(
-                device_map_option,
-                max_memory_per_gpu,
-                max_cpu_memory,
-                offload_folder,
-            )
-        self.model = self._create_auto_model(
-            pretrained=pretrained,
-            quantized=quantized,
            trust_remote_code=trust_remote_code,
-            revision=revision,
-            subfolder=subfolder,
-            torch_dtype=_get_dtype(dtype, self._config),
-            gptq_use_triton=gptq_use_triton,
-            load_in_8bit=load_in_8bit,
-            load_in_4bit=load_in_4bit,
-            bnb_4bit_quant_type=bnb_4bit_quant_type,
-            bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
-            **model_kwargs,
+            use_fast_tokenizer=use_fast_tokenizer,
        )
-        # note: peft_path can be different than pretrained model path
-        if peft is not None:
-            self.model = self._create_auto_model_peft(
-                model=self.model,
-                peft=peft,
-                revision=revision,
-                subfolder=subfolder,
-                load_in_4bit=load_in_4bit,
-            )
-        self.model.eval()
-        torch.set_grad_enabled(False)
-
-        self._device = device
-        if use_accelerate and "lm_head" in self.model.hf_device_map:
-            # `accelerate` can place `lm_head` weights on a different device than
-            # the user specified one so we force `self._device` to be the same as
-            # `lm_head`'s.
-            self._device = self.model.hf_device_map["lm_head"]
-        if not use_accelerate and not (load_in_4bit or load_in_8bit):
-            try:
-                self.model.to(self._device)
-            except:
-                print("Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore.")

-    def _create_auto_model(
-        self,
-        *,
-        pretrained: str,
-        quantized: Optional[Union[bool, str]] = False,
-        revision: str,
-        subfolder: str,
-        device_map: Optional[Union[str, _DeviceMapping]] = None,
-        max_memory: Optional[dict] = None,
-        offload_folder: Optional[str] = None,
-        load_in_8bit: Optional[bool] = False,
-        load_in_4bit: Optional[bool] = False,
-        trust_remote_code: Optional[bool] = False,
-        torch_dtype: Optional[Union[str, torch.dtype]] = None,
-        gptq_use_triton: Optional[bool] = False,
-        bnb_4bit_quant_type: Optional[str] = None,
-        bnb_4bit_compute_dtype: Optional[Union[str, torch.dtype]] = None,
-    ) -> transformers.AutoModel:
-        """Returns a pre-trained pytorch model from a pre-trained model configuration."""
-        if not quantized:
-            if load_in_4bit:
-                assert transformers.__version__ >= "4.30.0", "load_in_4bit requires transformers >= 4.30.0"
-            model_kwargs = {}
-            if transformers.__version__ >= "4.30.0":
-                model_kwargs["load_in_4bit"] = load_in_4bit
-                if load_in_4bit:
-                    model_kwargs["bnb_4bit_quant_type"] = bnb_4bit_quant_type
-                    model_kwargs["bnb_4bit_compute_dtype"] = getattr(torch, bnb_4bit_compute_dtype)
-            model = self.AUTO_MODEL_CLASS.from_pretrained(
-                pretrained,
-                revision=revision + ("/" + subfolder if subfolder is not None else ""),
-                device_map=device_map,
-                max_memory=max_memory,
-                offload_folder=offload_folder,
-                load_in_8bit=load_in_8bit,
+        # if we passed `pretrained` as a string, initialize our model now
+        if isinstance(pretrained, str):
+            self._create_model(
+                pretrained=pretrained,
+                revision=revision,
+                dtype=dtype,
                trust_remote_code=trust_remote_code,
-                torch_dtype=torch_dtype,
-                **model_kwargs,
+                parallelize=parallelize,
+                device_map_option=device_map_option,
+                max_memory_per_gpu=max_memory_per_gpu,
+                max_cpu_memory=max_cpu_memory,
+                offload_folder=offload_folder,
+                peft=peft,
+                delta=delta,
+                autogptq=autogptq,
+                **kwargs,
            )
+
+        # access self._model through self.model property outside this method
+        if isinstance(self.model, torch.nn.Module):
+            self.model.eval()
+            self.model.tie_weights()
+
+        if isinstance(pretrained, str) and (gpus >= 1 or str(self.device) == "mps"):
+            # TODO: can remove this whole snippet except in the mps case, perhaps?
+            if not (parallelize or autogptq or hasattr(self, "accelerator")):
+                # place model onto device requested manually,
+                # if not using HF Accelerate or device_map
+                # or any other option that preloads model onto device
+                try:
+                    self.model.to(self.device)
+                except ValueError:
+                    eval_logger.debug(
+                        "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
+                    )
+
+        self.truncation = truncation
+        self.logits_cache = logits_cache
+        self.vocab_size = self.tokenizer.vocab_size
+        # select (or create) a pad token to use
+        if self.tokenizer.pad_token:
+            pass
+        elif self.tokenizer.unk_token:
+            self.tokenizer.pad_token_id = self.tokenizer.unk_token_id
+        elif self.tokenizer.eos_token:
+            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
        else:
-            from auto_gptq import AutoGPTQForCausalLM
-            model = AutoGPTQForCausalLM.from_quantized(
-                pretrained,
-                model_basename=None if quantized == True else Path(quantized).stem,
-                device_map=device_map,
-                max_memory=max_memory,
-                trust_remote_code=trust_remote_code,
-                use_safetensors=True if quantized == True else quantized.endswith('.safetensors'),
-                use_triton=gptq_use_triton,
-                warmup_triton=gptq_use_triton,
+            if getattr(self.config, "model_type", None) == "qwen":
+                # Qwen's trust_remote_code tokenizer does not allow for adding special tokens
+                self.tokenizer.pad_token = "<|endoftext|>"
+            elif (
+                self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
+                or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
+            ):
+                # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0)
+                # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer
+                # ---
+                # Note that the world tokenizer class name, might change in the future for the final huggingface merge
+                # https://github.com/huggingface/transformers/pull/26963
+                assert self.tokenizer.pad_token_id == 0
+            else:
+                self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
+
+        # TODO: override this for Gemma
+        self.add_bos_token = add_bos_token
+        if getattr(self.config, "model_type", None) == "gemma":
+            self.add_bos_token = True
+            eval_logger.info(
+                f"Model type is '{self.config.model_type}', a BOS token will be used as Gemma underperforms without it."
            )
-        return model

-    def _create_auto_model_peft(
-        self,
-        *,
-        model: transformers.PreTrainedModel,
-        peft: str,
-        revision: str,
-        subfolder: str,
-        load_in_4bit: Optional[bool] = False,
-    ):
-        if load_in_4bit:
-            assert PEFT_VERSION >= "0.4.0", "load_in_4bit requires peft >= 0.4.0"
-        model = self.AUTO_PEFT_CLASS.from_pretrained(
-            model,
-            peft,
-            revision=revision + ("/" + subfolder if subfolder is not None else ""),
-        )
-        return model
-
-    def _create_auto_tokenizer(
-        self,
-        *,
-        pretrained: str,
-        revision: str,
-        subfolder: str,
-        tokenizer: Optional[str] = None,
-    ) -> transformers.PreTrainedTokenizer:
-        """Returns a pre-trained tokenizer from a pre-trained tokenizer configuration."""
-        tokenizer = self.AUTO_TOKENIZER_CLASS.from_pretrained(
-            pretrained if tokenizer is None else tokenizer,
-            revision=revision + ("/" + subfolder if subfolder is not None else ""),
-        )
-        tokenizer.pad_token = tokenizer.eos_token
-        return tokenizer
+        self._max_length = max_length
+        self.pretrained = pretrained
+        self.delta = delta
+        self.peft = peft
+        self.revision = revision
+        self.batch_schedule = 1
+        self.batch_sizes = {}
+        self.max_batch_size = max_batch_size

-    @property
-    def add_special_tokens(self) -> bool:
-        """Whether to include special tokens in encoded text. This should be
-        determined by whether or not the model was trained with special tokens.
-        TODO: Remove these conditionals once HuggingFace supports a way to
-        check whether or not an arbitrary model was trained with special tokens.
-        """
-        if self._add_special_tokens is not None:
-            return self._add_special_tokens
-        elif self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM:
-            return False
-        elif self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM:
-            return True
+        if str(batch_size).startswith("auto"):
+            batch_size = batch_size.split(":")
+            self.batch_size_per_gpu = batch_size[0]
+            self.batch_schedule = float(batch_size[1]) if len(batch_size) > 1 else 1
        else:
-            raise ValueError(
-                "Could not determine `add_special_tokens` value from the model "
-                "class. Set to `True` or `False` depending on whether the model "
-                "was pre-trained with special tokens."
+            self.batch_size_per_gpu = int(batch_size)
+
+        if isinstance(pretrained, str):
+            # multigpu data-parallel support when launched with accelerate
+            if gpus > 1:
+                if parallelize:
+                    if accelerator.num_processes > 1:
+                        raise RuntimeError(
+                            "Attempted to use both a HF Accelerate `device_map` and to launch via `accelerate launch`. If this is the case, please either remove `parallelize=True` from --model_args or launch outside of the Accelerate launcher."
+                        )
+                    else:
+                        pass
+                elif accelerator.num_processes == 1:
+                    # if we aren't launching via accelerate, ditch
+                    self._rank = 0
+                    self._world_size = 1
+                else:
+                    if gpus > accelerator.num_processes:
+                        eval_logger.warning(
+                            "WARNING: The number of total system GPUs does not match the number of spawned processes. "
+                            "If you would like to use data parallelism, please launch the script "
+                            "with 'accelerate launch *script*'. "
+                            f"Current run will proceed with {accelerator.num_processes} devices."
+                        )
+                    assert (
+                        accelerator.distributed_type
+                        in [
+                            DistributedType.FSDP,
+                            DistributedType.MULTI_GPU,
+                        ]
+                    ), "Unsupported distributed type provided. Only DDP and FSDP are supported."
+                    if accelerator.distributed_type == DistributedType.FSDP:
+                        self._model = accelerator.prepare(self.model)
+                    else:
+                        self._model = accelerator.prepare_model(
+                            self.model, evaluation_mode=True
+                        )
+                    self._device = torch.device(
+                        f"cuda:{accelerator.local_process_index}"
+                    )
+                    self.accelerator = accelerator
+
+                    if self.accelerator.is_local_main_process:
+                        eval_logger.info(f"Using {gpus} devices with data parallelism")
+
+                    self._rank = self.accelerator.local_process_index
+                    self._world_size = self.accelerator.num_processes
+        else:
+            # if a PreTrainedModel was passed into HFLM, we forgo distributed setup.
+            eval_logger.warning(
+                "Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration"
            )
+            self._rank = 0
+            self._world_size = 1
+
+        self.custom_prefix_token_id = prefix_token_id
+        if prefix_token_id is not None:
+            eval_logger.info(
+                f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
+            )
+
+    @property
+    def config(self):
+        """Return the config object stored in `self._config`."""
+        # return the associated transformers.AutoConfig for the given pretrained model.
+        return self._config

    @property
-    def eot_token(self) -> str:
-        return self.tokenizer.eos_token
+    def model(self):
+        """Return `self._model`, passed through `self.accelerator.unwrap_model` when an `accelerator` attribute exists."""
+        # returns the model, unwrapping it if using Accelerate
+        if hasattr(self, "accelerator"):
+            return self.accelerator.unwrap_model(self._model)
+        else:
+            return self._model

    @property
-    def eot_token_id(self) -> int:
+    def eot_token_id(self):
+        """Return the tokenizer's `eos_token_id`, used here as the end-of-text token id."""
+        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
        return self.tokenizer.eos_token_id

    @property
-    def max_gen_toks(self) -> int:
-        return self._max_gen_toks
+    def prefix_token_id(self):
+        """
+        Return the token id used as the loglikelihood prefix.
+
+        Resolution order: `self.custom_prefix_token_id` if it is not None,
+        else the tokenizer's `bos_token_id` if it is not None,
+        else the tokenizer's `eos_token_id`.
+        """
+        # it is used as prefix for loglikelihood
+        if self.custom_prefix_token_id is not None:
+            return self.custom_prefix_token_id
+        if self.tokenizer.bos_token_id is not None:
+            return self.tokenizer.bos_token_id
+        return self.tokenizer.eos_token_id

    @property
-    def max_length(self) -> int:
-        """Return the maximum sequence length of the model.
-        NOTE: Different model configurations have different max sequence length
-        attribute names.
-            - n_positions: (CTRLConfig, T5Config)
-            - max_position_embeddings: (BartConfig, RoFormerConfig)
-            - n_ctx: (GPT2Config)
-        NOTE: For relative position encoded models you should specify the max
-        sequence length of the model in the constructor via `max_length`.
-        """
-        if self._max_length is not None:
+    def max_length(self):
+        if self._max_length:  # if max length manually set, return it
            return self._max_length
-        # Try to get the sequence length from the model config.
        seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
        for attr in seqlen_config_attrs:
-            if hasattr(self._config, attr):
-                return getattr(self._config, attr)
+            if hasattr(self.model.config, attr):
+                return getattr(self.model.config, attr)
        if hasattr(self.tokenizer, "model_max_length"):
            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
                return self._DEFAULT_MAX_LENGTH
@@ -396,360 +396,931 @@ class HuggingFaceAutoLM(BaseLM):
        return self._DEFAULT_MAX_LENGTH

    @property
-    def batch_size(self) -> int:
-        # TODO: Add adaptive batch size.
-        return self._batch_size  # * gpus
+    def max_gen_toks(self) -> int:
+        """Return the fixed value 256 (presumably the default cap on generated tokens; confirm against callers)."""
+        return 256

    @property
-    def device(self) -> Union[int, str, torch.device]:
+    def batch_size(self):
+        """Return `self.batch_size_per_gpu` as set in `__init__` (an int, or the leading "auto..." string of an auto batch-size spec)."""
+        return self.batch_size_per_gpu
+
+    @property
+    def device(self):
+        """Return the device stored in `self._device`."""
        return self._device

-    def tok_encode(self, string: str) -> TokenSequence:
-        # TODO: Merge `tok_encode_batch` here.
-        return self.tokenizer.encode(string, add_special_tokens=self.add_special_tokens)
+    @property
+    def rank(self):
+        """Return `self._rank`: 0 for single-process use, or the accelerator's local process index as assigned in `__init__`."""
+        return self._rank

-    def tok_encode_batch(self, strings: List[str]) -> TokenSequence:
-        return self.tokenizer(
-            strings,
-            padding=True,
-            add_special_tokens=self.add_special_tokens,
-            return_tensors="pt",
+    @property
+    def world_size(self):
+        """Return `self._world_size`: 1 for single-process use, or the accelerator's process count as assigned in `__init__`."""
+        return self._world_size
+
+    def _get_backend(
+        self,
+        config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
+        backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
+        trust_remote_code: Optional[bool] = False,
+    ) -> None:
+        """
+        Helper method during initialization.
+        Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder))
+        model type to be used, and stores the matching transformers auto class
+        in `self.AUTO_MODEL_CLASS`.
+
+        `config` is the model config whose `model_type` drives the choice when
+        `backend` is "default"; "causal" / "seq2seq" force the choice instead.
+        `trust_remote_code` only controls whether a warning is logged when the
+        model type is found in neither registry (the fallback is still causal).
+        """
+        assert backend in ["default", "causal", "seq2seq"]
+
+        if backend != "default":
+            # if we've settled on non-default backend, use that manually
+            if backend == "causal":
+                self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
+            elif backend == "seq2seq":
+                self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
+            eval_logger.info(
+                f"Overrode HF model backend type, and using type '{backend}'"
+            )
+        else:
+            # determine and use the default HF backend for this model, based on its config + metadata.
+            # Always read from the `config` argument so both registry checks look at
+            # the same object, rather than mixing it with `self.config`.
+            model_type = getattr(config, "model_type")
+            if model_type in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES:
+                # first check if model type is listed under seq2seq models, since some
+                # models like MBart are listed in both seq2seq and causal mistakenly in HF transformers.
+                # these special cases should be treated as seq2seq models.
+                self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
+            elif model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
+                self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
+            else:
+                if not trust_remote_code:
+                    # adjacent literals instead of a backslash continuation, which
+                    # would embed the source indentation in the logged message
+                    eval_logger.warning(
+                        "HF model type is neither marked as CausalLM or Seq2SeqLM. "
+                        "This is expected if your model requires `trust_remote_code=True` but may be an error otherwise."
+                    )
+                # if model type is neither in HF transformers causal or seq2seq model registries
+                # then we default to AutoModelForCausalLM
+                self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
+
+        assert self.AUTO_MODEL_CLASS in [
+            transformers.AutoModelForCausalLM,
+            transformers.AutoModelForSeq2SeqLM,
+        ]
+        return None
+
+    def _get_config(
+        self,
+        pretrained: str,
+        revision: str = "main",
+        trust_remote_code: bool = False,
+    ) -> None:
+        """
+        Helper method during initialization.
+        Loads the config for `pretrained` via `transformers.AutoConfig.from_pretrained`
+        (forwarding `revision` and `trust_remote_code`) and stores it in `self._config`.
+        """
+        self._config = transformers.AutoConfig.from_pretrained(
+            pretrained,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
        )

-    def tok_decode(self, tokens: torch.LongTensor) -> List[str]:
-        return self.tokenizer.batch_decode(tokens, skip_special_tokens=True)
+    def _create_model(
+        self,
+        pretrained: str,
+        revision: Optional[str] = "main",
+        dtype: Optional[Union[str, torch.dtype]] = "auto",
+        trust_remote_code: Optional[bool] = False,
+        # arguments used for splitting a model across GPUs naively.
+        # only used if `parallelize=True`.
+        # (accelerate naive PP (device_map) options)
+        parallelize: Optional[bool] = False,
+        device_map_option: Optional[str] = "auto",
+        max_memory_per_gpu: Optional[Union[int, str]] = None,
+        max_cpu_memory: Optional[Union[int, str]] = None,
+        offload_folder: Optional[str] = "./offload",
+        # PEFT, delta weights and quantization options
+        peft: Optional[str] = None,
+        delta: Optional[str] = None,
+        autogptq: Optional[Union[bool, str]] = False,
+        **kwargs,
+    ) -> None:
+        """
+        Initializes an HF or HF-compatible PreTrainedModel from scratch
+        inside HFLM, using the kwargs passed into self.__init__().

-    def greedy_until(
-        self, requests: List[Tuple[str, Union[List[str], str]]]
-    ) -> List[str]:
-        def _collate(x):
-            tokens = self.tok_encode(x[0])
-            return len(tokens), x[0]
+        Also handles functionality such as AutoGPTQ usage and PEFT wrapping.

-        results = []
-        reorder = utils.Reorderer(requests, _collate)
+        For future similar extensions to AutoGPTQ that are not core to HF's ecosystem,
+        (such as PyTorch models that are nearly, but not quite, fully mirroring
+        HF's public interface relied on in this HFLM class)
+        please consider subclassing HFLM and overriding this and other methods as needed.
+
+        The loaded model is stored in `self._model`; nothing is returned.
+        Extra `**kwargs` are forwarded to the underlying `from_pretrained` /
+        `from_quantized` call. `peft` and `delta` are mutually exclusive and
+        raise ValueError when both are given. `autogptq` is either True or the
+        name of the quantized weights file.
+        """

-        adaptive_batch_size = None
-        if self.batch_size == "auto":
-            # using rolling window with maximum context
-            print("Passed argument batch_size = auto. Detecting largest batch size")
-            batch_size = self._detect_batch_size()
-            print(f"Determined Largest batch size: {batch_size}")
-            adaptive_batch_size = batch_size
+        model_kwargs = kwargs if kwargs else {}

-        for chunk in utils.chunks(
-            tqdm(reorder.get_reordered(), disable=False),
-            self.batch_size if self.batch_size != "auto" else adaptive_batch_size,
-        ):
-            context = [c[0] for c in chunk]
-            request_args = chunk[0][1]
-            stop = request_args.get("until", None)
-            stop_sequences = stop if isinstance(stop, list) else [stop]
-            max_generation_length = request_args.get("max_length", None)
+        if parallelize:
+            model_kwargs.update(
+                _get_accelerate_args(
+                    device_map_option,  # TODO: phase out device_map_option?
+                    max_memory_per_gpu,
+                    max_cpu_memory,
+                    offload_folder,
+                )
+            )
+        elif "device_map" not in model_kwargs:
+            # set a device_map to initialize model on the right GPU.
+            # this is needed because it seems that the default behavior
+            # for quantized models now seems to be device_map="auto"
+            # which breaks data-parallel mode.
+            if hasattr(self, "accelerator"):
+                model_kwargs.update(
+                    {"device_map": {"": f"cuda:{self.accelerator.local_process_index}"}}
+                )
+            else:
+                model_kwargs.update({"device_map": {"": str(self.device)}})

-            assert (
-                isinstance(max_generation_length, int) or max_generation_length is None
+        if not autogptq:
+            # compare parsed versions: a plain string comparison is lexicographic
+            # and would wrongly rank e.g. "4.4.0" above "4.30.0"
+            supports_4bit = version.parse(transformers.__version__) >= version.parse(
+                "4.30.0"
+            )
+            if model_kwargs.get("load_in_4bit", None):
+                assert supports_4bit, "load_in_4bit requires transformers >= 4.30.0"
+            if supports_4bit:
+                if model_kwargs.get("load_in_4bit", None):
+                    if model_kwargs.get("bnb_4bit_compute_dtype", None):
+                        model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(
+                            model_kwargs["bnb_4bit_compute_dtype"]
+                        )
+            self._model = self.AUTO_MODEL_CLASS.from_pretrained(
+                pretrained,
+                revision=revision,
+                torch_dtype=get_dtype(dtype),
+                trust_remote_code=trust_remote_code,
+                **model_kwargs,
            )
-            assert isinstance(stop_sequences, list) or stop_sequences is None
+        else:
+            try:
+                from auto_gptq import AutoGPTQForCausalLM
+            except ModuleNotFoundError as exception:
+                # one message string (two positional args would render as a tuple),
+                # chained to the original import failure
+                raise Exception(
+                    "Tried to load auto_gptq, but auto-gptq is not installed "
+                    "please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]"
+                ) from exception

-            # TODO: Find a better way to handle stop sequences for 0-shot.
-            if stop_sequences is None:
-                until = [self.eot_token]
-            else:
-                until = stop_sequences + [self.eot_token]
+            self._model = AutoGPTQForCausalLM.from_quantized(
+                pretrained,
+                trust_remote_code=trust_remote_code,
+                model_basename=None if autogptq is True else Path(autogptq).stem,
+                use_safetensors=True
+                if autogptq is True
+                else autogptq.endswith(".safetensors"),
+                **model_kwargs,
+            )

-            if max_generation_length is None:
-                max_tokens = self.max_gen_toks
-            else:
-                max_tokens = max_generation_length
+        if peft and delta:
+            raise ValueError(
+                "Cannot use both 'peft' and 'delta' options at the same time."
+            )
+
+        if peft:
+            if model_kwargs.get("load_in_4bit", None):
+                if version.parse(PEFT_VERSION) < version.parse("0.4.0"):
+                    raise AssertionError("load_in_4bit requires peft >= 0.4.0")
+            if self._model.config.vocab_size != len(self.tokenizer):
+                # log before resizing: resizing updates `config.vocab_size`, so
+                # logging afterwards would report two identical sizes
+                eval_logger.info(
+                    f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
+                )
+                # resize model for LoRAs with added tokens
+                self._model.resize_token_embeddings(len(self.tokenizer))
+            self._model = PeftModel.from_pretrained(
+                self._model, peft, revision=revision
+            )
+        elif delta:
+            if autogptq:
+                eval_logger.warning(
+                    "Delta weights might trigger unexpected behavior when used with AutoGPTQ."
+                )
+            _model_delta = self.AUTO_MODEL_CLASS.from_pretrained(
+                delta,
+                revision=revision,
+                torch_dtype=get_dtype(dtype),
+                trust_remote_code=trust_remote_code,
+                **model_kwargs,
+            )
+            # build the delta state dict once instead of once per parameter
+            delta_state = _model_delta.state_dict()
+            for name, param in self._model.state_dict().items():
+                try:
+                    param.data += delta_state[name]
+                except KeyError:
+                    raise KeyError(f"Delta model is missing weights for layer: {name}")
+                except Exception as e:
+                    raise RuntimeError(
+                        f"Failed to add delta weights to layer {name}. Error: {e}"
+                    )

-            token_context = self.tok_encode_batch(context)
+            del delta_state
+            del _model_delta

-            responses = self._model_generate(
-                inputs=token_context,
-                max_tokens=max_tokens,
-                stop=until,
+        return None
+
+    def _create_tokenizer(
+        self,
+        pretrained: Union[str, transformers.PreTrainedModel],
+        tokenizer: Optional[
+            Union[
+                str,
+                transformers.PreTrainedTokenizer,
+                transformers.PreTrainedTokenizerFast,
+            ]
+        ],
+        revision: Optional[str] = "main",
+        trust_remote_code: Optional[bool] = False,
+        use_fast_tokenizer: Optional[bool] = True,
+    ) -> None:
+        """
+        Helper method during initialization.
+
+        Create a tokenizer object corresponding to the correct
+        tokenizer for value of `pretrained`, or use the pre-initialized tokenizer passed.
+
+        The result is stored in `self.tokenizer`; nothing is returned.
+        - `tokenizer` given as a string: loaded with `AutoTokenizer.from_pretrained`.
+        - `tokenizer` given as an object: must be a `PreTrainedTokenizer` or
+          `PreTrainedTokenizerFast` and is used as-is.
+        - `tokenizer` falsy: loaded from `pretrained` when it is a string,
+          otherwise from `self.model.name_or_path`.
+        `revision`, `trust_remote_code` and `use_fast_tokenizer` are forwarded
+        to `from_pretrained` (the latter as `use_fast`).
+        """
+
+        if tokenizer:
+            if isinstance(tokenizer, str):
+                self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                    tokenizer,
+                    revision=revision,
+                    trust_remote_code=trust_remote_code,
+                    use_fast=use_fast_tokenizer,
+                )
+            else:
+                # a non-string tokenizer must already be an initialized HF tokenizer
+                assert isinstance(
+                    tokenizer, transformers.PreTrainedTokenizer
+                ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
+                self.tokenizer = tokenizer
+        else:
+            # Get tokenizer based on 'pretrained'
+            if isinstance(pretrained, str):
+                model_name = pretrained
+            else:
+                # get the HF hub name via accessor on model
+                model_name = self.model.name_or_path
+            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                model_name,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+                use_fast=use_fast_tokenizer,
            )
-            responses = self.tok_decode(responses.tolist())
+        return None

-            for response in responses:
-                # Ensure the generated responses do not contain the stop sequences.
-                for term in until:
-                    response = response.split(term)[0]
-                # partial caching
-                self.cache_hook.add_partial("greedy_until", (context, until), response)
-                results.append(response)
-        return reorder.get_original(results)
+    def _detect_batch_size(self, requests=None, pos: int = 0):
+        """
+        Probe for the largest batch size that runs without failing.
+
+        Runs dummy all-ones batches through `self._model_call`, starting from
+        `self.max_batch_size`, under `find_executable_batch_size`. When
+        `requests` is given, the probe lengths come from the encoded
+        context/continuation of `requests[pos]` (clipped to `self.max_length`);
+        otherwise `self.max_length` is used for every length. With more than
+        one process, the minimum over all gathered batch sizes is returned.
+        """
+        model_max_len = self.max_length
+        if not requests:
+            max_length = max_context_enc = max_cont_enc = model_max_len
+        else:
+            _, context_enc, continuation_enc = requests[pos]
+            window = model_max_len + 1
+            joined = context_enc + continuation_enc
+            max_length = len(joined[-window:][:-1])
+            max_context_enc = len(context_enc[-window:])
+            max_cont_enc = len(continuation_enc[-window:])
+
+        # if OOM, then halves batch_size and tries again
+        @find_executable_batch_size(starting_batch_size=self.max_batch_size)
+        def forward_batch(batch_size):
+            if self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+                length = max(max_context_enc, max_cont_enc)
+                shape = (batch_size, length)
+                batched_conts = torch.ones(shape, device=self.device).long()
+                test_batch = torch.ones(shape, device=self.device).long()
+                call_kwargs = {"attn_mask": test_batch, "labels": batched_conts}
+            else:
+                call_kwargs = {}
+                shape = (batch_size, max_length)
+                test_batch = torch.ones(shape, device=self.device).long()
+            for _ in range(5):
+                # the result stays bound to `out` so the probe keeps the same
+                # memory footprint across iterations
+                logits = self._model_call(test_batch, **call_kwargs)
+                out = F.log_softmax(logits, dim=-1)  # noqa: F841
+
+            return batch_size
+
+        try:
+            batch_size = forward_batch()
+        except RuntimeError as err:
+            if "No executable batch size found" not in str(err):
+                raise
+            # nothing fit, fall back to the smallest possible batch
+            batch_size = 1
+
+        if self.world_size > 1:
+            # if multi-GPU, always take minimum over all selected batch sizes
+            local_bs = torch.tensor([batch_size], device=self.device)
+            all_bs = self.accelerator.gather(local_bs).cpu().detach().numpy().tolist()
+            batch_size = min(all_bs)
+
+        clear_torch_cache()
+        return batch_size
+
+    def tok_encode(
+        self, string: str, left_truncate_len=None, add_special_tokens=None
+    ) -> List[int]:
+        """Encode ``string`` into token ids with ``self.tokenizer.encode``.
+
+        :param string: text to tokenize
+        :param left_truncate_len: if truthy, only the last ``left_truncate_len``
+            tokens of the encoding are kept
+        :param add_special_tokens: explicit value forwarded to the tokenizer as
+            ``add_special_tokens``. When ``None``, causal LMs forward
+            ``self.add_bos_token`` and every other model class forwards nothing,
+            leaving the tokenizer's own default in effect
+        :return: the (possibly left-truncated) encoding
+        """
+        # default for None - empty dict, use predefined tokenizer param
+        # used for all models except for CausalLM or predefined value
+        special_tokens_kwargs = {}
+
+        # by default for CausalLM - false or self.add_bos_token is set
+        # (`False or x` evaluates to `x`, so this is just `self.add_bos_token`)
+        if add_special_tokens is None:
+            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                special_tokens_kwargs = {
+                    "add_special_tokens": False or self.add_bos_token
+                }
+        # otherwise the method explicitly defines the value
+        else:
+            special_tokens_kwargs = {"add_special_tokens": add_special_tokens}

+        encoding = self.tokenizer.encode(string, **special_tokens_kwargs)

-class AutoCausalLM(HuggingFaceAutoLM):
-    """Causal language modeling.
-    You can find a set of supported models in the HF documentation:
-    https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModelForCausalLM
-    """
+        # left-truncate the encoded context to be at most `left_truncate_len` tokens long
+        # (a value of 0 or None disables truncation because of the truthiness test)
+        if left_truncate_len:
+            encoding = encoding[-left_truncate_len:]

-    AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
-    AUTO_PEFT_CLASS = peft.PeftModel
+        return encoding

-    def _create_auto_tokenizer(
+    def tok_batch_encode(
        self,
-        *,
-        pretrained: str,
-        revision: str,
-        subfolder: str,
-        tokenizer: Optional[str] = None,
-    ) -> transformers.PreTrainedTokenizer:
-        tokenizer = super()._create_auto_tokenizer(
-            pretrained=pretrained,
-            revision=revision,
-            subfolder=subfolder,
-            tokenizer=tokenizer,
+        strings: List[str],
+        padding_side: str = "left",
+        left_truncate_len: int = None,
+        truncation: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Tokenize a batch of strings into padded tensors.
+
+        :param strings: texts to tokenize together
+        :param padding_side: value temporarily assigned to
+            ``self.tokenizer.padding_side`` for this call
+        :param left_truncate_len: if truthy, only the last ``left_truncate_len``
+            columns of ``input_ids`` and ``attention_mask`` are kept
+        :param truncation: forwarded to the tokenizer call as ``truncation``
+        :return: ``(input_ids, attention_mask)`` taken from the tokenizer output
+        """
+        # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
+        # remember the tokenizer's padding side so it can be put back afterwards
+        # NOTE(review): the restore below is skipped if the tokenizer call raises
+        old_padding_side = self.tokenizer.padding_side
+        self.tokenizer.padding_side = padding_side
+
+        # causal LMs forward `self.add_bos_token` (`False or x` is just `x`);
+        # other model classes keep the tokenizer's default special-token handling
+        add_special_tokens = {}
+        if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+            add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
+
+        encoding = self.tokenizer(
+            strings,
+            truncation=truncation,
+            padding="longest",
+            return_tensors="pt",
+            **add_special_tokens,
        )
-        tokenizer.padding_side = "left"
-        return tokenizer
+        # keep only the rightmost `left_truncate_len` positions of every row
+        if left_truncate_len:
+            encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
+            encoding["attention_mask"] = encoding["attention_mask"][
+                :, -left_truncate_len:
+            ]
+        self.tokenizer.padding_side = old_padding_side

-    def _model_call(
-        self, inputs: TokenSequence, labels: Optional[TokenSequence] = None
-    ) -> TokenSequence:
-        return self.model(inputs)["logits"]
+        return encoding["input_ids"], encoding["attention_mask"]

-    def _model_generate(
-        self,
-        inputs: transformers.BatchEncoding,
-        max_tokens: int,
-        stop: Optional[List[str]] = None,
-    ) -> TokenSequence:
-        # Ensure that the context does not encroach into the `space`
-        # for the generation.
-        input_ids = inputs["input_ids"][:, self.max_gen_toks - self.max_length :]
-        attention_mask = inputs["attention_mask"][
-            :, self.max_gen_toks - self.max_length :
-        ]
-        input_ids = input_ids.to(self.device)
-        attention_mask = attention_mask.to(self.device)
+    def tok_decode(self, tokens, skip_special_tokens=True):
+        """Decode token ids back to text via ``self.tokenizer.decode``.
+
+        :param tokens: token ids, handed to the tokenizer unchanged
+        :param skip_special_tokens: forwarded to the tokenizer's ``decode`` call
+        :return: whatever ``self.tokenizer.decode`` returns
+        """
+        decoded = self.tokenizer.decode(
+            tokens, skip_special_tokens=skip_special_tokens
+        )
+        return decoded

+    def _model_call(self, inps, attn_mask=None, labels=None):
+        """Run one gradient-free forward pass and return the model's logits.
+
+        :param inps: torch.Tensor
+            A torch tensor of shape [batch, (sequence_ctx + sequence_cont)] or of shape
+            [batch, sequence_ctx]. the size of sequence may vary from call to call
+        :param attn_mask: torch.Tensor, optional
+            A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed
+            (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM
+        :param labels: torch.Tensor, optional
+            A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed
+            (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM
+        :return:
+            A torch tensor of shape [batch, sequence, vocab] with the
+            logits returned from the model's decoder
+        """
+        with torch.no_grad():
+            if attn_mask is None and labels is None:
+                # plain decoder-only call: only the input ids are supplied
+                assert self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
+                return self.model(inps).logits
+
+            # encoder-decoder call: mask and labels must be supplied together
+            assert attn_mask is not None and labels is not None
+            assert self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM
+            outputs = self.model(
+                input_ids=inps, attention_mask=attn_mask, labels=labels
+            )
+            return outputs.logits
+
+    def _model_generate(self, context, max_length, stop, **generation_kwargs):
+        """Generate from ``context`` with ``self.model.generate``.
+
+        :param context: input ids; ``context.shape[1]`` and ``context.shape[0]``
+            are passed on to ``stop_sequences_criteria``
+        :param max_length: forwarded to ``generate`` as ``max_length``
+        :param stop: stop sequences handed to ``stop_sequences_criteria``
+        :param generation_kwargs: extra keyword arguments forwarded to
+            ``generate``; ``temperature`` and ``do_sample`` are normalised first
+        :return: the value returned by ``self.model.generate``
+        """
+        # temperature = 0.0 if not set
+        # if do_sample is false and temp==0.0:
+        # remove temperature, as do_sample=False takes care of this
+        # and we don't want a warning from HF
+        generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
+        do_sample = generation_kwargs.get("do_sample", None)
+
+        # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
+        if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
+            generation_kwargs["do_sample"] = do_sample = False
+
+        # with sampling off, a zero temperature is dropped rather than forwarded
+        if do_sample is False and generation_kwargs.get("temperature") == 0.0:
+            generation_kwargs.pop("temperature")
+        # build stopping criteria
        stopping_criteria = stop_sequences_criteria(
-            self.tokenizer, stop, input_ids.shape[1], input_ids.shape[0]
+            self.tokenizer, stop, context.shape[1], context.shape[0]
        )
-
-        generations = self.model.generate(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            # GPT style models require the `generate` `max_length` arg to include the
-            # context length, so we instead set `max_new_tokens` which is the number
-            # of new tokens to generate, excluding the current number of tokens.
-            max_new_tokens=max_tokens,
+        return self.model.generate(
+            input_ids=context,
+            max_length=max_length,
            stopping_criteria=stopping_criteria,
-            do_sample=False,
-        )
-        return utils.select_continuation_from_batch_left_padding(
-            generations, max_context_size=inputs["input_ids"].size(1)
+            pad_token_id=self.tokenizer.pad_token_id,
+            use_cache=True,
+            **generation_kwargs,
        )

+    def _select_cont_toks(
+        self, logits: torch.Tensor, contlen: int = None, inplen: int = None
+    ) -> torch.Tensor:
+        """Slice ``logits`` down to the positions that score the continuation.
+
+        :param logits: logits for one request; sliced along its first dimension
+        :param contlen: number of continuation tokens (must be truthy)
+        :param inplen: input length; required for causal LMs and must be
+            falsy for seq2seq models
+        :return: ``logits[inplen - contlen : inplen]`` for causal LMs,
+            ``logits[:contlen]`` for seq2seq models, and ``logits`` unchanged
+            for any other ``AUTO_MODEL_CLASS``
+        """
+        if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+            assert (
+                contlen and inplen
+            ), "Must pass input len and cont. len to select scored logits for causal LM"
+            # discard right-padding.
+            # also discard the input/context tokens. we'll only score continuations.
+            logits = logits[inplen - contlen : inplen]
+        elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+            assert (
+                contlen and not inplen
+            ), "Selecting scored logits for Seq2SeqLM requires only cont. len"
+            # only discard right-padding.
+            # the logits input to this fn only contain decoder-side tokens.
+            logits = logits[:contlen]

-class AutoSeq2SeqLM(HuggingFaceAutoLM):
-    """Seq2Seq language modeling.
-    You can find a set of supported models in the following documentation:
-    https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModelForSeq2SeqLM
-    """
-
-    AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
-    AUTO_PEFT_CLASS = peft.PeftModel
+        return logits

-    def loglikelihood(
-        self, requests: List[Tuple[str, str]]
-    ) -> List[Tuple[float, bool]]:
-        new_requests = []
-        for chunk in utils.chunks(requests, self.batch_size):
-            context, continuation = zip(*chunk)
+    def loglikelihood_rolling(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[float]:
+        """Score whole strings by summing log-likelihoods over rolling windows.
+
+        :param requests: instances whose ``args`` is a one-element tuple holding
+            the string to score
+        :param disable_tqdm: suppress the progress bar (it is also suppressed
+            on every rank other than 0)
+        :return: one summed log-likelihood per request, in request order
+        """
+        loglikelihoods = []

-    def loglikelihood(
-        self, requests: List[Tuple[str, str]]
-    ) -> List[Tuple[float, bool]]:
-        new_requests = []
-        for chunk in utils.chunks(requests, self.batch_size):
-            context, continuation = zip(*chunk)
+        adaptive_batch_size = None
+        if self.batch_size == "auto":
+            # using rolling window with maximum context
+            print("Passed argument batch_size = auto. Detecting largest batch size")
+            batch_size = self._detect_batch_size()
+            print(f"Determined Largest batch size: {batch_size}")
+            adaptive_batch_size = batch_size

-            # Fill empty contexts with the EOT token.
-            context = [
-                f"{self.eot_token}" if len(text) == 0 else text for text in context
-            ]
-            context_enc = self.tok_encode_batch(context)
-            for key in context_enc:
-                context_enc[key] = context_enc[key][:, -self.max_length :]
-
-            # Remove leading whitespace introduced by the default
-            # `text_target_separator` since the context and continuation
-            # will not be concatenated as a single (decoder) input.
-            continuation = [text.lstrip() for text in continuation]
-            continuation_enc = self.tok_encode_batch(list(continuation))
-            for key in continuation_enc:
-                continuation_enc[key] = continuation_enc[key][:, -self.max_length :]
-
-            new_requests.append(
-                ((context, continuation), context_enc, continuation_enc)
-            )
-        return self._loglikelihood_tokens(new_requests)
+        adaptive_batch_size = None
+        if self.batch_size == "auto":
+            # using rolling window with maximum context
+            print("Passed argument batch_size = auto. Detecting largest batch size")
+            batch_size = self._detect_batch_size()
+            print(f"Determined Largest batch size: {batch_size}")
+            adaptive_batch_size = batch_size

-    def loglikelihood_rolling(self, requests: List[Tuple[str, str]]) -> List[float]:
-        loglikelihoods = []
-        for (string,) in tqdm(requests):
+        for (string,) in tqdm(
+            [req.args for req in requests], disable=(disable_tqdm or (self.rank != 0))
+        ):
            rolling_token_windows = list(
                map(
                    utils.make_disjoint_window,
                    utils.get_rolling_token_windows(
                        token_list=self.tok_encode(string),
-                        prefix_token=self.eot_token_id,
+                        prefix_token=self.prefix_token_id,
                        max_seq_len=self.max_length,
                        context_len=1,
                    ),
                )
            )
-            contexts, conts = utils.split_and_pad_windows(
-                rolling_token_windows,
-                pad_token_id=self.eot_token_id,
-                max_seq_len=self.max_length,
-            )
-            # Manually create BatchEncoding tensors with attention masks as
-            # expected by `self._model_call` in `self._loglikelihood_tokens`.
-            contexts_enc = torch.Tensor(contexts).long()
-            contexts_enc = transformers.tokenization_utils_base.BatchEncoding(
-                {
-                    "input_ids": contexts_enc,
-                    "attention_mask": (contexts_enc != self.eot_token_id).long(),
-                }
-            )
-            conts_enc = torch.Tensor(conts).long()
-            conts_enc = transformers.tokenization_utils_base.BatchEncoding(
-                {
-                    "input_ids": conts_enc,
-                    "attention_mask": (conts_enc != self.eot_token_id).long(),
-                }
-            )
-            # TODO: Extract out this call so it only gets called once and also
-            # somehow figure out partial caching for.
-            rolling_token_windows_request = [
-                ((contexts, conts), contexts_enc, conts_enc)
-            ]
+
+            # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
+            # prepend a `None` cache key so each window matches the
+            # (key, context_enc, continuation_enc) shape `_loglikelihood_tokens` unpacks
+            rolling_token_windows = [(None,) + x for x in rolling_token_windows]
+
+            pad_amnt = 0
+            if self.world_size > 1:
+                # We pad out the external document-level iterator so the inner iterator doesn't hang
+                mytensor = torch.tensor(len(rolling_token_windows), device=self.device)
+                gathered = (
+                    self.accelerator.gather(mytensor).cpu().detach().numpy().tolist()
+                )
+
+                # number of filler windows this rank needs to match the busiest rank
+                # NOTE(review): indexing [0] below assumes this rank has at least one window
+                pad_amnt = max(gathered) - gathered[self.rank]
+                if pad_amnt > 0:
+                    rolling_token_windows += pad_amnt * [rolling_token_windows[0]]
+
            string_nll = self._loglikelihood_tokens(
-                rolling_token_windows_request, disable_tqdm=True
+                requests=rolling_token_windows,
+                disable_tqdm=True,
+                override_bs=adaptive_batch_size,
            )
-            string_nll = [x[0] for x in string_nll]  # discard is_greedy
+
+            if (self.world_size > 1) and (pad_amnt > 0):
+                # drop the results of the filler windows appended above
+                string_nll = [x[0] for x in string_nll[:-pad_amnt]]
+            else:
+                # discard is_greedy
+                string_nll = [x[0] for x in string_nll]
+
            string_nll = sum(string_nll)
            loglikelihoods.append(string_nll)
+
        return loglikelihoods

+    def _batch_scheduler(self, pos, n_reordered_requests):
+        """Return the batch size to use for the request at index ``pos``.
+
+        The requests are divided into schedule slots; a batch size is detected
+        once per slot and memoised in ``self.batch_sizes``.
+
+        :param pos: index of the current request within the reordered requests
+        :param n_reordered_requests: the reordered requests themselves (must
+            support ``len()`` and is handed to ``_detect_batch_size``)
+        :return: the batch size recorded for the slot that ``pos`` falls into
+        """
+        # Requests per slot, clamped to at least 1: with fewer requests than
+        # `self.batch_schedule` the truncated quotient is 0 and `pos // 0`
+        # would raise ZeroDivisionError.
+        slot_size = max(1, int(len(n_reordered_requests) / self.batch_schedule))
+        sched = pos // slot_size
+        if sched in self.batch_sizes:
+            return self.batch_sizes[sched]
+        if (len(self.batch_sizes) > 1) and (
+            self.batch_sizes[sched - 1] == self.max_batch_size
+        ):
+            # if previous batch size is already maximal, skip recomputation
+            self.batch_sizes[sched] = self.max_batch_size
+            return self.batch_sizes[sched]
+        print(
+            f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size"
+        )
+        self.batch_sizes[sched] = self._detect_batch_size(n_reordered_requests, pos)
+        print(f"Determined largest batch size: {self.batch_sizes[sched]}")
+        return self.batch_sizes[sched]
+
    def _loglikelihood_tokens(
        self,
-        requests: List[Tuple[Tuple[str, str], TokenSequence, TokenSequence]],
-        disable_tqdm: Optional[bool] = False,
+        requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
+        disable_tqdm: bool = False,
+        override_bs: int = None,
    ) -> List[Tuple[float, bool]]:
+        """Score pre-tokenized (context, continuation) pairs in batches.
+
+        :param requests: tuples of ``(cache_key, context_enc, continuation_enc)``;
+            the two encodings are lists of token ids and must be non-empty
+        :param disable_tqdm: suppress the progress bar (it is also suppressed
+            on every rank other than 0)
+        :param override_bs: batch size to use when ``self.batch_size == "auto"``;
+            a truthy value also turns off the ``_batch_scheduler`` callback
+        :return: one ``(summed continuation log-prob, greedy-match flag)`` pair
+            per request, restored to the original order by ``re_ord.get_original``
+        """
-        results = []
-        for chunk in tqdm(
-            requests, total=math.ceil(len(requests)), disable=disable_tqdm
-        ):
-            cache_keys, inputs_tokens, targets_tokens = chunk
-            inputs_tokens = inputs_tokens.to(self.device)
-            targets_tokens = targets_tokens.to(self.device)
-            outputs = self._model_call(inputs=inputs_tokens, labels=targets_tokens)
-            log_softmaxes = F.log_softmax(outputs.logits, dim=-1)
-
-            output_iterator = zip(
-                zip(cache_keys[0], cache_keys[1]),
-                log_softmaxes,
-                targets_tokens["input_ids"],
-                targets_tokens["attention_mask"],
-            )
-            for cache_key, log_softmax, target_tokens, target_mask in output_iterator:
-                length = target_mask.sum()
-                log_softmax = log_softmax[:length]
-                target_tokens = target_tokens[:length]
-                greedy_tokens = log_softmax.argmax(dim=-1)
-                max_equal = (greedy_tokens == target_tokens).all()
-                target_logits = torch.gather(
-                    log_softmax, 1, target_tokens.unsqueeze(-1)
-                ).squeeze(-1)
-                answer = (float(target_logits.sum()), bool(max_equal))
-                results.append(answer)
-                if cache_key is not None:
-                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)
-        return results
-
-    def _model_call(
-        self, inputs: TokenSequence, labels: Optional[TokenSequence] = None
-    ) -> TokenSequence:
-        return self.model(**inputs, labels=labels["input_ids"])
-
-    def _model_generate(
-        self,
-        inputs: transformers.BatchEncoding,
-        max_tokens: int,
-        stop: Optional[List[str]] = None,
-    ) -> TokenSequence:
-        input_ids = inputs["input_ids"][:, -self.max_length :].to(self.device)
-        attention_mask = inputs["attention_mask"][:, -self.max_length :].to(self.device)
-
-        # Generate one token to calculate the number of start tokens prepended to decoder_input_ids
-        # (leaving this here in case the below assumption is violated in the future)
-        # one_tok_gen = self.model.generate(
-        #    input_ids=torch.zeros((1, 1), dtype=torch.int),
-        #    min_length=2,
-        #    max_new_tokens=1,
-        # ).squeeze()
-        # initial_decoder_input_length = len(one_tok_gen) - 1
-
-        # Assume that there will always only be one token in the decoder inputs, assumption holds for existing HF models
-        stopping_criteria = stop_sequences_criteria(
-            self.tokenizer, stop, 1, input_ids.shape[0]
+        # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
+        res = []
+
+        def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]):
+            """Defines the key for the sorted method"""
+            # the negative sign on len(toks) sorts descending - this has a few advantages:
+            # - time estimates will always be over not underestimates, which is more useful for planning
+            # - to know the size of a batch when going through the list, you know the first one is always the batch
+            #   padded context length. this is useful to simplify the batching logic and more importantly to make
+            #   automatic adaptive batches much much easier to implement
+            # - any OOMs will happen right away rather than near the end
+
+            toks = req[1] + req[2]
+            return -len(toks), tuple(toks)
+
+        def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]):
+            """Defines the key to group and lookup one-token continuations"""
+            # Use with group_by="contexts" (optional)"
+            # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
+            # speeds up some multiple-choice tasks proportionally to the number of choices.
+            # groups requests by context+continuation[:-1] and infer on one request/group.
+            return req[-2] + req[-1][:-1]
+
+        # grouping by context is only requested for causal LMs with `self.logits_cache` set
+        re_ord = Collator(
+            requests,
+            sort_fn=_collate,
+            group_by="contexts"
+            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
+            and self.logits_cache
+            else None,
+            group_fn=_lookup_one_token_cont,
        )

-        generations = self.model.generate(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            max_new_tokens=max_tokens,
-            stopping_criteria=stopping_criteria,
-            do_sample=False,
+        # automatic (variable) batch size detection for vectorization
+        # pull longest context sample from request
+        n_reordered_requests = len(re_ord)
+        # fixed batch size if configured; under "auto" use `override_bs`, else 0
+        # (presumably `batch_fn` then picks the size -- confirm in Collator.get_batched)
+        batch_size = (
+            self.batch_size
+            if self.batch_size != "auto"
+            else override_bs
+            if override_bs is not None
+            else 0
+        )
+        batch_fn = (
+            self._batch_scheduler
+            if self.batch_size == "auto"
+            and n_reordered_requests > 0
+            and not override_bs
+            else None
        )
-        return generations

+        chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn)
+        pbar = tqdm(
+            total=len(requests),
+            disable=(disable_tqdm or (self.rank != 0)),
+            desc="Running loglikelihood requests",
+        )
+        for chunk in chunks:
+            inps = []
+            cont_toks_list = []
+            inplens = []
+
+            conts = []
+            encoder_attns = []
+
+            padding_len_inp = None
+            padding_len_cont = None
+            # because vectorizing is annoying, we first convert each (context, continuation) pair to padded
+            # tensors, then we pack them together into a batch, call the model, and then pick it all apart
+            # again because vectorizing is annoying
+
+            for _, context_enc, continuation_enc in chunk:
+                # sanity check
+                assert len(context_enc) > 0
+                assert len(continuation_enc) > 0
+                assert len(continuation_enc) <= self.max_length
+
+                # how this all works (illustrated on a causal decoder-only setup):
+                #          CTX      CONT
+                # inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by inp[:, :-1]
+                # model  \               \
+                # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
+                # cont_toks      4 5 6 7 8 9      [:, -len(continuation_enc):, :self.vocab_size] slice
+
+                # when too long to fit in context, truncate from the left
+                if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                    inp = torch.tensor(
+                        (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
+                        dtype=torch.long,
+                        device=self.device,
+                    )
+                    (inplen,) = inp.shape
+                elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+                    inp = torch.tensor(
+                        (context_enc)[-self.max_length :],
+                        dtype=torch.long,
+                        device=self.device,
+                    )
+                    (inplen,) = inp.shape
+
+                    # build encoder attn masks
+                    encoder_attns.append(torch.ones_like(inp))
+
+                    cont = torch.tensor(
+                        (continuation_enc)[-self.max_length :],
+                        # TODO: left-shift these?
+                        # TODO: our code assumes we never end up truncating conts for either model type
+                        dtype=torch.long,
+                        device=self.device,
+                    )
+                    (contlen,) = cont.shape
+
+                    conts.append(cont)
+
+                    # running maximum of continuation lengths in this chunk
+                    padding_len_cont = (
+                        max(padding_len_cont, contlen)
+                        if padding_len_cont is not None
+                        else contlen
+                    )
+
+                # running maximum of input lengths in this chunk
+                padding_len_inp = (
+                    max(padding_len_inp, inplen)
+                    if padding_len_inp is not None
+                    else inplen
+                )

-class MultiTokenEOSCriteria(transformers.StoppingCriteria):
-    """Criteria to stop on the specified multi-token sequence."""
+                inps.append(inp)  # [1, inp_length]
+                cont_toks_list.append(continuation_enc)
+                inplens.append(inplen)
+
+            # create encoder attn mask and batched conts, if seq2seq
+            call_kwargs = {}
+            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                batched_inps = pad_and_concat(
+                    padding_len_inp, inps, padding_side="right"
+                )  # [batch, padding_len_inp]
+            elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+                # TODO: left-pad encoder inps and mask?
+                batched_inps = pad_and_concat(
+                    padding_len_inp, inps
+                )  # [batch, padding_len_inp]
+                batched_conts = pad_and_concat(
+                    padding_len_cont, conts
+                )  # [batch, padding_len_cont]
+                batched_encoder_mask = pad_and_concat(
+                    padding_len_inp, encoder_attns
+                )  # [batch, padding_len_inp]
+                call_kwargs = {
+                    "attn_mask": batched_encoder_mask,
+                    "labels": batched_conts,
+                }

-    def __init__(
-        self,
-        sequence: str,
-        tokenizer: transformers.PreTrainedTokenizer,
-        initial_decoder_input_length: int,
-        batch_size: int,
-    ):
-        self.initial_decoder_input_length = initial_decoder_input_length
-        self.done_tracker = [False] * batch_size
-        self.sequence = sequence
-        self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
-        self.sequence_id_len = len(self.sequence_ids)
-        self.tokenizer = tokenizer
-
-    def __call__(self, input_ids, scores, **kwargs) -> bool:
-        # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
-        lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :][
-            :, -self.sequence_id_len :
-        ]
+            # log-probabilities over the vocabulary for every position in the batch
+            multi_logits = F.log_softmax(
+                self._model_call(batched_inps, **call_kwargs), dim=-1
+            )  # [batch, padding_length (inp or cont), vocab]
+
+            for (request_str, ctx_tokens, _), logits, inplen, cont_toks in zip(
+                chunk, multi_logits, inplens, cont_toks_list
+            ):
+                # Slice to original seq length
+                contlen = len(cont_toks)
+                # take only logits in the continuation
+                # (discard context toks if decoder-only ; discard right-padding)
+                # also discards + checks for "virtual tokens" in the causal LM's input window
+                # from prompt/prefix tuning tokens, if applicable
+                ctx_len = (
+                    inplen + (logits.shape[0] - padding_len_inp)
+                    if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
+                    else None
+                )
+                logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
+                logits = logits.unsqueeze(0)  # [1, seq, vocab]
+
+                # Check if per-token argmax is exactly equal to continuation
+                # (computed once here, before the cache loop below rebinds `logits`)
+                greedy_tokens = logits.argmax(dim=-1)
+
+                # check for one-token continuation cache hits.
+                # noop in case group_by != "contexts" or no cache hit and returns the
+                # original args. Otherwise, expands the logits batch dimension and yields each
+                # batch along with matching continuation tokens and prompt strings.
+                # logits -> [1, seq, vocab]
+                # NOTE: the loop variables below shadow the outer `request_str`, `cont_toks` and `logits`
+                for request_str, cont_toks, logits in re_ord.get_cache(
+                    req_str=request_str,
+                    cxt_toks=ctx_tokens,
+                    cont_toks=cont_toks,
+                    logits=logits,
+                ):
+                    cont_toks = torch.tensor(
+                        cont_toks, dtype=torch.long, device=self.device
+                    ).unsqueeze(0)  # [1, seq]
+                    max_equal = (greedy_tokens == cont_toks).all()
+
+                    # Obtain log-probs at the corresponding continuation token indices
+                    # last_token_slice = logits[:, -1, :].squeeze(0).tolist()
+                    logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
+                        -1
+                    )  # [1, seq]
+
+                    # Answer: (log prob, is-exact-match)
+                    answer = (float(logits.sum()), bool(max_equal))
+
+                    res.append(answer)
+
+                    self.cache_hook.add_partial("loglikelihood", request_str, answer)
+                    pbar.update(1)
+
+        pbar.close()
+
+        # undo the sort applied by the collator so results line up with `requests`
+        return re_ord.get_original(res)
+
+    def generate_until(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[str]:
+        res = []
+
+        def _collate(req: Tuple[str, dict]):
+            """Defines the key for the sorted method"""
+            # the negative sign on len(toks) sorts descending - this has a few advantages:
+            # - time estimates will always be over not underestimates, which is more useful for planning
+            # - to know the size of a batch when going through the list, you know the first one is always the batch
+            #   padded context length. this is useful to simplify the batching logic and more importantly to make
+            #   automatic adaptive batches much much easier to implement
+            # - any OOMs will happen right away rather than near the end
+            toks = self.tok_encode(req[0])
+            return -len(toks), req[0]
+
+        pbar = tqdm(
+            total=len(requests),
+            disable=(disable_tqdm or (self.rank != 0)),
+            desc="Running generate_until requests",
+        )
+        adaptive_batch_size = None
+        if self.batch_size == "auto":
+            # using rolling window with maximum context
+            print("Passed argument batch_size = auto. Detecting largest batch size")
+            batch_size = self._detect_batch_size()
+            print(f"Determined Largest batch size: {batch_size}")
+            adaptive_batch_size = batch_size
+        # for each different set of kwargs, we execute all requests, by batch.
+        batch_size = (
+            self.batch_size
+            if self.batch_size != "auto"
+            else adaptive_batch_size
+            if adaptive_batch_size is not None
+            else 0
+        )
+        batch_fn = (
+            self._batch_scheduler
+            if self.batch_size == "auto" and not adaptive_batch_size
+            else None
+        )

-        lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
-
-        for i, done in enumerate(self.done_tracker):
-            if not done:
-                self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
-        return False not in self.done_tracker
-
-
-def stop_sequences_criteria(
-    tokenizer: transformers.PreTrainedTokenizer,
-    stop_sequences: List[str],
-    initial_decoder_input_length: int,
-    batch_size: int,
-) -> transformers.StoppingCriteriaList:
-    return transformers.StoppingCriteriaList(
-        [
-            *[
-                MultiTokenEOSCriteria(
-                    sequence, tokenizer, initial_decoder_input_length, batch_size
+        # we group requests by their generation_kwargs,
+        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
+        # in the same batch.
+        # group_fn=lambda x: x[1] -> x=(context, gen_kwargs)
+        re_ords = Collator(
+            [reg.args for reg in requests],
+            sort_fn=_collate,
+            group_by="gen_kwargs",
+            group_fn=lambda x: x[1],
+        )
+        chunks = re_ords.get_batched(n=batch_size, batch_fn=batch_fn)
+        for chunk in chunks:
+            contexts, all_gen_kwargs = zip(*chunk)
+            # we assume all gen kwargs in the batch are the same
+            # this is safe to assume because the `grouper` object ensures it.
+            gen_kwargs = all_gen_kwargs[0]
+            # unpack our keyword arguments.
+            until = None
+            if isinstance(gen_kwargs, dict):
+                kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
+                if "until" in kwargs.keys():
+                    until = kwargs.pop("until")
+                    if isinstance(until, str):
+                        until = [until]
+                    elif not isinstance(until, list):
+                        raise ValueError(
+                            f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
+                        )
+            else:
+                raise ValueError(
+                    f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                )
-                for sequence in stop_sequences
-            ],
-        ]
-    )
+            # add EOS token to stop sequences
+            eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
+            if not until:
+                until = [eos]
+            else:
+                until.append(eos)
+            if "max_gen_toks" in kwargs.keys():
+                max_gen_toks = kwargs.pop("max_gen_toks")
+            else:
+                max_gen_toks = self.max_gen_toks
+
+            # set the max length in tokens of inputs ("context_enc")
+            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                # max len for inputs = max length, minus room to generate the max new tokens
+                max_ctx_len = self.max_length - max_gen_toks
+            elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
+                # max len for inputs = encoder's whole max_length
+                max_ctx_len = self.max_length
+
+            # encode, pad, and truncate contexts for this batch
+            context_enc, attn_masks = self.tok_batch_encode(
+                contexts,
+                left_truncate_len=max_ctx_len,
+                truncation=self.truncation,
+            )
+            context_enc = context_enc.to(self.device)
+            attn_masks = attn_masks.to(self.device)
+
+            if "max_length" not in kwargs:
+                kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
+
+            # perform batched generation
+            cont = self._model_generate(
+                context=context_enc,
+                attention_mask=attn_masks,
+                stop=until,
+                **kwargs,
+            )
+
+            cont_toks_list = cont.tolist()
+            for cont_toks, context in zip(cont_toks_list, contexts):
+                # discard context + left-padding toks if using causal decoder-only LM
+                if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
+                    cont_toks = cont_toks[context_enc.shape[1] :]
+
+                s = self.tok_decode(cont_toks)
+
+                # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
+                for term in until:
+                    if len(term) > 0:
+                        # ignore '' separator,
+                        # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
+                        s = s.split(term)[0]
+
+                res.append(s)
+
+                self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
+                pbar.update(1)
+        # reorder this group of results back to original unsorted form
+        res = re_ords.get_original(res)
+
+        pbar.close()
+
+        return res
+
+    def get_model_info(self) -> dict:
+        """
+        Method to get Hugging Face model information for experiment reproducibility.
+
+        Returns a dict holding the parameter count and dtype of `self._model`,
+        `self.revision`, and the Hub commit SHA looked up for `self.pretrained`.
+        When `self.peft` or `self.delta` is set, the SHA looked up for that
+        repository id is added under `peft_sha` / `delta_sha`.
+        """
+
+        def get_model_num_params(model) -> int:
+            # Prefer the model's own counter, otherwise sum the parameter sizes.
+            if hasattr(model, "num_parameters"):
+                return model.num_parameters()
+            if hasattr(model, "parameters"):
+                return sum(p.numel() for p in model.parameters())
+            # -1 marks "could not be determined".
+            return -1
+
+        def get_model_dtype(model) -> str:
+            # `model.dtype` is returned unchanged when present, "" otherwise.
+            if hasattr(model, "dtype"):
+                return model.dtype
+            return ""
+
+        def get_model_sha(pretrained: str, revision: str) -> str:
+            # Best effort: any failure is logged and reported as an empty SHA.
+            try:
+                model_info = HfApi().model_info(repo_id=pretrained, revision=revision)
+                return model_info.sha
+            except Exception as e:
+                # `Logger.warn` is deprecated; `warning` is the supported spelling.
+                eval_logger.warning(
+                    f"Failed to get model SHA for {pretrained} at revision {revision}. Error: {e}"
+                )
+                return ""
+
+        model_info = {
+            "model_num_parameters": get_model_num_params(self._model),
+            "model_dtype": get_model_dtype(self._model),
+            "model_revision": self.revision,
+            "model_sha": get_model_sha(self.pretrained, self.revision),
+        }
+        if self.peft:
+            model_info["peft_sha"] = get_model_sha(self.peft, self.revision)
+        if self.delta:
+            model_info["delta_sha"] = get_model_sha(self.delta, self.revision)
+        return model_info
--- a/lm_eval/models/mamba_lm.py
+++ b/lm_eval/models/mamba_lm.py
+from typing import Optional, Union
+
+import torch
+
+import lm_eval.models.utils
+from lm_eval.api.registry import register_model
+from lm_eval.models.huggingface import HFLM
+
+
+@register_model("mamba_ssm")
+class MambaLMWrapper(HFLM):
+    """HFLM subclass that loads its config and model through the `mamba_ssm` package."""
+
+    def __init__(
+        self,
+        pretrained="state-spaces/mamba-130m",
+        **kwargs,
+    ) -> None:
+        """
+        Mamba (via the `mamba_ssm` package) supports the following args:
+        ```
+        d_model: int,
+        n_layer: int,
+        vocab_size: int,
+        initializer_cfg=None,
+        pad_vocab_size_multiple: int = 1,
+        ssm_cfg=None,
+        norm_epsilon: float = 1e-5,
+        rms_norm: bool = False,
+        fused_add_norm=False,
+        residual_in_fp32=False,
+        ```
+
+        See https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py#L175 for more info.
+        The above can all be passed via `--model_args` or to this __init__() directly
+        but we recommend placing many of these within the config.json file uploaded alongside your
+        Mamba model to the HF Hub instead.
+        All other HuggingFace from_pretrained() kwargs
+        such as those related to
+        `parallelize=True`, PEFT, autoGPTQ,
+        or any sub-configurations of these advanced args,
+        are unsupported by the `mamba_ssm` package.
+
+        The HFLM arguments
+
+        `backend`, `tokenizer`, `truncation`, `max_length`,
+        `device`, `dtype`, `batch_size`, `max_batch_size`, `trust_remote_code`, `use_fast_tokenizer`
+
+        Are all supported by Mamba where they do not conflict
+        with Mamba-specific restrictions such as causal LMs only.
+        """
+
+        if "backend" in kwargs:
+            # mamba currently only supports causal models
+            assert (
+                kwargs["backend"] == "causal"
+            ), f"mamba_ssm only supports backend='causal', got {kwargs['backend']!r}"
+
+        super().__init__(
+            pretrained=pretrained,
+            # set appropriate defaults for tokenizer, max length, etc
+            backend=kwargs.pop("backend", "causal"),
+            tokenizer=kwargs.pop("tokenizer", "EleutherAI/gpt-neox-20b"),
+            max_length=kwargs.pop("max_length", 2048),
+            **kwargs,
+        )
+
+    def _get_config(
+        self,
+        pretrained: str,
+        **kwargs,
+    ) -> None:
+        """Load the config for `pretrained` with `mamba_ssm` and store it on `self._config`.
+
+        Extra keyword arguments are accepted but not used here.
+        Raises `Exception` when the `mamba_ssm` package cannot be imported.
+        """
+        try:
+            from mamba_ssm.utils.hf import load_config_hf  # noqa: F811
+        except ModuleNotFoundError:
+            raise Exception(
+                "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
+please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
+            )
+
+        self._config = load_config_hf(pretrained)
+
+    def _create_model(
+        self,
+        pretrained: str,
+        dtype: Optional[Union[str, torch.dtype]] = "float16",
+        # no `parallelize=True` options
+        # no PEFT and quantization options
+        # Mamba does not support arbitrary HF from_pretrained() args
+        **kwargs,
+    ) -> None:
+        """Instantiate `MambaLMHeadModel` from `pretrained` and store it on `self._model`.
+
+        `dtype="auto"` is mapped to `torch.float16`; any other value is resolved
+        through `lm_eval.models.utils.get_dtype`. Extra keyword arguments are
+        accepted but not forwarded.
+        Raises `Exception` when the `mamba_ssm` package cannot be imported.
+        """
+        try:
+            from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel  # noqa: F811
+        except ModuleNotFoundError:
+            raise Exception(
+                "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
+please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
+            )
+
+        self._model = MambaLMHeadModel.from_pretrained(
+            pretrained,
+            device=self._device,
+            dtype=torch.float16
+            if dtype == "auto"
+            else lm_eval.models.utils.get_dtype(dtype),
+        )
+
+    def _model_generate(self, context, max_length, stop, **generation_kwargs):
+        """Generate from `context` up to `max_length` with the wrapped model.
+
+        `stop` is accepted for interface compatibility but is not passed to the
+        model; `do_sample` and `attention_mask` are dropped from
+        `generation_kwargs` before the call.
+        """
+        for key in ("do_sample", "attention_mask"):
+            generation_kwargs.pop(key, None)
+
+        # mamba's custom GenerationMixin currently does not support
+        # passing stopping criteria.
+        # for the time being, we simply generate to max length,
+        # then truncate (equivalent result)
+        # -- this should be revisited to speed up generation
+        # stopping_criteria = stop_sequences_criteria(
+        #     self.tokenizer, stop, 1, context.shape[0]
+        # )
+
+        return self.model.generate(
+            input_ids=context,
+            max_length=max_length,
+            # stopping_criteria=stopping_criteria,
+            # pad_token_id=self.tokenizer.pad_token_id,
+            # use_cache=True,
+            **generation_kwargs,
+        )
--- a/lm_eval/models/nemo_lm.py
+++ b/lm_eval/models/nemo_lm.py
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import pathlib
+from copy import deepcopy
+from typing import List, Literal
+
+import filelock
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from lm_eval.api.instance import Instance
+from lm_eval.api.model import LM
+from lm_eval.api.registry import register_model
+from lm_eval.models.utils import Collator
+from lm_eval.utils import (
+    eval_logger,
+    get_rolling_token_windows,
+    make_disjoint_window,
+    simple_parse_args_string,
+)
+
+
+def _patch_pretrained_cfg(
+    pretrained_cfg, trainer, tensor_model_parallel_size, pipeline_model_parallel_size
+):
+    """Override inference-related entries of a restored model config in place.
+
+    Only keys that already exist on `pretrained_cfg` are overwritten. The
+    precision is taken from `trainer.precision` and the two parallel sizes
+    from the arguments; the remaining overrides are fixed values.
+
+    Returns the same `pretrained_cfg` object.
+    Raises `Exception` when `omegaconf` cannot be imported.
+    """
+    try:
+        import omegaconf
+    except ModuleNotFoundError:
+        # The first fragment ends with ". " so the concatenated message reads
+        # as two sentences.
+        raise Exception(
+            "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed. "
+            "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
+            "or installing nemo following https://github.com/NVIDIA/NeMo.",
+        )
+
+    omegaconf.OmegaConf.set_struct(pretrained_cfg, True)
+    with omegaconf.open_dict(pretrained_cfg):
+        attributes_to_update = {
+            "sequence_parallel": False,
+            "activations_checkpoint_granularity": None,
+            "activations_checkpoint_method": None,
+            "precision": trainer.precision,
+            "global_batch_size": None,
+            "tensor_model_parallel_size": tensor_model_parallel_size,
+            "pipeline_model_parallel_size": pipeline_model_parallel_size,
+            "apply_rope_fusion": False,
+        }
+        for name, value in attributes_to_update.items():
+            # never introduce keys the restored config does not already have
+            if hasattr(pretrained_cfg, name):
+                pretrained_cfg[name] = value
+    return pretrained_cfg
+
+
+def _get_target_from_class(target_class) -> str:
+    """Return the dotted `module.ClassName` path of `target_class`."""
+    module_name = target_class.__module__
+    class_name = target_class.__name__
+    return "{}.{}".format(module_name, class_name)
+
+
+def load_model(
+    model_path: str,
+    trainer,
+    tensor_model_parallel_size: int,
+    pipeline_model_parallel_size: int,
+) -> torch.nn.Module:
+    """Restore a NeMo model from `model_path` and prepare it for inference.
+
+    The stored config is read first, patched with `_patch_pretrained_cfg`, and
+    then used as the override config when the model class named by the
+    config's `target` entry is restored. The class's `_build_tokenizer` is
+    wrapped in a file lock before the restore.
+
+    Args:
+        model_path: checkpoint location; when it is a directory it is used as
+            the connector's already-extracted model directory.
+        trainer: trainer handed to the restore calls; its `local_rank` selects
+            the `map_location` device.
+        tensor_model_parallel_size: value written into the config.
+        pipeline_model_parallel_size: value written into the config.
+
+    Returns:
+        The restored model after `freeze()` and with `training` set to False.
+
+    Raises:
+        Exception: when the `nemo` package cannot be imported.
+    """
+    try:
+        from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import (
+            MegatronGPTModel,
+        )
+        from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector
+    except ModuleNotFoundError:
+        raise Exception(
+            "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed. "
+            "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
+            "or installing nemo following https://github.com/NVIDIA/NeMo.",
+        )
+    model_path = pathlib.Path(model_path)
+
+    save_restore_connector = NLPSaveRestoreConnector()
+    if model_path.is_dir():
+        save_restore_connector.model_extracted_dir = model_path.as_posix()
+    pretrained_cfg = save_restore_connector.restore_from(
+        None, model_path.as_posix(), return_config=True, trainer=trainer
+    )
+    if not hasattr(pretrained_cfg, "target"):
+        # configs without an explicit target fall back to MegatronGPTModel
+        pretrained_cfg["target"] = _get_target_from_class(MegatronGPTModel)
+
+    pretrained_cfg = _patch_pretrained_cfg(
+        pretrained_cfg,
+        trainer,
+        tensor_model_parallel_size=tensor_model_parallel_size,
+        pipeline_model_parallel_size=pipeline_model_parallel_size,
+    )
+
+    module_name, class_name = pretrained_cfg.target.rsplit(".", 1)
+    model_class = getattr(importlib.import_module(module_name), class_name)
+
+    # monkeypatch _build_tokenizer method to be process-safe
+    tokenizer_lock = filelock.FileLock(f"/tmp/{model_path.name}.tokenizer.lock")
+
+    def _synced_build_tokenizer(self):
+        with tokenizer_lock:
+            return self._original_build_tokenizer()
+
+    # Patch only once per class: patching again would store the wrapper as
+    # `_original_build_tokenizer`, making the wrapper call itself.
+    if not hasattr(model_class, "_original_build_tokenizer"):
+        model_class._original_build_tokenizer = model_class._build_tokenizer
+        model_class._build_tokenizer = _synced_build_tokenizer
+
+    model = model_class.restore_from(
+        restore_path=model_path.as_posix(),
+        trainer=trainer,
+        override_config_path=pretrained_cfg,
+        save_restore_connector=save_restore_connector,
+        map_location=f"cuda:{trainer.local_rank}",
+    )
+
+    model.freeze()
+    model.training = False
+    try:
+        # Have to turn off activations_checkpoint_method for inference
+        model.model.language_model.encoder.activations_checkpoint_method = None
+    except AttributeError:
+        # models without this attribute chain need no change
+        pass
+    return model
+
+
+def setup_distributed_environment(trainer):
+    """Set up the trainer strategy's environment and return a NeMo `AppState`.
+
+    When the strategy has a launcher, a no-op function is launched through it
+    first; `trainer.strategy.setup_environment()` is then called.
+
+    Raises `Exception` when the `nemo` package cannot be imported.
+    """
+    try:
+        from nemo.utils.app_state import AppState
+    except ModuleNotFoundError:
+        raise Exception(
+            "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed. "
+            "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
+            "or installing nemo following https://github.com/NVIDIA/NeMo.",
+        )
+
+    def dummy():
+        # placeholder callable; the launcher only needs something to run
+        return
+
+    if trainer.strategy.launcher is not None:
+        trainer.strategy.launcher.launch(dummy, trainer=trainer)
+    trainer.strategy.setup_environment()
+
+    app_state = AppState()
+
+    return app_state
+
+
+@register_model("nemo_lm")
+class NeMoLM(LM):
+    def __init__(
+        self,
+        path: str,
+        max_length: int = 4096,
+        batch_size: int = 1,
+        max_gen_toks: int = 256,
+        devices: int = 1,
+        num_nodes: int = 1,
+        tensor_model_parallel_size: int = 1,
+        pipeline_model_parallel_size: int = 1,
+        precision: Literal[
+            "16-mixed",
+            "bf16-mixed",
+            "32-true",
+            "64-true",
+            64,
+            32,
+            16,
+            "64",
+            "32",
+            "16",
+            "bf16",
+        ] = "bf16",
+        **kwargs,
+    ):
+        """Build a trainer, load the NeMo model at `path` and store generation limits.
+
+        Args:
+            path: checkpoint location passed to `load_model`.
+            max_length: stored and exposed through `max_length`.
+            batch_size: converted with `int()` and exposed through `batch_size`.
+            max_gen_toks: stored and exposed through `max_gen_toks`.
+            devices: device count given to the trainer. It must either equal
+                `tensor_model_parallel_size * pipeline_model_parallel_size`,
+                or both parallel sizes must be 1 (data replication).
+            num_nodes: only 1 is accepted.
+            tensor_model_parallel_size: forwarded to `load_model`.
+            pipeline_model_parallel_size: forwarded to `load_model`.
+            precision: forwarded to the trainer.
+            **kwargs: accepted but not used by this constructor.
+
+        Raises:
+            Exception: when the `nemo` package cannot be imported.
+            ValueError: for an unsupported device/parallelism combination or
+                `num_nodes > 1`.
+        """
+        try:
+            from nemo.collections.nlp.modules.common.text_generation_utils import (
+                generate,
+            )
+            from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
+            from pytorch_lightning.trainer.trainer import Trainer
+
+            self.generate = generate
+        except ModuleNotFoundError:
+            raise Exception(
+                "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed. "
+                "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
+                "or installing nemo following https://github.com/NVIDIA/NeMo.",
+            )
+
+        super().__init__()
+
+        # Pure data replication: several devices, no model parallelism.
+        data_replication = (
+            tensor_model_parallel_size == 1
+            and pipeline_model_parallel_size == 1
+            and devices > 1
+        )
+
+        if data_replication:
+            eval_logger.info(
+                f"The number of data replicas for evaluation is {devices}."
+            )
+            eval_logger.info(f"The total number of devices is {devices}.")
+            eval_logger.info(
+                "No tensor parallelism or pipeline parallelism is applied."
+            )
+
+        elif tensor_model_parallel_size * pipeline_model_parallel_size == devices:
+            eval_logger.info(
+                f"Setting tensor parallelism to {tensor_model_parallel_size} and pipeline parallelism to {pipeline_model_parallel_size}."
+            )
+            eval_logger.info(f"The total number of devices is {devices}.")
+            eval_logger.info("No data parallelism is applied.")
+
+        else:
+            raise ValueError(
+                "Please set the product of tensor_model_parallel_size and pipeline_model_parallel_size "
+                "equal to the specified number of devices."
+            )
+
+        if num_nodes > 1:
+            raise ValueError(
+                "A number of nodes greater than 1 is not supported yet. Please set num_nodes as 1."
+            )
+
+        trainer = Trainer(
+            strategy=NLPDDPStrategy(),
+            devices=devices,
+            accelerator="gpu",
+            num_nodes=num_nodes,
+            precision=precision,
+            logger=False,
+            enable_checkpointing=False,
+            use_distributed_sampler=False,
+        )
+        # Modify the following flags only for data replication
+        # NOTE(review): `_device` is assigned only on this path; confirm the
+        # base class provides it (and `_rank` / `_world_size`) otherwise.
+        if data_replication:
+            self._device = torch.device(f"cuda:{trainer.global_rank}")
+            self._rank = trainer.global_rank
+            self._world_size = trainer.world_size
+        self.model = load_model(
+            path,
+            trainer,
+            tensor_model_parallel_size=tensor_model_parallel_size,
+            pipeline_model_parallel_size=pipeline_model_parallel_size,
+        ).cuda()
+        self.tokenizer = self.model.tokenizer
+        self.app_state = setup_distributed_environment(trainer)
+
+        self._max_length = max_length
+        self._batch_size = int(batch_size)
+        self._max_gen_toks = max_gen_toks
+
+    @classmethod
+    def create_from_arg_string(cls, arg_string, additional_config=None):
+        """Construct the model from a parsed argument string.
+
+        `arg_string` is parsed with `simple_parse_args_string`. When
+        `additional_config` is non-empty, its `batch_size` entry overrides the
+        parsed one; a missing or `None` entry yields a batch size of 1, because
+        the constructor applies `int()` to the value and `int(None)` raises.
+        """
+        args = simple_parse_args_string(arg_string)
+        if additional_config:
+            batch_size = additional_config.get("batch_size", 1)
+            args["batch_size"] = 1 if batch_size is None else batch_size
+
+        return cls(**args)
+
+    @property
+    def eot_token_id(self):
+        """Return the tokenizer's `eos_id`, or None when that lookup raises AttributeError."""
+        try:
+            eos_id = self.tokenizer.eos_id
+        except AttributeError:
+            eos_id = None
+        return eos_id
+
+    @property
+    def max_length(self):
+        """Return the maximum sequence length stored at construction time."""
+        limit = self._max_length
+        return limit
+
+    @property
+    def max_gen_toks(self):
+        """Return the default number of tokens to generate stored at construction time."""
+        budget = self._max_gen_toks
+        return budget
+
+    @property
+    def batch_size(self):
+        """Return the batch size stored at construction time."""
+        size = self._batch_size
+        return size
+
+    @property
+    def device(self):
+        """Return the value held in `self._device`."""
+        current = self._device
+        return current
+
+    @property
+    def rank(self):
+        """Return the value held in `self._rank`."""
+        current = self._rank
+        return current
+
+    @property
+    def world_size(self):
+        """Return the value held in `self._world_size`."""
+        current = self._world_size
+        return current
+
+    @property
+    def accelerator(self):
+        """Return a new `_Accelerator` built from the current `world_size` on every access."""
+        helper = self._Accelerator(self.world_size)
+        return helper
+
+    class _Accelerator:
+        """Small helper offering `wait_for_everyone` and `gather` via `torch.distributed`."""
+
+        def __init__(self, world_size):
+            # number of receive buffers allocated by `gather`
+            self.world_size = world_size
+
+        def wait_for_everyone(self):
+            """Call `torch.distributed.barrier()`."""
+            torch.distributed.barrier()
+
+        def gather(self, local_tensor):
+            """All-gather `local_tensor` into one-element CUDA buffers and concatenate them."""
+            buffers = []
+            for _ in range(self.world_size):
+                buffers.append(torch.zeros(1, dtype=local_tensor.dtype).cuda())
+            torch.distributed.all_gather(buffers, local_tensor)
+            return torch.cat(buffers)
+
+    def tok_encode(self, string: str):
+        """Convert `string` to token ids with the tokenizer's `text_to_ids`."""
+        token_ids = self.tokenizer.text_to_ids(string)
+        return token_ids
+
+    def tok_decode(self, tokens):
+        """Convert token ids back to text with the tokenizer's `ids_to_text`."""
+        text = self.tokenizer.ids_to_text(tokens)
+        return text
+
+    def _encode_pair(self, context, continuation):
+        """Tokenize a (context, continuation) pair.
+
+        Trailing whitespace of `context` is moved to the front of
+        `continuation` first. The continuation tokens are whatever follows the
+        context tokens in the encoding of the concatenated text.
+
+        Returns `(context_enc, continuation_enc)`.
+        """
+        stripped = context.rstrip()
+        trailing = len(context) - len(stripped)
+        if trailing > 0:
+            continuation = context[-trailing:] + continuation
+            context = stripped
+        joined_enc = self.tok_encode(context + continuation)
+        context_enc = self.tok_encode(context)
+        return context_enc, joined_enc[len(context_enc) :]
+
+    def loglikelihood(self, requests):
+        """Tokenize each request's (context, continuation) and score it.
+
+        Every request is turned into
+        `((context, continuation), context_enc, continuation_enc)` and the
+        resulting list is passed to `_loglikelihood_tokens`, whose result is
+        returned unchanged.
+        """
+        tokenized = []
+        for context, continuation in [req.args for req in requests]:
+            if context != "":
+                context_enc, continuation_enc = self._encode_pair(context, continuation)
+            else:
+                # end of text as context
+                context_enc = [self.eot_token_id]
+                continuation_enc = self.tok_encode(continuation)
+
+            tokenized.append(((context, continuation), context_enc, continuation_enc))
+
+        return self._loglikelihood_tokens(tokenized)
+
+    def loglikelihood_rolling(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[float]:
+        """Return one summed log-likelihood per request string.
+
+        Each string is tokenized, cut into rolling windows (end-of-text token
+        as prefix, `max_length - 1` tokens per window, context length 1),
+        scored with `_loglikelihood_tokens`, and the per-window values are
+        added up.
+        """
+        totals = []
+
+        all_args = [req.args for req in requests]
+        for (string,) in tqdm(all_args, disable=disable_tqdm):
+            windows = get_rolling_token_windows(
+                token_list=self.tok_encode(string),
+                prefix_token=self.eot_token_id,
+                max_seq_len=self.max_length - 1,
+                context_len=1,
+            )
+            # `None` in the cache-key slot: window scores are not cached
+            window_requests = [
+                (None,) + make_disjoint_window(window) for window in windows
+            ]
+
+            scored = self._loglikelihood_tokens(
+                window_requests,
+            )
+
+            # discard is_greedy
+            window_lls = [item[0] for item in scored]
+
+            totals.append(sum(window_lls))
+        return totals
+
+    def _loglikelihood_tokens(self, requests, disable_tqdm=False):
+        """Score already-tokenized requests in batches.
+
+        Each request is a 3-tuple `(cache_key, context_enc, continuation_enc)`.
+        For every request an `(logprob, is_greedy)` pair is produced, where
+        `logprob` is the sum of the continuation's token log-probabilities and
+        `is_greedy` tells whether the continuation tokens equal the argmax
+        tokens. Pairs with a non-None `cache_key` are also reported to
+        `self.cache_hook`. The collected results are passed through
+        `re_ord.get_original` before being returned.
+        """
+        res = []
+
+        def _collate(x):
+            # sort key handed to the Collator: negative total length, then the tokens
+            toks = x[1] + x[2]
+            return -len(toks), tuple(toks)
+
+        re_ord = Collator(requests, sort_fn=_collate)
+        chunks = re_ord.get_batched(n=self.batch_size, batch_fn=None)
+        pbar = tqdm(
+            total=len(requests),
+            disable=(disable_tqdm or (self.rank != 0)),
+            desc="Running loglikelihood requests",
+        )
+        for chunk in chunks:
+            inps = []
+            ctxlens = []
+            contlens = []
+
+            for _, context_enc, continuation_enc in chunk:
+                # Leave one token for generation. Tokens_to_generate = 0 breaks NeMo.
+                inp = (context_enc + continuation_enc)[-(self.max_length - 1) :]
+
+                # context length that remains after the left-truncation above
+                ctxlen = len(context_enc) - max(
+                    0, len(context_enc) + len(continuation_enc) - (self.max_length - 1)
+                )
+                ctxlens.append(ctxlen)
+                contlens.append(len(continuation_enc))
+
+                # the generate call below receives text, so decode the ids again
+                inps.append(self.tok_decode(inp))
+
+            output = self.generate(
+                self.model,
+                inputs=inps,
+                tokens_to_generate=1,
+                min_tokens_to_generate=1,
+                compute_logprob=True,
+                all_probs=True,
+            )
+
+            # drop the last position of every output (one token was generated)
+            batch_token_ids = np.asarray(output["token_ids"])[:, :-1]
+            batch_logprobs = output["logprob"][:, :-1]
+            batch_full_logprob = output["full_logprob"][:, :-1, :]
+
+            # Compute greedy tokens for entire batch rather than calling it with proper ctxlen for each sample.
+            # Additional tokens for each sample will be trimmed later.
+            min_ctxlen = min(ctxlens)
+
+            # Use min_ctxlen-1 instead of min_ctxlen since full_logprobs are not returns for the first token.
+            batch_greedy_tokens = (
+                torch.argmax(batch_full_logprob[:, min_ctxlen - 1 :, :], -1)
+                .cpu()
+                .numpy()
+            )
+
+            for token_ids, greedy_tokens, logprobs, ctxlen, contlen, (
+                cache_key,
+                _,
+                _,
+            ) in zip(
+                batch_token_ids,
+                batch_greedy_tokens,
+                batch_logprobs,
+                ctxlens,
+                contlens,
+                chunk,
+            ):
+                # Trim at contlen since shorter contexts in a batch will have more than one token generated.
+                # Use ctxlen-1 instead of ctxlen same as for full_logprob in batch_greedy_tokens calculation
+                logprobs = (logprobs[ctxlen - 1 :])[:contlen]
+                logprob = sum(logprobs).tolist()
+
+                continuation_tokens = (token_ids[ctxlen:])[:contlen]
+                # greedy tokens start at min_ctxlen, so shift by this sample's extra context
+                len_diff = ctxlen - min_ctxlen
+                is_greedy = continuation_tokens == (greedy_tokens[len_diff:])[:contlen]
+                # an element-wise comparison result is reduced to a single truth value
+                if not isinstance(is_greedy, bool):
+                    is_greedy = is_greedy.all()
+                answer = (logprob, is_greedy)
+
+                if cache_key is not None:
+                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)
+
+                res.append(answer)
+                pbar.update(1)
+
+        pbar.close()
+
+        return re_ord.get_original(res)
+
+    def generate_until(self, requests):
+        """Greedily generate a continuation for every request.
+
+        Each request's args are `(context, gen_kwargs)`. `gen_kwargs["until"]`
+        (a string or a list of strings) lists stop strings and
+        `gen_kwargs["max_gen_toks"]` overrides `self.max_gen_toks`. The text of
+        the end-of-text token is always added as a stop string. Contexts are
+        left-truncated to `max_length - max_gen_toks` tokens, generation is run
+        per batch, the context prefix is removed from each answer and the
+        answer is cut at the first occurrence of any stop string.
+
+        Returns the generated strings after `re_ords.get_original`; an empty
+        `requests` yields `[]`.
+        """
+        if not requests:
+            return []
+        res = []
+
+        # text of the end-of-text token; always used as a stop string
+        eos_token = self.tokenizer.ids_to_tokens([self.eot_token_id])[0]
+
+        def get_until(req_args):
+            until = req_args.get("until", [])
+            if until is None:
+                until = []
+            elif isinstance(until, str):
+                # a single stop string is accepted as shorthand for a list
+                until = [until]
+            else:
+                until = deepcopy(until)  # prevent from modifying req_args for cache_key
+            if eos_token not in until:
+                until.append(eos_token)
+            return until
+
+        def _collate(x):
+            toks = self.tok_encode(x[0])
+            return len(toks), x[0]
+
+        re_ords = Collator(
+            [reg.args for reg in requests], sort_fn=_collate, group_by="gen_kwargs"
+        )
+        chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
+        for chunk in chunks:
+            _, all_gen_kwargs = zip(*chunk)
+            # we assume all gen kwargs in the batch are the same
+            # this is safe to assume because the `grouper` object ensures it.
+            req_args = all_gen_kwargs[0]
+            # unpack our keyword arguments.
+            until = get_until(req_args)
+            max_gen_toks = req_args.get("max_gen_toks", self.max_gen_toks)
+
+            # keep room for the generated tokens by left-truncating the context
+            remaining_length = self.max_length - max_gen_toks
+            contexts = []
+            for context, _ in chunk:
+                encoded_context = self.tok_encode(context)
+                encoded_context = encoded_context[-remaining_length:]
+                contexts.append(self.tok_decode(encoded_context))
+
+            output = self.generate(
+                self.model,
+                inputs=contexts,
+                tokens_to_generate=max_gen_toks,
+                end_strings=until,
+                greedy=True,
+            )
+
+            answers = output["sentences"]
+
+            # answers contain the prompt; strip it by length
+            continuations = [
+                answer[len(context) :] for context, answer in zip(contexts, answers)
+            ]
+
+            for term in until:
+                # an empty separator would make str.split raise ValueError
+                if term:
+                    continuations = [answer.split(term)[0] for answer in continuations]
+
+            for request, answer in zip(chunk, continuations):
+                # the cache key must carry this method's name to be found again
+                self.cache_hook.add_partial("generate_until", request, answer)
+                res.append(answer)
+
+        return re_ords.get_original(res)