Commit 470fb31c authored by lintangsutawika

resolved git conflict

parents dfb41835 4d10ad56
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from typing import Any, List, Tuple
from tqdm import tqdm
import time
from lm_eval import utils
from typing import List, Any, Tuple
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.utils import retry_on_specific_exceptions
eval_logger = utils.eval_logger
......@@ -45,26 +48,30 @@ def anthropic_completion(
please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e .[anthropic]`",
)
backoff_time: float = 3
while True:
try:
response = client.completions.create(
prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}",
model=model,
# NOTE: Claude really likes to do CoT, and overly aggressive stop sequences
# (e.g. gsm8k's ":") may truncate a lot of the input.
stop_sequences=[anthropic.HUMAN_PROMPT] + stop,
max_tokens_to_sample=max_tokens_to_sample,
temperature=temperature,
**kwargs,
)
return response.completion
except anthropic.RateLimitError as e:
eval_logger.warning(
f"RateLimitError occurred: {e.__cause__}\n Retrying in {backoff_time} seconds"
)
time.sleep(backoff_time)
backoff_time *= 1.5
def _exception_callback(e: Exception, sleep_time: float) -> None:
eval_logger.warning(
f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds"
)
@retry_on_specific_exceptions(
on_exceptions=[anthropic.RateLimitError],
max_retries=None, # retry forever, consider changing
on_exception_callback=_exception_callback,
)
def completion():
response = client.completions.create(
prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}",
model=model,
# NOTE: Claude really likes to do CoT, and overly aggressive stop sequences
# (e.g. gsm8k's ":") may truncate a lot of the input.
stop_sequences=[anthropic.HUMAN_PROMPT] + stop,
max_tokens_to_sample=max_tokens_to_sample,
temperature=temperature,
**kwargs,
)
return response.completion
return completion()
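
The hand-rolled while/try back-off loop above is replaced by the `retry_on_specific_exceptions` decorator from `lm_eval.utils`. Below is a minimal sketch of what such a decorator factory could look like, inferred only from the keyword arguments visible in this diff (`on_exceptions`, `max_retries`, `on_exception_callback`); the 3-second start and 1.5x growth mirror the old loop, and any parameter beyond those three is an assumption, not the library's actual implementation.

import functools
import time
from typing import Callable, List, Optional, Type


def retry_on_specific_exceptions(
    on_exceptions: List[Type[Exception]],
    max_retries: Optional[int] = None,
    backoff_time: float = 3.0,        # assumed default, mirrors the old loop
    backoff_multiplier: float = 1.5,  # assumed default, mirrors the old loop
    on_exception_callback: Optional[Callable[[Exception, float], None]] = None,
):
    """Retry the wrapped callable whenever one of `on_exceptions` is raised;
    `max_retries=None` retries forever."""
    def decorator(func: Callable):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            sleep_time = backoff_time
            attempt = 0
            while True:
                try:
                    return func(*args, **kwargs)
                except tuple(on_exceptions) as e:
                    attempt += 1
                    if max_retries is not None and attempt > max_retries:
                        raise
                    if on_exception_callback is not None:
                        on_exception_callback(e, sleep_time)
                    time.sleep(sleep_time)
                    sleep_time *= backoff_multiplier
        return wrapper
    return decorator
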
@register_model("anthropic")
......@@ -141,6 +148,14 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
raise NotImplementedError("No support for logits.")
def generate_until(self, requests) -> List[str]:
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e .[anthropic]`",
)
if not requests:
return []
......
import random
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
......
import requests
import logging
import time
from tqdm import tqdm
import requests
from requests.exceptions import RequestException
from tqdm import tqdm
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
logger = logging.getLogger(__name__)
......
import copy
import os
from packaging import version
from pathlib import Path
from typing import List, Literal, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import transformers
from accelerate import Accelerator, DistributedType, find_executable_batch_size
from packaging import version
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
from tqdm import tqdm
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
)
from peft import __version__ as PEFT_VERSION, PeftModel
import copy
from collections import defaultdict
from tqdm import tqdm
from pathlib import Path
import torch.nn.functional as F
from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.utils import Collator, stop_sequences_criteria
from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria
from accelerate import Accelerator, find_executable_batch_size, DistributedType
from typing import List, Optional, Union, Tuple, Literal
eval_logger = utils.eval_logger
......@@ -107,9 +105,7 @@ class HFLM(LM):
eval_logger.warning(
"`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way."
)
assert (
not parallelize
), "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`"
assert not parallelize, "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`"
self._model = pretrained
self._device = self._model.device
......@@ -145,9 +141,7 @@ class HFLM(LM):
+ [f"cuda:{i}" for i in range(torch.cuda.device_count())]
+ ["mps", "mps:0"]
)
if device:
if device not in device_list:
device = int(device)
if device and device in device_list:
self._device = torch.device(device)
eval_logger.info(f"Using device '{device}'")
if device in ("mps", "mps:0") and version.parse(
......@@ -170,7 +164,7 @@ class HFLM(LM):
f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
)
# TODO: include in warning that `load_in_8bit` etc. affect this too
self._device = device
self._device = torch.device(device)
# TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "")
......@@ -207,7 +201,7 @@ class HFLM(LM):
self.model.eval()
self.model.tie_weights()
if (gpus >= 1 or self.device.type == "mps") and isinstance(pretrained, str):
if isinstance(pretrained, str) and (gpus >= 1 or str(self.device) == "mps"):
if not (parallelize or autogptq or ("device_map" in kwargs)):
# place model onto device requested manually,
# if not using HF Accelerate or device_map
......@@ -238,7 +232,7 @@ class HFLM(LM):
elif self.tokenizer.eos_token:
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
else:
if "Qwen" in pretrained:
if self.config.model_type == "qwen":
# Qwen's trust_remote_code tokenizer does not allow for adding special tokens
self.tokenizer.pad_token = "<|endoftext|>"
else:
......@@ -279,10 +273,13 @@ class HFLM(LM):
"with 'accelerate launch *script*'. "
f"Current run will proceed with {accelerator.num_processes} devices."
)
assert accelerator.distributed_type in [
DistributedType.FSDP,
DistributedType.MULTI_GPU,
], "Unsupported distributed type provided. Only DDP and FSDP are supported."
assert (
accelerator.distributed_type
in [
DistributedType.FSDP,
DistributedType.MULTI_GPU,
]
), "Unsupported distributed type provided. Only DDP and FSDP are supported."
if accelerator.distributed_type == DistributedType.FSDP:
self._model = accelerator.prepare(self.model)
else:
......@@ -417,7 +414,6 @@ class HFLM(LM):
revision: str = "main",
trust_remote_code: bool = False,
) -> None:
self._config = transformers.AutoConfig.from_pretrained(
pretrained,
revision=revision,
......@@ -635,7 +631,7 @@ class HFLM(LM):
padding_side: str = "left",
left_truncate_len: int = None,
truncation: bool = False,
) -> Tuple[List[int], List[int]]:
) -> Tuple[torch.Tensor, torch.Tensor]:
# encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
old_padding_side = self.tokenizer.padding_side
self.tokenizer.padding_side = padding_side
......@@ -751,8 +747,9 @@ class HFLM(LM):
for context, continuation in [req.args for req in requests]:
if context == "":
# end of text as context
context_enc, continuation_enc = [self.eot_token_id], self.tok_encode(
continuation
context_enc, continuation_enc = (
[self.eot_token_id],
self.tok_encode(continuation),
)
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
......@@ -844,6 +841,7 @@ class HFLM(LM):
res = []
def _collate(x):
"""Defines the key for the sorted method"""
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
......@@ -854,26 +852,27 @@ class HFLM(LM):
toks = x[1] + x[2]
return -len(toks), tuple(toks)
re_ord = utils.Reorderer(requests, _collate)
re_ord = Collator(requests, sort_fn=_collate)
n_reordered_requests = len(re_ord.get_reordered())
# automatic (variable) batch size detection for vectorization
# pull longest context sample from request
chunks = utils.chunks(
re_ord.get_reordered(),
n=self.batch_size
n_reordered_requests = len(re_ord)
batch_size = (
self.batch_size
if self.batch_size != "auto"
else override_bs
if override_bs is not None
else 0,
fn=self._batch_scheduler
else 0
)
batch_fn = (
self._batch_scheduler
if self.batch_size == "auto"
and n_reordered_requests > 0
and not override_bs
else None,
else None
)
chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn)
pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
for chunk in chunks:
inps = []
......@@ -995,9 +994,7 @@ class HFLM(LM):
greedy_tokens = logits.argmax(dim=-1)
cont_toks = torch.tensor(
cont_toks, dtype=torch.long, device=self.device
).unsqueeze(
0
) # [1, seq]
).unsqueeze(0) # [1, seq]
max_equal = (greedy_tokens == cont_toks).all()
# Obtain log-probs at the corresponding continuation token indices
......@@ -1019,10 +1016,10 @@ class HFLM(LM):
return re_ord.get_original(res)
def generate_until(self, requests: List[Instance]) -> List[str]:
res = defaultdict(list)
re_ords = {}
res = []
def _collate(x):
"""Defines the key for the sorted method"""
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
......@@ -1032,14 +1029,6 @@ class HFLM(LM):
toks = self.tok_encode(x[0])
return -len(toks), x[0]
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
grouper = utils.Grouper(requests, lambda x: str(x.args[1]))
for key, reqs in grouper.get_grouped().items():
# within each set of reqs for given kwargs, we reorder by token length, descending.
re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate)
pbar = tqdm(total=len(requests), disable=(self.rank != 0))
if self.batch_size == "auto":
# using rolling window with maximum context
......@@ -1048,98 +1037,102 @@ class HFLM(LM):
print(f"Determined Largest batch size: {batch_size}")
adaptive_batch_size = batch_size
# for each different set of kwargs, we execute all requests, by batch.
for key, re_ord in re_ords.items():
chunks = utils.chunks(
re_ord.get_reordered(),
n=self.batch_size
if self.batch_size != "auto"
else adaptive_batch_size
if adaptive_batch_size is not None
else 0,
fn=self._batch_scheduler
if self.batch_size == "auto" and not adaptive_batch_size
else None,
)
for chunk in chunks:
contexts, all_gen_kwargs = zip(*chunk)
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
until = None
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {kwargs}"
)
if not until:
until = [self.tok_decode(self.eot_token_id)]
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
batch_size = (
self.batch_size
if self.batch_size != "auto"
else adaptive_batch_size
if adaptive_batch_size is not None
else 0
)
batch_fn = (
self._batch_scheduler
if self.batch_size == "auto" and not adaptive_batch_size
else None
)
# set the max length in tokens of inputs ("context_enc")
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
# max len for inputs = max length, minus room to generate the max new tokens
max_ctx_len = self.max_length - max_gen_toks
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
# max len for inputs = encoder's whole max_length
max_ctx_len = self.max_length
# encode, pad, and truncate contexts for this batch
context_enc, attn_masks = self.tok_batch_encode(
contexts,
left_truncate_len=max_ctx_len,
truncation=self.truncation,
)
context_enc = context_enc.to(self.device)
attn_masks = attn_masks.to(self.device)
if "max_length" not in kwargs:
kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
# perform batched generation
cont = self._model_generate(
context=context_enc,
attention_mask=attn_masks,
stop=until,
**kwargs,
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
re_ords = Collator([reg.args for reg in requests], _collate, grouping=True)
chunks = re_ords.get_batched(n=batch_size, batch_fn=batch_fn)
for chunk in chunks:
contexts, all_gen_kwargs = zip(*chunk)
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
until = None
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {kwargs}"
)
if not until:
until = [self.tok_decode(self.eot_token_id)]
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
cont_toks_list = cont.tolist()
for cont_toks, context in zip(cont_toks_list, contexts):
# discard context + left-padding toks if using causal decoder-only LM
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
cont_toks = cont_toks[context_enc.shape[1] :]
# set the max length in tokens of inputs ("context_enc")
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
# max len for inputs = max length, minus room to generate the max new tokens
max_ctx_len = self.max_length - max_gen_toks
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
# max len for inputs = encoder's whole max_length
max_ctx_len = self.max_length
# encode, pad, and truncate contexts for this batch
context_enc, attn_masks = self.tok_batch_encode(
contexts,
left_truncate_len=max_ctx_len,
truncation=self.truncation,
)
context_enc = context_enc.to(self.device)
attn_masks = attn_masks.to(self.device)
s = self.tok_decode(cont_toks)
if "max_length" not in kwargs:
kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc
for term in until:
if len(term) > 0:
# ignore '' separator,
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
s = s.split(term)[0]
# perform batched generation
cont = self._model_generate(
context=context_enc,
attention_mask=attn_masks,
stop=until,
**kwargs,
)
res[key].append(s)
cont_toks_list = cont.tolist()
for cont_toks, context in zip(cont_toks_list, contexts):
# discard context + left-padding toks if using causal decoder-only LM
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
cont_toks = cont_toks[context_enc.shape[1] :]
self.cache_hook.add_partial(
"generate_until", (context, gen_kwargs), s
)
pbar.update(1)
# reorder this group of results back to original unsorted form
res[key] = re_ord.get_original(res[key])
s = self.tok_decode(cont_toks)
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc
for term in until:
if len(term) > 0:
# ignore '' separator,
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
s = s.split(term)[0]
res.append(s)
self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
pbar.update(1)
# reorder this group of results back to original unsorted form
res = re_ords.get_original(res)
pbar.close()
return grouper.get_original(res)
return res
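
Both the loglikelihood and generation paths above now route sorting, grouping, and batching through the `Collator` helper from `lm_eval.utils` (replacing `utils.Grouper`, `utils.Reorderer`, and `utils.chunks`). The following is a self-contained stand-in for that sort, batch, restore-order flow, written only from the calls visible in this diff (`Collator(arr, sort_fn, grouping=...)`, `len(...)`, `get_batched(n, batch_fn)`, `get_original(...)`); it is not the library's actual implementation, and the batch-scheduler signature is a placeholder.

from typing import Any, Callable, Iterator, List, Optional


class SimpleCollator:
    """Sort requests by a key, hand them out in batches, then map results back
    to their original order (grouping by generation kwargs is omitted here)."""

    def __init__(self, arr: List[Any], sort_fn: Callable[[Any], Any], grouping: bool = False):
        # remember each item's original index so results can be re-ordered later
        self._indexed = sorted(enumerate(arr), key=lambda pair: sort_fn(pair[1]))
        self._grouping = grouping

    def __len__(self) -> int:
        return len(self._indexed)

    def get_batched(
        self, n: int = 1, batch_fn: Optional[Callable[[int, List[Any]], int]] = None
    ) -> Iterator[List[Any]]:
        items = [x for _, x in self._indexed]
        i = 0
        while i < len(items):
            # a batch scheduler may grow the batch size as the (sorted) sequences get shorter
            size = batch_fn(i, items) if batch_fn is not None else n
            size = max(size, 1)
            yield items[i : i + size]
            i += size

    def get_original(self, results: List[Any]) -> List[Any]:
        ordered: List[Any] = [None] * len(results)
        for (orig_idx, _), res in zip(self._indexed, results):
            ordered[orig_idx] = res
        return ordered
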
from typing import Optional, Union
import torch
from lm_eval import utils
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
@register_model("mamba_ssm")
class MambaLMWrapper(HFLM):
def __init__(
self,
pretrained="state-spaces/mamba-130m",
**kwargs,
) -> None:
"""
Mamba (via the `mamba_ssm` package) supports the following args:
```
d_model: int,
n_layer: int,
vocab_size: int,
initializer_cfg=None,
pad_vocab_size_multiple: int = 1,
ssm_cfg=None,
norm_epsilon: float = 1e-5,
rms_norm: bool = False,
fused_add_norm=False,
residual_in_fp32=False,
```
See https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py#L175 for more info.
The above can all be passed via `--model_args` or to this __init__() directly
but we recommend placing many of these within the config.json file uploaded alongside your
Mamba model to the HF Hub instead.
All other HuggingFace from_pretrained() kwargs
such as those related to
`parallelize=True`, PEFT, autoGPTQ,
or any sub-configurations of these advanced args,
are unsupported by the `mamba_ssm` package.
The HFLM arguments
`backend`, `revision`, `subfolder`, `tokenizer`, `truncation`, `max_length`,
`device`, `dtype`, `batch_size`, `max_batch_size`, `trust_remote_code`, `use_fast_tokenizer`
are all supported by Mamba where they do not conflict
with Mamba-specific restrictions such as causal LMs only.
"""
if "backend" in kwargs:
# mamba currently only supports causal models
assert kwargs["backend"] == "causal"
super().__init__(
pretrained=pretrained,
# set appropriate defaults for tokenizer, max length, etc
backend=kwargs.get("backend", "causal"),
tokenizer=kwargs.get("tokenizer", "EleutherAI/gpt-neox-20b"),
max_length=kwargs.get("max_length", 2048),
**kwargs,
)
def _get_config(
self,
pretrained: str,
**kwargs,
) -> None:
try:
from mamba_ssm.utils.hf import load_config_hf # noqa: F811
except ModuleNotFoundError:
raise Exception(
"attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
)
self._config = load_config_hf(pretrained)
def _create_model(
self,
pretrained: str,
dtype: Optional[Union[str, torch.dtype]] = "float16",
# no `parallelize=True` options
# no PEFT and quantization options
# Mamba does not support arbitrary HF from_pretrained() args
**kwargs,
) -> None:
try:
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel # noqa: F811
except ModuleNotFoundError:
raise Exception(
"attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
)
self._model = MambaLMHeadModel.from_pretrained(
pretrained,
device=self._device,
dtype=torch.float16 if dtype == "auto" else utils.get_dtype(dtype),
**kwargs,
)
def _model_generate(self, context, max_length, stop, **generation_kwargs):
for key in ("do_sample", "attention_mask"):
if key in generation_kwargs:
generation_kwargs.pop(key)
# mamba's custom GenerationMixin currently does not support
# passing stopping criteria.
# for the time being, we simply generate to max length,
# then truncate (equivalent result)
# -- this should be revisited to speed up generation
# stopping_criteria = stop_sequences_criteria(
# self.tokenizer, stop, 1, context.shape[0]
# )
return self.model.generate(
input_ids=context,
max_length=max_length,
# stopping_criteria=stopping_criteria,
# pad_token_id=self.tokenizer.pad_token_id,
# use_cache=True,
**generation_kwargs,
)
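
Since `mamba_ssm`'s generation loop cannot take stopping criteria (per the comment above), generation runs to `max_length` and stop sequences are applied afterwards, just as `generate_until` trims decoded text with `until`. A small hypothetical helper illustrating that post-hoc truncation (not part of this diff):

from typing import List


def truncate_at_stop_sequences(text: str, stop: List[str]) -> str:
    """Keep only the text generated before the earliest stop sequence."""
    for term in stop:
        if term:  # ignore '' separators (e.g. an EOT token that decodes to '')
            text = text.split(term)[0]
    return text


# truncate_at_stop_sequences("42\nQ: next question", ["\n", "Q:"]) returns "42"
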
import os
import time
from typing import List, Tuple, Optional
import copy
import os
from collections import defaultdict
from importlib.util import find_spec
from typing import List, Optional, Tuple
from tqdm import tqdm
from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.utils import retry_on_specific_exceptions
def get_result(response, ctxlen: int) -> Tuple[float, bool]:
......@@ -44,24 +45,28 @@ def oa_completion(**kwargs):
Retry with back-off until they respond
"""
try:
import openai, tiktoken # noqa: E401
except ModuleNotFoundError:
if not find_spec("openai") or not find_spec("tiktoken"):
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. "
"Please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`"
)
else:
import openai
backoff_time = 3
while True:
try:
return openai.completions.create(**kwargs)
except openai.OpenAIError:
import traceback
def _exception_callback(e: Exception, sleep_time: float) -> None:
import traceback
traceback.print_exc()
traceback.print_exc()
time.sleep(backoff_time)
backoff_time *= 1.5
@retry_on_specific_exceptions(
on_exceptions=[openai.OpenAIError],
max_retries=None, # retry forever, consider changing
on_exception_callback=_exception_callback,
)
def completion():
return openai.completions.create(**kwargs)
return completion()
@register_model("openai-completions")
......@@ -88,7 +93,8 @@ class OpenaiCompletionsLM(LM):
super().__init__()
self.seed = seed
try:
import openai, tiktoken # noqa: E401
import openai # noqa: E401
import tiktoken
except ModuleNotFoundError:
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
......@@ -102,7 +108,7 @@ class OpenaiCompletionsLM(LM):
self._max_gen_toks = max_gen_toks
self._max_length = max_length
# Read from environment variable OPENAI_API_SECRET_KEY
# Read from environment variable OPENAI_API_KEY
openai.api_key = os.environ["OPENAI_API_KEY"]
@property
......@@ -154,8 +160,9 @@ class OpenaiCompletionsLM(LM):
for context, continuation in [req.args for req in requests]:
if context == "":
# end of text as context
context_enc, continuation_enc = [self.eot_token_id], self.tok_encode(
continuation
context_enc, continuation_enc = (
[self.eot_token_id],
self.tok_encode(continuation),
)
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
......@@ -247,6 +254,7 @@ class OpenaiCompletionsLM(LM):
list(sameuntil_chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE))
):
inps = []
self._max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks)
for context, _ in chunk:
context_enc = self.tok_encode(context)
inp = context_enc[-(self.max_length - self.max_gen_toks) :]
......@@ -326,68 +334,68 @@ def oa_chat_completion(client, **kwargs):
Retry with back-off until they respond
"""
try:
import openai, tiktoken # noqa: E401
except ModuleNotFoundError:
if not find_spec("openai") or not find_spec("tiktoken"):
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. "
"Please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`"
)
else:
import openai
async def _get_completions(**kwargs):
chat_completions = await client.chat.completions.create(**kwargs)
return chat_completions
def _exception_callback(e: Exception, sleep_time: float) -> None:
import traceback
backoff_time = 3
while True:
try:
return client.chat.completions.create(**kwargs)
except openai.OpenAIError:
import traceback
traceback.print_exc()
@retry_on_specific_exceptions(
on_exceptions=[openai.OpenAIError],
max_retries=None, # retry forever, consider changing
on_exception_callback=_exception_callback,
)
def completion():
return client.chat.completions.create(**kwargs)
traceback.print_exc()
time.sleep(backoff_time)
backoff_time *= 1.5
return completion()
@register_model("openai-chat-completions")
@register_model("openai-chat-completions", "local-chat-completions")
class OpenaiChatCompletionsLM(LM):
def __init__(
self, model: str = "gpt-3.5-turbo", truncate: bool = False, batch_size: int = 1
self,
model: str = "gpt-3.5-turbo", # GPT model or Local model using HuggingFace model paths
base_url: str = None,
truncate: bool = False,
**kwargs,
) -> None:
"""
:param model: str
OpenAI API model name (e.g. gpt-3.5-turbo), or a HuggingFace model path when
targeting a locally-hosted, OpenAI-compatible server. Implements an
OpenAI-style chat completion API for both OpenAI and local models,
using the **gen_kwargs passed on init
:param truncate: bool
Truncate input if too long (if False and input is too long, throw error)
"""
super().__init__()
try:
import openai, tiktoken # noqa: E401
import openai # noqa: E401
except ModuleNotFoundError:
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
)
self.model = model
self.frequency_penalty = 0
self.logit_bias = None
self.n = 1
self.presence_penalty = 0
self.temperature = 1
self.top_p = 1
self.tokenizer = tiktoken.encoding_for_model(self.model)
self.vocab_size = self.tokenizer.n_vocab
self.base_url = base_url
self.truncate = truncate
self.end_of_text_token_id = self.tokenizer.eot_token
# Read from environment variable OPENAI_API_KEY
self.client = openai.OpenAI() # openai.AsyncOpenAI()
@property
def eot_token_id(self):
return self.end_of_text_token_id
# Set to EMPTY for local
if self.base_url:
self.client = openai.OpenAI(base_url=self.base_url)
else:
self.client = openai.OpenAI() # openai.AsyncOpenAI()
@property
def max_length(self) -> int:
......@@ -408,53 +416,19 @@ class OpenaiChatCompletionsLM(LM):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def tok_encode(self, string: str) -> List[int]:
return self.tokenizer.encode(string)
def tok_decode(self, tokens: List[int]) -> str:
return self.tokenizer.decode(tokens)
def _encode_pair(
self, context: str, continuation: str
) -> Tuple[List[int], List[int]]:
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc
def generate_until(self, requests) -> List[str]:
res = defaultdict(list)
re_ords = {}
def _collate(x):
toks = self.tok_encode(x[0])
return -len(toks), x[0]
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
grouper = utils.Grouper(requests, lambda x: str(x.args[1]))
for key, reqs in grouper.get_grouped().items():
# within each set of reqs for given kwargs, we reorder by token length, descending.
re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate)
def sameuntil_chunks(xs, size):
ret = []
lastuntil = xs[0][1]
for x in xs:
if len(ret) >= size or x[1] != lastuntil:
yield ret, lastuntil
ret = []
lastuntil = x[1]
ret.append(x)
if ret:
yield ret, lastuntil
re_ords[key] = utils.Reorderer(
[req.args for req in reqs], lambda x: (-len(x[0]), x[0])
)
pbar = tqdm(total=len(requests), disable=(self.rank != 0))
for key, re_ord in re_ords.items():
......@@ -468,37 +442,26 @@ class OpenaiChatCompletionsLM(LM):
gen_kwargs = all_gen_kwargs[0]
until = None
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
if isinstance(kwargs := copy.deepcopy(gen_kwargs), dict):
if "do_sample" in kwargs.keys():
kwargs.pop("do_sample")
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
f"Expected repr(kwargs['until']) to be of type Union[str, list] but got {until}"
)
kwargs["stop"] = until
kwargs["max_tokens"] = kwargs.pop("max_gen_toks", self.max_gen_toks)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {kwargs}"
f"Expected repr(kwargs) to be of type repr(dict) but got {kwargs}"
)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
response = oa_chat_completion(
client=self.client,
messages=inps,
model=self.model,
frequency_penalty=self.frequency_penalty,
# logit_bias=self.logit_bias,
max_tokens=max_gen_toks,
n=self.n,
presence_penalty=self.presence_penalty,
temperature=self.temperature,
top_p=self.top_p,
client=self.client, messages=inps, model=self.model, **kwargs
)
for resp, (context, args_) in zip(response.choices, chunk):
......
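
With the dual registration `@register_model("openai-chat-completions", "local-chat-completions")` and the new `base_url` argument, the same class can target any OpenAI-compatible server. A hedged usage sketch follows; `get_model` is assumed to be the registry accessor, and the endpoint, key value, and model name are placeholders, not values from this diff.

import os

from lm_eval.api.registry import get_model  # assumed accessor for registered model classes

# Local OpenAI-compatible servers typically accept any non-empty key ("EMPTY" by convention).
os.environ["OPENAI_API_KEY"] = "EMPTY"

LocalChat = get_model("local-chat-completions")
lm = LocalChat(
    model="my-org/my-local-model",        # placeholder HuggingFace-style model name
    base_url="http://localhost:8000/v1",  # placeholder OpenAI-compatible endpoint
)
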
......@@ -13,11 +13,13 @@ Homepage: https://textsynth.com/index.html
"""
import logging
import os
import requests as _requests
import time
from tqdm import tqdm
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.utils import retry_on_specific_exceptions
logger = logging.getLogger(__name__)
......@@ -27,21 +29,26 @@ def textsynth_completion(**kwargs):
"""Query TextSynth API for completion.
Retry with back-off until they respond.
"""
backoff_time = 3
while True:
try:
return _requests.post(**kwargs)
except _requests.exceptions.RequestException:
import traceback
traceback.print_exc()
time.sleep(backoff_time)
backoff_time *= 1.5
def _exception_callback(e: Exception, sleep_time: float) -> None:
import traceback
traceback.print_exc()
@retry_on_specific_exceptions(
on_exceptions=[_requests.exceptions.RequestException],
max_retries=None, # retry forever, consider changing
on_exception_callback=_exception_callback,
)
def completion():
return _requests.post(**kwargs)
return completion()
@register_model("textsynth")
class TextSynthLM(LM):
def __init__(self, engine, truncate: bool = False) -> None:
def __init__(self, engine, truncate: bool = False, **kwargs) -> None:
"""
:param engine: str
TextSynth API engine (e.g. `gptj_6B`)
......@@ -149,7 +156,7 @@ class TextSynthLM(LM):
self.cache_hook.add_partial("generate_until", (inp, request_args), s)
else:
logger.error(
f"The following response does not contain generated `text`. "
"The following response does not contain generated `text`. "
"Got:\n{resp}"
)
assert False
......
import copy
from collections import defaultdict
from typing import List, Tuple, Optional, Literal, Union, Any
from transformers import AutoTokenizer
from importlib.util import find_spec
from typing import List, Literal, Optional, Tuple, Union
from tqdm import tqdm
from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
import copy
from tqdm import tqdm
from lm_eval.api.registry import register_model
from lm_eval import utils
try:
from vllm import LLM, SamplingParams
from ray.util.multiprocessing import Pool
from vllm import LLM, SamplingParams
from vllm.transformers_utils.tokenizer import get_tokenizer
except ModuleNotFoundError:
pass
......@@ -54,12 +57,10 @@ class VLLM(LM):
):
super().__init__()
try:
import vllm
except ModuleNotFoundError:
if not find_spec("vllm"):
raise Exception(
"attempted to use 'vllm' LM type, but package `vllm` is not installed. \
please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`",
"attempted to use 'vllm' LM type, but package `vllm` is not installed. "
"Please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
)
assert "cuda" in device or device is None, "vLLM only supports CUDA"
......@@ -95,7 +96,8 @@ please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
trust_remote_code=trust_remote_code,
tokenizer_revision=tokenizer_revision,
)
self.batch_size = batch_size
self.batch_size = "auto" if batch_size.startswith("auto:") else batch_size
self._max_gen_toks = max_gen_toks
@property
......@@ -193,8 +195,9 @@ please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
for context, continuation in [req.args for req in requests]:
if context == "":
# end of text as context
context_enc, continuation_enc = [self.eot_token_id], self.tok_encode(
continuation
context_enc, continuation_enc = (
[self.eot_token_id],
self.tok_encode(continuation),
)
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
......
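
The HF, OpenAI-completions, and vLLM backends above all share the same request-encoding pattern: an empty context is replaced by the EOT token, and otherwise `_encode_pair` tokenizes context and continuation jointly so that trailing whitespace stays attached to the continuation (most BPE tokenizers fold a leading space into the following token). A hedged sketch of that pairing logic with a caller-supplied `tok_encode`:

from typing import Callable, List, Tuple


def encode_pair(
    context: str, continuation: str, tok_encode: Callable[[str], List[int]]
) -> Tuple[List[int], List[int]]:
    # move trailing spaces from the context onto the continuation so the joint
    # encoding matches how the model will actually see the text at inference time
    n_spaces = len(context) - len(context.rstrip())
    if n_spaces > 0:
        continuation = context[-n_spaces:] + continuation
        context = context[:-n_spaces]

    whole_enc = tok_encode(context + continuation)
    context_enc = tok_encode(context)
    # the continuation's token ids are whatever the joint encoding adds after the context
    continuation_enc = whole_enc[len(context_enc):]
    return context_enc, continuation_enc
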
......@@ -69,7 +69,6 @@ def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None
def load_prompt_list(
use_prompt: str, dataset_name=None, subset_name=None, yaml_path=None, **kwargs
):
category_name, prompt_name = use_prompt.split(":")
if category_name == "promptsource":
......@@ -113,7 +112,6 @@ class PromptString:
self.prompt_string = prompt_string
def apply(self, doc):
doc_to_text = self.prompt_string["doc_to_text"]
doc_to_target = self.prompt_string["doc_to_target"]
......
......@@ -131,6 +131,9 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None:
"""
Walk `task_dir` and register every task config found; configs that fail to load are logged and skipped.
"""
# Track whether any tasks failed during loading
import_fail = False
for root, subdirs, file_list in os.walk(task_dir):
# if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
for f in file_list:
......@@ -155,20 +158,27 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None:
# Log this silently and show it only when
# the user defines the appropriate verbosity.
except ModuleNotFoundError as e:
except (ImportError, ModuleNotFoundError) as e:
import_fail = True
eval_logger.debug(
f"{yaml_path}: {e}. Config will not be added to registry."
)
except Exception as error:
import traceback
eval_logger.debug(
"Failed to load config in\n"
eval_logger.warning(
"Unexpected error loading config in\n"
f" {yaml_path}\n"
" Config will not be added to registry\n"
f" Error: {error}\n"
f" Traceback: {traceback.format_exc()}"
)
if import_fail:
eval_logger.warning(
"Some tasks could not be loaded due to missing dependencies."
" Run with `--verbosity DEBUG` for full details."
)
return 0
......@@ -180,7 +190,6 @@ def include_path(task_dir):
def initialize_tasks(verbosity="INFO"):
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
......
......@@ -23,4 +23,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
version: 1.0
......@@ -20,4 +20,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
version: 1.0
......@@ -13,4 +13,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
version: 1.0
......@@ -11,4 +11,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
version: 1.0
......@@ -17,4 +17,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
- version: 0.0
version: 0.0
......@@ -24,7 +24,6 @@ def parse_args():
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
......@@ -37,7 +36,6 @@ if __name__ == "__main__":
dataset_path = "lukaemon/bbh"
for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()):
resp = requests.get(
f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/cot-prompts/{task}.txt"
).content.decode("utf-8")
......
......@@ -27,4 +27,4 @@ filter_list:
- function: "take_first"
num_fewshot: 0
metadata:
- version: 1.0
version: 1.0
......@@ -24,4 +24,4 @@ filter_list:
- function: "take_first"
num_fewshot: 0
metadata:
- version: 0
version: 0
......@@ -18,4 +18,4 @@ generation_kwargs:
temperature: 0.0
num_fewshot: 0
metadata:
- version: 0
version: 0
......@@ -18,4 +18,4 @@ generation_kwargs:
temperature: 0.0
num_fewshot: 0
metadata:
- version: 0
version: 0