gaoqiong / lm-evaluation-harness · Commit 1f97a945

Authored Jul 23, 2025 by Baber
Commit message: types
Parent: 0087929e
Showing 4 changed files with 273 additions and 278 deletions (+273 / -278):

  lm_eval/models/api_models.py           +67   -72
  lm_eval/models/huggingface.py          +138  -141
  lm_eval/models/openai_completions.py   +26   -25
  lm_eval/models/vllm_causallms.py       +42   -40
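This commit is a typing-modernization pass over the four model backends: `Optional[...]`/`Union[...]` and the capitalized `typing` generics (`List`, `Dict`, `Tuple`) are rewritten as PEP 604 unions (`X | None`, `X | Y`) and PEP 585 builtin generics (`list`, `dict`, `tuple`), and `from __future__ import annotations` is added at the top of each file so the new spelling stays valid on interpreters that predate those PEPs. A minimal before/after sketch of the pattern (the function and types below are illustrative, not taken from the diff):

from __future__ import annotations  # PEP 563: annotations are stored as strings, not evaluated


# before: typing-module generics and Optional
#   def first_match(ids: Optional[List[int]], table: Dict[int, str]) -> Optional[str]: ...

# after: builtin generics (PEP 585) and | unions (PEP 604)
def first_match(ids: list[int] | None, table: dict[int, str]) -> str | None:
    """Return the label of the first id present in `table`, or None."""
    if not ids:
        return None
    for i in ids:
        if i in table:
            return table[i]
    return None


print(first_match([3, 7], {7: "seven"}))  # seven

Because the `__future__` import defers evaluation, the new syntax only has to parse; it does not need `list[int]` (3.9+) or `X | Y` (3.10+) to be executable at runtime, which is what makes the whole-file rewrite safe with a single added import.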
lm_eval/models/api_models.py

+from __future__ import annotations
+
 import abc
 import asyncio
 import copy
...
@@ -8,16 +10,9 @@ from functools import cached_property
 from typing import (
     TYPE_CHECKING,
     Any,
-    Awaitable,
     Callable,
-    Dict,
-    Iterable,
-    List,
     Literal,
     NamedTuple,
-    Optional,
-    Tuple,
-    Union,
 )
...
@@ -36,18 +31,21 @@ from importlib.util import find_spec
 from io import BytesIO

 from lm_eval import utils
-from lm_eval.api.instance import Instance
 from lm_eval.api.model import TemplateLM
 from lm_eval.models.utils import Collator, chunks, configure_pad_token


 if TYPE_CHECKING:
+    from collections.abc import Awaitable, Iterable
+
     from PIL import Image
+
+    from lm_eval.api.instance import Instance


 eval_logger = logging.getLogger(__name__)

-LogLikelihoodInputs = Tuple[Tuple[str, str], List[int], List[int]]
+LogLikelihoodInputs = tuple[tuple[str, str], list[int], list[int]]


 # utility class to keep track of json encoded chats
...
@@ -58,9 +56,7 @@ class JsonChatStr(NamedTuple):
         return self.prompt.encode(encoding)


 def create_image_prompt(
-    imgs: list[Image.Image], chat: dict, fmt: str = "PNG"
+    imgs: list["Image.Image"], chat: dict, fmt: str = "PNG"
 ) -> dict:
     """
     Parameters
...
@@ -109,33 +105,32 @@ class TemplateAPI(TemplateLM):
         model: str = None,
         pretrained: str = None,  # `model` takes precedence over `pretrained` when passed.
         base_url: str = None,
-        tokenizer: Optional[str] = None,
+        tokenizer: str | None = None,
         # Loglikelihood tasks require a tokenizer to calculate context lengths,
         # however the requests can be sent as a string if the API doesn't support token inputs.
         # use tokenized_requests=False
-        tokenizer_backend: Optional[
-            Literal["tiktoken", "huggingface", "None", "none"]
-        ] = "huggingface",
+        tokenizer_backend: Literal["tiktoken", "huggingface", "None", "none"]
+        | None = "huggingface",
         truncate: bool = False,
         # number of concurrent requests. More useful if not batching
         num_concurrent: int = 1,
         max_retries: int = 3,
         max_gen_toks: int = 256,
-        batch_size: Union[str, int] = 1,
+        batch_size: str | int = 1,
         seed: int = 1234,
-        max_length: Optional[int] = 2048,
+        max_length: int | None = 2048,
         add_bos_token: bool = False,
         custom_prefix_token_id: int = None,
         # send the requests as tokens or strings
         tokenized_requests: bool = True,
         trust_remote_code: bool = False,
-        revision: Optional[str] = "main",
+        revision: str | None = "main",
         use_fast_tokenizer: bool = True,
         verify_certificate: bool = True,
         eos_string: str = None,
         # timeout in seconds
         timeout: int = 300,
-        header: Optional[Dict[str, str]] = None,
+        header: dict[str, str] | None = None,
         max_images: int = 1,
         **kwargs,
     ) -> None:
...
@@ -232,12 +227,12 @@ class TemplateAPI(TemplateLM):
     @abc.abstractmethod
     def _create_payload(
         self,
-        messages: Union[List[List[int]], List[dict], List[str], str],
+        messages: list[list[int]] | list[dict] | list[str] | str,
         *,
         generate: bool = True,
-        gen_kwargs: Optional[dict] = None,
+        gen_kwargs: dict | None = None,
         seed: int = 1234,
-        eos: str = None,
+        eos: str | None = None,
         **kwargs,
     ) -> dict:
         """This method is responsible for creating the json payload that will be sent to the API."""
...
@@ -245,9 +240,9 @@ class TemplateAPI(TemplateLM):
     def create_message(
         self,
-        messages: Union[List[List[int]], List[str], List[JsonChatStr]],
+        messages: list[list[int]] | list[str] | list[JsonChatStr],
         generate=False,
-    ) -> Union[List[List[int]], List[dict], List[str], str]:
+    ) -> list[list[int]] | list[dict] | list[str] | str:
         """Helper method to transform the prompt into the expected API input format. messages consist of batched requests"""
         if isinstance(messages[0], JsonChatStr):
             # for chat completions we need to decode the json string to list[dict,...]
...
@@ -276,17 +271,17 @@ class TemplateAPI(TemplateLM):
     @staticmethod
     @abc.abstractmethod
     def parse_logprobs(
-        outputs: Union[Any, List[Any]],
-        tokens: List[List[int]] = None,
-        ctxlen: List[int] = None,
+        outputs: Any | list[Any],
+        tokens: list[list[int]] | None = None,
+        ctxlen: list[int] | None = None,
         **kwargs,
-    ) -> List[Tuple[float, bool]]:
+    ) -> list[tuple[float, bool]]:
         """Method used to parse the logprobs from the (batched) API response. This method should return a list of tuples"""
         raise NotImplementedError

     @staticmethod
     @abc.abstractmethod
-    def parse_generations(outputs: Union[Any, List[Any]], **kwargs) -> List[str]:
+    def parse_generations(outputs: Any | list[Any], **kwargs) -> list[str]:
         """Method used to parse the generations from the (batched) API response. This method should return a list of str"""
         raise NotImplementedError
...
@@ -303,14 +298,15 @@ class TemplateAPI(TemplateLM):
     @property
     def tokenizer_name(self) -> str:
         """Must be defined for LM subclasses which implement Chat Templating.
         Should return the name of the tokenizer or chat template used.
         Used only to properly fingerprint caches when requests are being cached with `--cache_requests`, otherwise not used.
         """
         return ""

     def apply_chat_template(
-        self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
-    ) -> Union[str, JsonChatStr]:
+        self, chat_history: list[dict[str, str]], add_generation_prompt: bool = True
+    ) -> str | JsonChatStr:
         """Applies a chat template to a list of chat history between user and model."""
         if self.tokenizer_backend == "huggingface" and self.tokenized_requests:
             return self.tokenizer.apply_chat_template(
...
@@ -319,7 +315,6 @@ class TemplateAPI(TemplateLM):
                 add_generation_prompt=add_generation_prompt,
                 continue_final_message=not add_generation_prompt,
             )
-        else:
             # bit of a hack. We'll load back before sending to the API
             return JsonChatStr(
                 json.dumps(
...
@@ -329,23 +324,23 @@ class TemplateAPI(TemplateLM):
             )

     @cached_property
-    def eot_token_id(self) -> Optional[int]:
+    def eot_token_id(self) -> int | None:
         if self.tokenizer is None:
             return None
         else:
             if self.tokenizer_backend == "huggingface":
                 return self.tokenizer.eos_token_id
             elif self.tokenizer_backend == "tiktoken":
                 return self.tokenizer.eot_token

     @cached_property
-    def eos_string(self) -> Optional[str]:
+    def eos_string(self) -> str | None:
         if self._eos_string:
             return self._eos_string
         elif self.tokenizer is not None:
             if self.tokenizer_backend == "huggingface":
                 return self.tokenizer.eos_token
             elif self.tokenizer_backend == "tiktoken":
                 return self.tokenizer.decode([self.tokenizer.eot_token])
         else:
             eval_logger.warning(
...
@@ -354,7 +349,7 @@ class TemplateAPI(TemplateLM):
             return None

     @cached_property
-    def prefix_token_id(self) -> Optional[int]:
+    def prefix_token_id(self) -> int | None:
         if self.tokenizer is None:
             return None
         else:
...
@@ -364,24 +359,24 @@ class TemplateAPI(TemplateLM):
             if self.tokenizer.bos_token_id is not None:
                 return self.tokenizer.bos_token_id
             return self.tokenizer.eos_token_id
         else:
             return self.tokenizer.eot_token

     def tok_encode(
         self,
         string: str,
-        left_truncate_len: int = None,
+        left_truncate_len: int | None = None,
         add_special_tokens: bool = False,
         truncation: bool = False,
         **kwargs,
-    ) -> Union[List[List[int]], List[int], List[str]]:
+    ) -> list[list[int]] | list[int] | list[str]:
         if self.tokenizer_backend is None:
             return [string]
         elif self.tokenizer_backend == "huggingface":
             # by default for CausalLM - false or self.add_bos_token is set
             if not add_special_tokens:
                 add_special_tokens = False or self.add_bos_token
-            encoding: Union[List[List[int]], List[int]] = self.tokenizer(
+            encoding: list[list[int]] | list[int] = self.tokenizer(
                 string,
                 add_special_tokens=add_special_tokens,
                 truncation=truncation,
...
@@ -404,20 +399,20 @@ class TemplateAPI(TemplateLM):
             encoding = self.tokenizer.encode_batch(string)
             return encoding

-    def decode_batch(self, tokens: List[List[int]]) -> List[str]:
+    def decode_batch(self, tokens: list[list[int]]) -> list[str] | None:
         if self.tokenizer_backend == "huggingface":
             return self.tokenizer.batch_decode(tokens)
         elif self.tokenizer_backend == "tiktoken":
             return self.tokenizer.decode_batch(tokens)

     def model_call(
         self,
-        messages: Union[List[List[int]], List[str], List[JsonChatStr]],
+        messages: list[list[int]] | list[str] | list[JsonChatStr],
         *,
         generate: bool = True,
-        gen_kwargs: Optional[Dict] = None,
+        gen_kwargs: dict | None = None,
         **kwargs,
-    ) -> Optional[dict]:
+    ) -> dict | None:
         # !!! Copy: shared dict for each request, need new object !!!
         gen_kwargs = copy.deepcopy(gen_kwargs)
         try:
...
@@ -441,7 +436,7 @@ class TemplateAPI(TemplateLM):
                 response.raise_for_status()
                 return response.json()
         except RetryError:
-            eval_logger.error(
+            eval_logger.exception(
                 "API request failed after multiple retries. Please check the API status."
             )
             return None
...
@@ -450,14 +445,14 @@ class TemplateAPI(TemplateLM):
         self,
         session: ClientSession,
         sem: asyncio.Semaphore,
-        messages: Union[List[List[int]], List[str], List[JsonChatStr]],
+        messages: list[list[int]] | list[str] | list[JsonChatStr],
         *,
         generate: bool = True,
-        cache_keys: list = None,
-        ctxlens: Optional[List[int]] = None,
-        gen_kwargs: Optional[Dict] = None,
+        cache_keys: list | None = None,
+        ctxlens: list[int] | None = None,
+        gen_kwargs: dict | None = None,
         **kwargs,
-    ) -> Union[List[str], List[Tuple[float, bool]], None]:
+    ) -> list[str] | list[tuple[float, bool]] | None:
         # !!! Copy: shared dict for each request, need new object !!!
         gen_kwargs = copy.deepcopy(gen_kwargs)
         payload = self._create_payload(
...
@@ -508,8 +503,8 @@ class TemplateAPI(TemplateLM):
             sem.release()

     def batch_loglikelihood_requests(
-        self, chunks: Iterable[List[LogLikelihoodInputs]]
-    ) -> Tuple[List[List[int]], List[int], List[Tuple[str, str]]]:
+        self, chunks: Iterable[list[LogLikelihoodInputs]]
+    ) -> tuple[list[list[int]], list[int], list[tuple[str, str]]]:
         inputs = []
         ctxlens = []
         cache_keys = []
...
@@ -536,9 +531,9 @@ class TemplateAPI(TemplateLM):
         cache_keys: list,
         *,
         generate: bool = True,
-        ctxlens: List[int] = None,
+        ctxlens: list[int] | None = None,
         **kwargs,
-    ) -> Union[List[List[str]], List[List[Tuple[float, bool]]]]:
+    ) -> list[list[str]] | list[list[tuple[float, bool]]]:
         ctxlens = ctxlens if ctxlens else [None] * len(requests)
         conn = TCPConnector(limit=self._concurrent, ssl=self.verify_certificate)
         sem = asyncio.Semaphore(self._concurrent)
...
@@ -575,14 +570,14 @@ class TemplateAPI(TemplateLM):
             return await tqdm_asyncio.gather(*tasks, desc="Requesting API")

-    def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
+    def _loglikelihood_tokens(self, requests, **kwargs) -> list[tuple[float, bool]]:
         assert self.tokenizer is not None, (
             "Tokenizer is required for loglikelihood tasks to compute context lengths."
         )
         res = []

         def _collate(req: LogLikelihoodInputs):
-            """Defines the key for the sorted method"""
+            """Defines the key for the sorted method."""
             # the negative sign on len(toks) sorts descending - this has a few advantages:
             # - time estimates will always be over not underestimates, which is more useful for planning
             # - to know the size of a batch when going through the list, you know the first one is always the batch
...
@@ -639,8 +634,8 @@ class TemplateAPI(TemplateLM):
         return re_ord.get_original(res)

     def generate_until(
-        self, requests: List[Instance], disable_tqdm: bool = False
-    ) -> List[str]:
+        self, requests: list[Instance], disable_tqdm: bool = False
+    ) -> list[str]:
         res = []

         def _collate_gen(_requests):
...
@@ -773,8 +768,8 @@ class TemplateAPI(TemplateLM):
         return re_ord.get_original(res)

     def loglikelihood_rolling(
-        self, requests: List[Instance], disable_tqdm: bool = False
-    ) -> List[float]:
+        self, requests: list[Instance], disable_tqdm: bool = False
+    ) -> list[float]:
         loglikelihoods = []

         for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
...
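One side effect of deferred annotations shows up in the import hunks above: names that are only ever used inside annotations (`Awaitable`, `Iterable`, `Image`, `Instance`) can live in an `if TYPE_CHECKING:` block, so they cost nothing at import time and cannot create circular imports, while type checkers still resolve them. A minimal sketch of that arrangement, assuming the same module layout as the diff (the function itself is illustrative, not repo code):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # imported only for the type checker; never executed at runtime
    from collections.abc import Iterable

    from lm_eval.api.instance import Instance


def count_requests(requests: Iterable[Instance]) -> int:
    # the annotation is a string at runtime, so the TYPE_CHECKING-only imports
    # are sufficient for mypy/pyright and free for ordinary execution
    return sum(1 for _ in requests)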
lm_eval/models/huggingface.py

+from __future__ import annotations
+
 import copy
 import logging
 import os
 from datetime import timedelta
 from pathlib import Path
-from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Literal

 import jinja2
 import torch
...
@@ -40,7 +42,7 @@ from lm_eval.models.utils import (
 if TYPE_CHECKING:
-    from transformers.quantizers import AutoQuantizationConfig
+    from transformers.quantizers.auto import AutoQuantizationConfig

 eval_logger = logging.getLogger(__name__)
...
@@ -59,46 +61,43 @@ class HFLM(TemplateLM):
     def __init__(
         self,
-        pretrained: Union[str, transformers.PreTrainedModel],
+        pretrained: str | transformers.PreTrainedModel,
         backend: Literal["default", "causal", "seq2seq"] = "default",
         # override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
-        revision: Optional[str] = "main",
+        revision: str | None = "main",
         subfolder: str = "",
-        tokenizer: Optional[
-            Union[
-                str,
-                transformers.PreTrainedTokenizer,
-                transformers.PreTrainedTokenizerFast,
-            ]
-        ] = None,
-        truncation: Optional[bool] = False,
+        tokenizer: str
+        | transformers.PreTrainedTokenizer
+        | transformers.PreTrainedTokenizerFast
+        | None = None,
+        truncation: bool | None = False,
         logits_cache: bool = True,
-        max_length: Optional[int] = None,
-        device: Optional[str] = "cuda",
-        dtype: Optional[Union[str, torch.dtype]] = "auto",
-        softmax_dtype: Optional[Union[str, torch.dtype]] = None,
-        mixed_precision_dtype: Optional[Union[str, torch.dtype]] = None,
-        batch_size: Optional[Union[int, str]] = 1,
-        max_batch_size: Optional[int] = 64,
-        trust_remote_code: Optional[bool] = False,
-        use_fast_tokenizer: Optional[bool] = True,
-        add_bos_token: Optional[bool] = False,
-        prefix_token_id: Optional[int] = None,
+        max_length: int | None = None,
+        device: str | None = "cuda",
+        dtype: str | torch.dtype | None = "auto",
+        softmax_dtype: str | torch.dtype | None = None,
+        mixed_precision_dtype: str | torch.dtype | None = None,
+        batch_size: int | str | None = 1,
+        max_batch_size: int | None = 64,
+        trust_remote_code: bool | None = False,
+        use_fast_tokenizer: bool | None = True,
+        add_bos_token: bool | None = False,
+        prefix_token_id: int | None = None,
         # arguments used for splitting a model across GPUs naively.
         # only used if `parallelize=True`.
-        parallelize: Optional[bool] = False,
-        max_memory_per_gpu: Optional[Union[int, str]] = None,
-        max_cpu_memory: Optional[Union[int, str]] = None,
-        offload_folder: Optional[Union[str, os.PathLike]] = "./offload",
+        parallelize: bool | None = False,
+        max_memory_per_gpu: int | str | None = None,
+        max_cpu_memory: int | str | None = None,
+        offload_folder: str | os.PathLike | None = "./offload",
         # PEFT, delta weights and quantization options
-        peft: Optional[str] = None,
-        delta: Optional[str] = None,
-        autogptq: Optional[Union[bool, str]] = False,
-        gptqmodel: Optional[bool] = False,
-        gguf_file: Optional[str] = None,
+        peft: str | None = None,
+        delta: str | None = None,
+        autogptq: bool | str | None = False,
+        gptqmodel: bool | None = False,
+        gguf_file: str | None = None,
         # end token for thinking, either the string or int token id.
         # splits to get response after this token (if provided).
-        think_end_token: Union[str, int, None] = None,
+        think_end_token: str | int | None = None,
         **kwargs,
     ) -> None:
         super().__init__()
...
@@ -271,9 +270,10 @@ class HFLM(TemplateLM):
             self.batch_size_per_gpu = int(batch_size)

         if isinstance(pretrained, str):
-            if gpus >= 1 or str(self.device) == "mps":
+            if (gpus >= 1 or str(self.device) == "mps") and not (
+                parallelize or autogptq or hasattr(self, "accelerator")
+            ):
                 # TODO: can remove this whole snippet except in the mps case, perhaps?
-                if not (parallelize or autogptq or hasattr(self, "accelerator")):
                 # place model onto device requested manually,
                 # if not using HF Accelerate or device_map
                 # or any other option that preloads model onto device
...
@@ -327,12 +327,12 @@ class HFLM(TemplateLM):
     def _get_accelerate_args(
         self,
-        parallelize: Optional[bool] = None,
-        device_map: Optional[str] = "auto",
-        max_memory_per_gpu: Optional[Union[int, str]] = None,
-        max_cpu_memory: Optional[Union[int, str]] = None,
-        offload_folder: Optional[str] = "./offload",
-        gpus: Optional[int] = None,
+        parallelize: bool | None = None,
+        device_map: str | None = "auto",
+        max_memory_per_gpu: int | str | None = None,
+        max_cpu_memory: int | str | None = None,
+        offload_folder: str | None = "./offload",
+        gpus: int | None = None,
     ) -> dict:
         """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
         num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
...
@@ -480,9 +480,9 @@ class HFLM(TemplateLM):
     def _get_backend(
         self,
-        config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
+        config: transformers.PretrainedConfig | transformers.AutoConfig,
         backend: Literal["default", "causal", "seq2seq"] = "default",
-        trust_remote_code: Optional[bool] = False,
+        trust_remote_code: bool | None = False,
     ) -> None:
         """
         Helper method during initialization.
...
@@ -497,27 +497,20 @@ class HFLM(TemplateLM):
         if backend != "default":
             # if we've settled on non-default backend, use that manually
-            if backend == "causal":
-                self.backend = backend
-            elif backend == "seq2seq":
+            if backend in ["causal", "seq2seq"]:
                 self.backend = backend
             eval_logger.info(
                 f"Overrode HF model backend type, and using type '{self.backend}'"
             )
         else:
             # determine and use the default HF backend for this model, based on its config + metadata.
-            if (
-                getattr(config, "model_type")
-                in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
-            ):
+            if self.config.model_type in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES:
                 # first check if model type is listed under seq2seq models, since some
                 # models like MBart are listed in both seq2seq and causal mistakenly in HF transformers.
                 # these special cases should be treated as seq2seq models.
                 self.backend = "seq2seq"
                 eval_logger.debug(f"Using model type '{self.backend}'")
-            elif (
-                getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
-            ):
+            elif self.config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
                 self.backend = "causal"
                 eval_logger.debug(f"Using model type '{self.backend}'")
             else:
...
@@ -545,7 +538,7 @@ class HFLM(TemplateLM):
         pretrained: str,
         revision: str = "main",
         trust_remote_code: bool = False,
-        gguf_file: Optional[str] = None,
+        gguf_file: str | None = None,
         subfolder: str = "",
     ) -> None:
         """Return the model config for HuggingFace models"""
...
@@ -560,24 +553,24 @@ class HFLM(TemplateLM):
     def _create_model(
         self,
         pretrained: str,
-        revision: Optional[str] = "main",
-        dtype: Optional[Union[str, torch.dtype]] = "auto",
-        trust_remote_code: Optional[bool] = False,
+        revision: str | None = "main",
+        dtype: str | torch.dtype | None = "auto",
+        trust_remote_code: bool | None = False,
         # arguments used for splitting a model across GPUs naively.
         # only used if `parallelize=True`.
         # (accelerate naive PP (device_map) options)
-        parallelize: Optional[bool] = False,
-        gpus: Optional[int] = None,
-        max_memory_per_gpu: Optional[Union[int, str]] = None,
-        max_cpu_memory: Optional[Union[int, str]] = None,
-        offload_folder: Optional[str] = "./offload",
+        parallelize: bool | None = False,
+        gpus: int | None = None,
+        max_memory_per_gpu: int | str | None = None,
+        max_cpu_memory: int | str | None = None,
+        offload_folder: str | None = "./offload",
         # PEFT, delta weights and quantization options
-        peft: Optional[str] = None,
-        delta: Optional[str] = None,
-        autogptq: Optional[Union[bool, str]] = False,
-        gptqmodel: Optional[bool] = False,
-        gguf_file: Optional[str] = None,
-        quantization_config: Optional["AutoQuantizationConfig"] = None,
+        peft: str | None = None,
+        delta: str | None = None,
+        autogptq: bool | str | None = False,
+        gptqmodel: bool | None = False,
+        gguf_file: str | None = None,
+        quantization_config: AutoQuantizationConfig | None = None,
         subfolder: str = "",
         **kwargs,
     ) -> None:
...
@@ -598,7 +591,7 @@ class HFLM(TemplateLM):
             model_kwargs.update(
                 self._get_accelerate_args(
                     parallelize=parallelize,
-                    device_map=kwargs.get("device_map", None),
+                    device_map=kwargs.get("device_map"),
                     max_memory_per_gpu=max_memory_per_gpu,
                     max_cpu_memory=max_cpu_memory,
                     offload_folder=offload_folder,
...
@@ -611,12 +604,11 @@ class HFLM(TemplateLM):
                 assert transformers.__version__ >= "4.30.0", (
                     "load_in_4bit requires transformers >= 4.30.0"
                 )
-            if transformers.__version__ >= "4.30.0":
-                if model_kwargs.get("load_in_4bit", None):
-                    if model_kwargs.get("bnb_4bit_compute_dtype", None):
-                        model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(
-                            model_kwargs["bnb_4bit_compute_dtype"]
-                        )
+            if transformers.__version__ >= "4.30.0" and (
+                model_kwargs.get("load_in_4bit")
+                and (compute_dtype := model_kwargs.get("bnb_4bit_compute_dtype"))
+            ):
+                model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(compute_dtype)
         self._model = self.AUTO_MODEL_CLASS.from_pretrained(
             pretrained,
...
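The `load_in_4bit` hunk above folds three nested `if` statements into a single condition and uses the walrus operator (`:=`) to capture the looked-up `bnb_4bit_compute_dtype` for reuse in the body. A standalone sketch of the same pattern with a hypothetical config dict (`normalize` stands in for the repo's `get_dtype` helper):

config = {"load_in_4bit": True, "bnb_4bit_compute_dtype": "BFloat16"}


def normalize(name: str) -> str:
    # stand-in for get_dtype(); just canonicalizes the spelling here
    return name.lower()


# before:
#   if config.get("load_in_4bit"):
#       if config.get("bnb_4bit_compute_dtype"):
#           config["bnb_4bit_compute_dtype"] = normalize(config["bnb_4bit_compute_dtype"])

# after: one condition; := binds the value so the body avoids a second lookup
if config.get("load_in_4bit") and (compute_dtype := config.get("bnb_4bit_compute_dtype")):
    config["bnb_4bit_compute_dtype"] = normalize(compute_dtype)

print(config["bnb_4bit_compute_dtype"])  # bfloat16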
@@ -641,7 +633,7 @@ class HFLM(TemplateLM):
                 raise type(exception)(
                     "Tried to load auto_gptq, but auto-gptq is not installed ",
                     "please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
-                )
+                ) from exception

             self._model = AutoGPTQForCausalLM.from_quantized(
                 pretrained,
...
@@ -660,7 +652,7 @@ class HFLM(TemplateLM):
                 raise type(exception)(
                     "Tried to load gptqmodel, but gptqmodel is not installed ",
                     "please install gptqmodel via `pip install gptqmodel --no-build-isolation` or `pip install lm-eval[gptqmodel] --no-build-isolation`",
-                )
+                ) from exception

             self._model = GPTQModel.from_quantized(
                 pretrained, trust_remote_code=trust_remote_code, **model_kwargs
...
@@ -672,11 +664,11 @@ class HFLM(TemplateLM):
             )

         if peft:
-            from peft import PeftModel
-            from peft import __version__ as PEFT_VERSION
+            from peft import PeftModel, __version__ as PEFT_VERSION

-            if model_kwargs.get("load_in_4bit", None):
-                if version.parse(PEFT_VERSION) < version.parse("0.4.0"):
-                    raise AssertionError("load_in_4bit requires peft >= 0.4.0")
+            if model_kwargs.get("load_in_4bit") and version.parse(
+                PEFT_VERSION
+            ) < version.parse("0.4.0"):
+                raise AssertionError("load_in_4bit requires peft >= 0.4.0")
             if self._model.config.vocab_size != len(self.tokenizer):
                 # resize model for LoRAs with added tokens
...
@@ -703,11 +695,13 @@ class HFLM(TemplateLM):
                     try:
                         param.data += _model_delta.state_dict()[name]
                     except KeyError:
-                        raise KeyError(f"Delta model is missing weights for layer: {name}")
+                        raise KeyError(
+                            f"Delta model is missing weights for layer: {name}"
+                        ) from None
                     except Exception as e:
                         raise RuntimeError(
                             f"Failed to add delta weights to layer {name}. Error: {e}"
-                        )
+                        ) from e

             del _model_delta
...
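The `raise ... from exception`, `from None`, and `from e` additions in the hunks above make exception chaining explicit: `from e` (or `from exception`) records the original error as `__cause__` so the full traceback survives, while `from None` suppresses the implicit "During handling of the above exception, another exception occurred" context. A small illustrative sketch (not repo code) of both behaviours:

weights = {"encoder.bias": 0.1}


def delta_for(name: str) -> float:
    try:
        return weights[name]
    except KeyError:
        # from None: the caller sees only this message, not the bare KeyError context
        raise KeyError(f"Delta model is missing weights for layer: {name}") from None
    except Exception as e:
        # from e: keeps the original exception attached as __cause__ for debugging
        raise RuntimeError(f"Failed to add delta weights to layer {name}. Error: {e}") from e


try:
    delta_for("decoder.bias")
except KeyError as err:
    print(err.__suppress_context__, err.__cause__)  # True None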
@@ -715,20 +709,17 @@ class HFLM(TemplateLM):
     def _create_tokenizer(
         self,
-        pretrained: Union[str, transformers.PreTrainedModel],
-        tokenizer: Optional[
-            Union[
-                str,
-                transformers.PreTrainedTokenizer,
-                transformers.PreTrainedTokenizerFast,
-            ]
-        ],
-        revision: Optional[str] = "main",
-        trust_remote_code: Optional[bool] = False,
-        use_fast_tokenizer: Optional[bool] = True,
-        gguf_file: Optional[str] = None,
-        add_bos_token: Optional[bool] = False,
-        subfolder: Optional[str] = "",
+        pretrained: str | transformers.PreTrainedModel,
+        tokenizer: str
+        | transformers.PreTrainedTokenizer
+        | transformers.PreTrainedTokenizerFast
+        | None,
+        revision: str | None = "main",
+        trust_remote_code: bool | None = False,
+        use_fast_tokenizer: bool | None = True,
+        gguf_file: str | None = None,
+        add_bos_token: bool | None = False,
+        subfolder: str | None = "",
     ) -> None:
         """
         Helper method during initialization.
...
@@ -760,8 +751,12 @@ class HFLM(TemplateLM):
                 )
             else:
-                assert isinstance(
-                    tokenizer, transformers.PreTrainedTokenizer
-                ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
+                assert isinstance(
+                    tokenizer,
+                    (
+                        transformers.PreTrainedTokenizer,
+                        transformers.PreTrainedTokenizerFast,
+                    ),
+                )
                 self.tokenizer = tokenizer
         else:
             # Get tokenizer based on 'pretrained'
...
@@ -838,7 +833,7 @@ class HFLM(TemplateLM):
     def tok_encode(
         self, string: str, left_truncate_len=None, add_special_tokens=None
-    ) -> List[int]:
+    ) -> list[int]:
         """ """
         # default for None - empty dict, use predefined tokenizer param
         # used for all models except for CausalLM or predefined value
...
@@ -864,11 +859,11 @@ class HFLM(TemplateLM):
     def tok_batch_encode(
         self,
-        strings: List[str],
+        strings: list[str],
         padding_side: str = "left",
         left_truncate_len: int = None,
         truncation: bool = False,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
         old_padding_side = self.tokenizer.padding_side
         self.tokenizer.padding_side = padding_side
...
@@ -917,15 +912,17 @@ class HFLM(TemplateLM):
             A torch tensor of shape [batch, sequence, vocab] with the
         logits returned from the model's decoder
         """
-        with torch.no_grad():
-            with torch.autocast(
+        with (
+            torch.no_grad(),
+            torch.autocast(
                 device_type=self.device.type,
                 dtype=self.mixed_precision_dtype,
                 enabled=self.mixed_precision_dtype is not None,
-            ):
+            ),
+        ):
             if attn_mask is not None or labels is not None:
                 assert attn_mask is not None and labels is not None
-                assert self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM
+                assert transformers.AutoModelForSeq2SeqLM == self.AUTO_MODEL_CLASS
                 return self.model(
                     input_ids=inps, attention_mask=attn_mask, labels=labels
                 ).logits
...
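The `_model_call` hunk replaces the nested `with torch.no_grad(): with torch.autocast(...):` pair with a single parenthesized `with` statement holding both context managers, a form supported from Python 3.10. A minimal runnable sketch of the same construction (the toy model and dtype choices here are assumptions, not the repo's):

import torch

model = torch.nn.Linear(4, 2)
inps = torch.randn(3, 4)

with (
    torch.no_grad(),  # no gradient tracking for the forward pass
    torch.autocast(device_type="cpu", dtype=torch.bfloat16, enabled=True),
):
    logits = model(inps)

print(logits.shape)  # torch.Size([3, 2])

The unparenthesized form `with a, b:` has always been legal; the parentheses only add the ability to split the managers across lines, which is what the diff uses.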
@@ -942,7 +939,7 @@ class HFLM(TemplateLM):
             # remove temperature, as do_sample=False takes care of this
             # and we don't want a warning from HF
             generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
-            do_sample = generation_kwargs.get("do_sample", None)
+            do_sample = generation_kwargs.get("do_sample")

             # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
             if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
...
@@ -989,8 +986,8 @@ class HFLM(TemplateLM):
         return logits

     def loglikelihood_rolling(
-        self, requests: List[Instance], disable_tqdm: bool = False
-    ) -> List[float]:
+        self, requests: list[Instance], disable_tqdm: bool = False
+    ) -> list[float]:
         adaptive_batch_size = None
         if self.batch_size == "auto":
             # using rolling window with maximum context
...
@@ -1009,7 +1006,7 @@ class HFLM(TemplateLM):
                 disable=(disable_tqdm or (self.rank != 0)),
             )
         ):
-            rolling_token_windows: List[Tuple[List[int], List[int]]] = list(
+            rolling_token_windows: list[tuple[list[int], list[int]]] = list(
                 map(
                     utils.make_disjoint_window,
                     utils.get_rolling_token_windows(
...
@@ -1093,14 +1090,14 @@ class HFLM(TemplateLM):
     def _loglikelihood_tokens(
         self,
-        requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
+        requests: list[tuple[tuple[str, str], list[int], list[int]]],
         disable_tqdm: bool = False,
         override_bs: int = None,
-    ) -> List[Tuple[float, bool]]:
+    ) -> list[tuple[float, bool]]:
         # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
         res = []

-        def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]):
+        def _collate(req: tuple[tuple[str, str], list[int], list[int]]):
             """Defines the key for the sorted method"""
             # the negative sign on len(toks) sorts descending - this has a few advantages:
             # - time estimates will always be over not underestimates, which is more useful for planning
...
@@ -1112,7 +1109,7 @@ class HFLM(TemplateLM):
             toks = req[1] + req[2]
             return -len(toks), tuple(toks)

-        def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]):
+        def _lookup_one_token_cont(req: tuple[tuple[str, str], list[int], list[int]]):
             """Defines the key to group and lookup one-token continuations"""
             # Use with group_by="contexts" (optional)"
             # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
...
@@ -1286,7 +1283,7 @@ class HFLM(TemplateLM):
             # original args. Otherwise, expands the logits batch dimension and yields each
             # batch along with matching continuation tokens and prompt strings.
             # logits -> [1, seq, vocab]
-            for request_str, cont_toks, logits in re_ord.get_cache(
+            for request_str, cont_toks, logits in re_ord.get_cache(  # noqa
                 req_str=request_str,
                 cxt_toks=ctx_tokens,
                 cont_toks=cont_toks,
...
@@ -1327,11 +1324,11 @@ class HFLM(TemplateLM):
         return re_ord.get_original(res)

     def generate_until(
-        self, requests: List[Instance], disable_tqdm: bool = False
-    ) -> List[str]:
+        self, requests: list[Instance], disable_tqdm: bool = False
+    ) -> list[str]:
         res = []

-        def _collate(req: Tuple[str, dict]):
+        def _collate(req: tuple[str, dict]):
             """Defines the key for the sorted method"""
             # the negative sign on len(toks) sorts descending - this has a few advantages:
             # - time estimates will always be over not underestimates, which is more useful for planning
...
@@ -1394,7 +1391,7 @@ class HFLM(TemplateLM):
                 raise ValueError(
                     f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                 )
-            if "max_gen_toks" in kwargs.keys():
+            if "max_gen_toks" in kwargs:
                 max_gen_toks = kwargs.pop("max_gen_toks")
             else:
                 max_gen_toks = self.max_gen_toks
...
@@ -1472,7 +1469,7 @@ class HFLM(TemplateLM):
         return res

     def apply_chat_template(
-        self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
+        self, chat_history: list[dict[str, str]], add_generation_prompt: bool = True
     ) -> str:
         """
         Method to apply a chat template to a list of chat history between user and model.
...
lm_eval/models/openai_completions.py

+from __future__ import annotations
+
 import logging
 import os
 from functools import cached_property
 from operator import itemgetter
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any

 from lm_eval.api.registry import register_model
 from lm_eval.models.api_models import TemplateAPI
...
@@ -26,9 +28,9 @@ class LocalCompletionsAPI(TemplateAPI):
     def _create_payload(
         self,
-        messages: Union[List[List[int]], List[dict], List[str], str],
+        messages: list[list[int]] | list[dict] | list[str] | str,
         generate=False,
-        gen_kwargs: Optional[dict] = None,
+        gen_kwargs: dict | None = None,
         seed: int = 1234,
         eos=None,
         **kwargs,
...
@@ -50,7 +52,6 @@ class LocalCompletionsAPI(TemplateAPI):
                 "seed": seed,
                 **gen_kwargs,
             }
-        else:
             return {
                 "model": self.model,
                 "prompt": messages,
...
@@ -63,11 +64,11 @@ class LocalCompletionsAPI(TemplateAPI):
     @staticmethod
     def parse_logprobs(
-        outputs: Union[Dict, List[Dict]],
-        tokens: List[List[int]] = None,
-        ctxlens: List[int] = None,
+        outputs: dict | list[dict],
+        tokens: list[list[int]] = None,
+        ctxlens: list[int] = None,
         **kwargs,
-    ) -> List[Tuple[float, bool]]:
+    ) -> list[tuple[float, bool]]:
         res = []
         if not isinstance(outputs, list):
             outputs = [outputs]
...
@@ -88,7 +89,7 @@ class LocalCompletionsAPI(TemplateAPI):
         return res

     @staticmethod
-    def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
+    def parse_generations(outputs: dict | list[dict], **kwargs) -> list[str]:
         res = []
         if not isinstance(outputs, list):
             outputs = [outputs]
...
@@ -130,9 +131,9 @@ class LocalChatCompletion(LocalCompletionsAPI):
     def _create_payload(
         self,
-        messages: List[Dict],
+        messages: list[dict],
         generate=False,
-        gen_kwargs: dict = None,
+        gen_kwargs: dict | None = None,
         seed=1234,
         eos=None,
         **kwargs,
...
@@ -160,7 +161,7 @@ class LocalChatCompletion(LocalCompletionsAPI):
         }

     @staticmethod
-    def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
+    def parse_generations(outputs: dict | list[dict], **kwargs) -> list[str]:
         res = []
         if not isinstance(outputs, list):
             outputs = [outputs]
...
@@ -173,11 +174,11 @@ class LocalChatCompletion(LocalCompletionsAPI):
     def tok_encode(
         self,
-        string: Union[str, Any],
+        string: str | Any,
         left_truncate_len=None,
         add_special_tokens=None,
         **kwargs,
-    ) -> Union[List[str], List[int], Any]:
+    ) -> list[str] | list[int] | Any:
         return string

     def loglikelihood(self, requests, **kwargs):
...
@@ -219,7 +220,7 @@ class OpenAICompletionsAPI(LocalCompletionsAPI):
         )
         return super().loglikelihood(requests, **kwargs)

-    def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
+    def chat_template(self, chat_template: bool | str = False) -> str | None:
         return ""
...
@@ -261,7 +262,7 @@ class OpenAIChatCompletion(LocalChatCompletion):
     def _create_payload(
         self,
-        messages: List[Dict],
+        messages: list[dict],
         generate=False,
         gen_kwargs: dict = None,
         seed=1234,
...
lm_eval/models/vllm_causallms.py
View file @
1f97a945
from
__future__
import
annotations
import
copy
import
copy
import
gc
import
gc
import
inspect
import
inspect
...
@@ -8,7 +10,7 @@ from importlib.util import find_spec
...
@@ -8,7 +10,7 @@ from importlib.util import find_spec
from
multiprocessing
import
Process
,
Queue
from
multiprocessing
import
Process
,
Queue
from
queue
import
Empty
from
queue
import
Empty
from
time
import
sleep
from
time
import
sleep
from
typing
import
TYPE_CHECKING
,
Dict
,
List
,
Literal
,
Optional
,
Tuple
,
Union
from
typing
import
TYPE_CHECKING
,
Literal
import
jinja2
import
jinja2
from
more_itertools
import
distribute
from
more_itertools
import
distribute
...
@@ -51,10 +53,10 @@ eval_logger = logging.getLogger(__name__)
...
@@ -51,10 +53,10 @@ eval_logger = logging.getLogger(__name__)
def
_vllm_mp_worker
(
def
_vllm_mp_worker
(
model_args
:
dict
,
model_args
:
dict
,
sampling_params
:
"
SamplingParams
"
,
sampling_params
:
SamplingParams
,
requests
:
list
[
list
[
int
]],
requests
:
list
[
list
[
int
]],
lora_request
:
"
LoRARequest
"
,
lora_request
:
LoRARequest
,
result_queue
:
"
Queue
"
,
result_queue
:
Queue
,
dp_size
:
int
,
dp_size
:
int
,
local_dp_rank
:
int
,
local_dp_rank
:
int
,
dp_master_port
:
int
,
dp_master_port
:
int
,
...
@@ -114,30 +116,30 @@ class VLLM(TemplateLM):
...
@@ -114,30 +116,30 @@ class VLLM(TemplateLM):
self
,
self
,
pretrained
:
str
,
pretrained
:
str
,
dtype
:
Literal
[
"float16"
,
"bfloat16"
,
"float32"
,
"auto"
]
=
"auto"
,
dtype
:
Literal
[
"float16"
,
"bfloat16"
,
"float32"
,
"auto"
]
=
"auto"
,
revision
:
Optional
[
str
]
=
None
,
revision
:
str
|
None
=
None
,
trust_remote_code
:
Optional
[
bool
]
=
False
,
trust_remote_code
:
bool
|
None
=
False
,
tokenizer
:
Optional
[
str
]
=
None
,
tokenizer
:
str
|
None
=
None
,
tokenizer_mode
:
Literal
[
"auto"
,
"slow"
]
=
"auto"
,
tokenizer_mode
:
Literal
[
"auto"
,
"slow"
]
=
"auto"
,
tokenizer_revision
:
Optional
[
str
]
=
None
,
tokenizer_revision
:
str
|
None
=
None
,
add_bos_token
:
Optional
[
bool
]
=
False
,
add_bos_token
:
bool
|
None
=
False
,
prefix_token_id
:
Optional
[
int
]
=
None
,
prefix_token_id
:
int
|
None
=
None
,
tensor_parallel_size
:
int
=
1
,
tensor_parallel_size
:
int
=
1
,
quantization
:
Optional
[
str
]
=
None
,
quantization
:
str
|
None
=
None
,
max_gen_toks
:
int
=
256
,
max_gen_toks
:
int
=
256
,
swap_space
:
int
=
4
,
swap_space
:
int
=
4
,
batch_size
:
Union
[
str
,
int
]
=
1
,
batch_size
:
str
|
int
=
1
,
max_batch_size
=
None
,
max_batch_size
:
int
|
None
=
None
,
max_length
:
int
=
None
,
max_length
:
int
|
None
=
None
,
max_model_len
:
int
=
None
,
max_model_len
:
int
|
None
=
None
,
seed
:
int
=
1234
,
seed
:
int
=
1234
,
gpu_memory_utilization
:
float
=
0.9
,
gpu_memory_utilization
:
float
=
0.9
,
device
:
str
=
"cuda"
,
device
:
str
=
"cuda"
,
data_parallel_size
:
int
=
1
,
data_parallel_size
:
int
=
1
,
lora_local_path
:
str
=
None
,
lora_local_path
:
str
|
None
=
None
,
# VLLM: enable thinking tags in the prompt.
# VLLM: enable thinking tags in the prompt.
enable_thinking
:
bool
=
True
,
enable_thinking
:
bool
=
True
,
# End marker for thinking tags - splits to get response after this token (if provided).
# End marker for thinking tags - splits to get response after this token (if provided).
think_end_token
:
Optional
[
str
]
=
None
,
think_end_token
:
str
|
None
=
None
,
max_lora_rank
:
int
=
16
,
max_lora_rank
:
int
=
16
,
**
kwargs
,
**
kwargs
,
):
):
...
@@ -173,7 +175,7 @@ class VLLM(TemplateLM):
...
@@ -173,7 +175,7 @@ class VLLM(TemplateLM):
"quantization"
:
quantization
,
"quantization"
:
quantization
,
"seed"
:
int
(
seed
),
"seed"
:
int
(
seed
),
"device"
:
str
(
device
),
"device"
:
str
(
device
),
"enable_lora"
:
True
if
lora_local_path
else
False
,
"enable_lora"
:
bool
(
lora_local_path
)
,
"max_lora_rank"
:
int
(
max_lora_rank
),
"max_lora_rank"
:
int
(
max_lora_rank
),
}
}
self
.
model_args
.
update
(
kwargs
)
self
.
model_args
.
update
(
kwargs
)
...
@@ -304,7 +306,7 @@ class VLLM(TemplateLM):
         return self._max_gen_toks
     def apply_chat_template(
-        self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
+        self, chat_history: list[dict[str, str]], add_generation_prompt: bool = True
     ) -> str:
         """
         Method to apply a chat template to a list of chat history between user and model.
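List[Dict[str, str]] becomes the builtin-generic spelling list[dict[str, str]] (PEP 585). A small illustrative sketch of a chat-history payload with that shape; render() is hypothetical and not the harness's template logic:

# Illustrative: builtin generics (PEP 585) for a chat-history payload shaped
# like the argument above.
from __future__ import annotations


def render(chat_history: list[dict[str, str]]) -> str:
    return "\n".join(f"{turn['role']}: {turn['content']}" for turn in chat_history)


history = [
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hi there"},
]
print(render(history))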
...
@@ -339,14 +341,14 @@ class VLLM(TemplateLM):
     def tok_encode(
         self,
-        string: Union[str, List[str]],
+        string: str | list[str],
         left_truncate_len: int = None,
         add_special_tokens: bool = False,
         truncation: bool = False,
-    ) -> Union[List[int], List[List[int]]]:
+    ) -> list[int] | list[list[int]]:
         if not add_special_tokens:
             add_special_tokens = False or self.add_bos_token
-        encoding: Union[List[List[int]], List[int]] = self.tokenizer(
+        encoding: list[list[int]] | list[int] = self.tokenizer(
             string,
             add_special_tokens=add_special_tokens,
             truncation=truncation,
...
@@ -364,10 +366,10 @@ class VLLM(TemplateLM):
     def _model_generate(
         self,
-        requests: List[List[int]] = None,
+        requests: list[list[int]] = None,
         generate: bool = False,
         max_tokens: int = None,
-        stop: Optional[List[str]] = None,
+        stop: list[str] | None = None,
         **kwargs,
     ):
         if generate:
...
@@ -385,7 +387,7 @@ class VLLM(TemplateLM):
        def run_inference_one_model(
            model_args: dict,
            sampling_params: SamplingParams,
-            requests: List[List[int]],
+            requests: list[list[int]],
            lora_request: LoRARequest,
        ):
            llm = LLM(**model_args)
...
@@ -454,7 +456,7 @@ class VLLM(TemplateLM):
                if dead_procs:
                    raise RuntimeError(
                        f"Worker processes {dead_procs} died unexpectedly"
-                    )
+                    ) from None
                continue
            results = [rank_res[i] for i in range(len(procs))]
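Appending `from None` to the re-raised RuntimeError suppresses implicit exception chaining, so the traceback reports only the worker failure rather than also printing the exception that was being handled. An illustrative sketch; the poll helper and the OSError are stand-ins, not the project's code:

# Illustrative stand-in: "from None" hides the exception that was being
# handled from the resulting traceback.
def poll(dead_procs):
    try:
        raise OSError("pipe closed")  # stand-in for a low-level worker failure
    except OSError:
        if dead_procs:
            raise RuntimeError(
                f"Worker processes {dead_procs} died unexpectedly"
            ) from None


try:
    poll([1, 3])
except RuntimeError as err:
    print(err)                       # Worker processes [1, 3] died unexpectedly
    print(err.__suppress_context__)  # True -> the OSError is omitted from the traceback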
...
@@ -481,14 +483,14 @@ class VLLM(TemplateLM):
         outputs = self.model.generate(
             prompt_token_ids=requests,
             sampling_params=sampling_params,
-            use_tqdm=True if self.batch_size == "auto" else False,
+            use_tqdm=self.batch_size == "auto",
             lora_request=self.lora_request,
         )
         return outputs
     def loglikelihood_rolling(
-        self, requests: List[Instance], disable_tqdm: bool = False
+        self, requests: list[Instance], disable_tqdm: bool = False
-    ) -> List[float]:
+    ) -> list[float]:
         adaptive_batch_size = None
         if self.batch_size == "auto":
             adaptive_batch_size = len(requests)
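use_tqdm is simplified because `self.batch_size == "auto"` already evaluates to a bool, so the surrounding ternary added nothing. A trivial illustrative check:

# Illustrative: the comparison already yields a bool, so the ternary is redundant.
for batch_size in ("auto", 8):
    assert (True if batch_size == "auto" else False) == (batch_size == "auto")
    print(batch_size, "->", batch_size == "auto")  # auto -> True, 8 -> False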
...
@@ -503,7 +505,7 @@ class VLLM(TemplateLM):
                 disable=(disable_tqdm or (self.rank != 0)),
             )
         ):
-            rolling_token_windows: List[Tuple[List[int], List[int]]] = list(
+            rolling_token_windows: list[tuple[list[int], list[int]]] = list(
                 map(
                     make_disjoint_window,
                     get_rolling_token_windows(
...
@@ -556,13 +558,13 @@ class VLLM(TemplateLM):
         return loglikelihoods
     def generate_until(
-        self, requests: List[Instance], disable_tqdm: bool = False
+        self, requests: list[Instance], disable_tqdm: bool = False
-    ) -> List[str]:
+    ) -> list[str]:
         res = []
         # batch tokenize contexts
         context, all_gen_kwargs = zip(*(req.args for req in requests))
-        context_encoding: List[List[int]] = self.tok_encode(
+        context_encoding: list[list[int]] = self.tok_encode(
             context, add_special_tokens=self.add_bos_token
         )
         requests = [
...
@@ -608,7 +610,7 @@ class VLLM(TemplateLM):
                     raise ValueError(
                         f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                     )
-                if "max_gen_toks" in kwargs.keys():
+                if "max_gen_toks" in kwargs:
                     max_gen_toks = kwargs.pop("max_gen_toks")
                 else:
                     max_gen_toks = self.max_gen_toks
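Membership tests on a dict already look at its keys, so `"max_gen_toks" in kwargs` is equivalent to the `.keys()` form it replaces. A short illustrative check with a hypothetical kwargs dict:

# Illustrative: "in" on a dict tests keys directly, so .keys() is unnecessary.
kwargs = {"max_gen_toks": 128, "temperature": 0.0}  # hypothetical generation kwargs
assert ("max_gen_toks" in kwargs) == ("max_gen_toks" in kwargs.keys())
max_gen_toks = kwargs.pop("max_gen_toks") if "max_gen_toks" in kwargs else 256
print(max_gen_toks)  # 128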
...
@@ -634,7 +636,7 @@ class VLLM(TemplateLM):
             )
             # cache generations
-            for output, context in zip(cont, context):
+            for output, context_ in zip(cont, context):
                 generated_text: str = output.outputs[0].text
                 # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
                 generated_text = postprocess_generated_text(
...
@@ -642,7 +644,7 @@ class VLLM(TemplateLM):
                 )
                 res.append(generated_text)
                 self.cache_hook.add_partial(
-                    "generate_until", (context, gen_kwargs), generated_text
+                    "generate_until", (context_, gen_kwargs), generated_text
                 )
                 pbar.update(1)
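The loop variable is renamed from context to context_ (and the cache call above updated to match) so it no longer rebinds the context sequence being zipped over. A small illustrative sketch of the hazard the rename avoids, with made-up data:

# Illustrative sketch of the shadowing hazard: reusing the name "context" as
# the loop variable would rebind the sequence being iterated.
cont = ["out-a", "out-b"]
context = ["prompt-a", "prompt-b"]

for output, context_ in zip(cont, context):
    # "context" still refers to the full list of prompts here; "context_" is
    # the single prompt paired with this output.
    print(output, context_, len(context))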
...
@@ -652,9 +654,9 @@ class VLLM(TemplateLM):
     def _loglikelihood_tokens(
         self,
-        requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
+        requests: list[tuple[tuple[str, str], list[int], list[int]]],
         disable_tqdm: bool = False,
-    ) -> List[Tuple[float, bool]]:
+    ) -> list[tuple[float, bool]]:
         res = []
         def _collate(x):
...
@@ -675,7 +677,7 @@ class VLLM(TemplateLM):
         for chunk in chunks:
             inputs = []
             ctxlens = []
-            for cache_key, context_enc, continuation_enc in chunk:
+            for _cache_key, context_enc, continuation_enc in chunk:
                 if (
                     full_length := len(context_enc + continuation_enc)
                 ) > self.max_length:
...
@@ -713,7 +715,7 @@ class VLLM(TemplateLM):
         return re_ord.get_original(res)
     @staticmethod
-    def _parse_logprobs(tokens: List, outputs, ctxlen: int) -> Tuple[float, bool]:
+    def _parse_logprobs(tokens: list, outputs, ctxlen: int) -> tuple[float, bool]:
         """Process logprobs and tokens.
         :param tokens: list
...