Unverified commit 904bba12 authored by Baber Abbasi, committed by GitHub

Pin datasets < 4.0.0 (#3172)

* Fix: pin datasets < 4.0

* fix

* update type hints in HF

* fix hellaswag path
parent 2eea3f50
......@@ -433,8 +433,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
# because it's already been determined based on the prior env var before launching our
# script--`datasets` gets imported by lm_eval internally before these lines can update the env.
import datasets
from packaging.version import parse as vparse
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
if vparse(datasets.__version__) < vparse("4.0.0"):
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
if isinstance(args.model_args, dict):
args.model_args["trust_remote_code"] = True
......
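Reviewer note: a minimal, standalone sketch of the version gate this hunk introduces, assuming only that `datasets` and `packaging` are installed. The global trust-remote-code switch is applied only when running under `datasets` < 4.0.0, where it still exists.

# Sketch of the gate above: set the flag only on datasets < 4.0.0.
import datasets
from packaging.version import parse as vparse

if vparse(datasets.__version__) < vparse("4.0.0"):
    datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True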
......@@ -981,6 +981,10 @@ class ConfigurableTask(Task):
def download(
self, dataset_kwargs: Optional[Dict[str, Any]] = None, **kwargs
) -> None:
from packaging.version import parse as vparse
if dataset_kwargs and vparse(datasets.__version__) >= vparse("4.0.0"):
dataset_kwargs.pop("trust_remote_code", None)
if isinstance(self.config.custom_dataset, Callable):
eval_logger.warning(
f"{self.config.task}: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager."
......
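Similarly, a small sketch (with hypothetical kwargs) of how `dataset_kwargs` gets sanitized before being forwarded to `load_dataset` on `datasets` >= 4.0.0, which no longer accepts `trust_remote_code`:

# Hypothetical task kwargs; the key is dropped only on datasets >= 4.0.0.
import datasets
from packaging.version import parse as vparse

dataset_kwargs = {"name": "regular", "trust_remote_code": True}
if vparse(datasets.__version__) >= vparse("4.0.0"):
    dataset_kwargs.pop("trust_remote_code", None)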
......@@ -3,9 +3,10 @@ from __future__ import annotations
import copy
import logging
import os
from collections.abc import Iterator, Sequence
from datetime import timedelta
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Literal
import jinja2
import torch
......@@ -19,6 +20,7 @@ from accelerate import (
from accelerate.utils import get_max_memory
from huggingface_hub import HfApi
from packaging import version
from packaging.version import parse as vparse
from tqdm import tqdm
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
......@@ -26,7 +28,6 @@ from transformers.models.auto.modeling_auto import (
)
from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
......@@ -42,15 +43,17 @@ from lm_eval.models.utils import (
if TYPE_CHECKING:
from transformers.quantizers import AutoQuantizationConfig
from transformers.quantizers.auto import AutoQuantizationConfig
from lm_eval.api.instance import Instance
eval_logger = logging.getLogger(__name__)
TOKENIZER_INFINITY = 1000000000000000019884624838656
@register_model("hf-auto", "hf", "huggingface")
class HFLM(TemplateLM):
"""
An abstracted Huggingface model class. Enables usage with both models of
"""An abstracted Huggingface model class. Enables usage with both models of
`transformers.AutoModelForCausalLM` and `transformers.AutoModelForSeq2SeqLM` classes.
Supports data-parallel multi-GPU with HF Accelerate.
......@@ -61,48 +64,45 @@ class HFLM(TemplateLM):
def __init__(
self,
pretrained: Union[str, transformers.PreTrainedModel],
pretrained: str | transformers.PreTrainedModel,
backend: Literal["default", "causal", "seq2seq"] = "default",
# override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
revision: Optional[str] = "main",
revision: str | None = "main",
subfolder: str = "",
tokenizer: Optional[
Union[
str,
transformers.PreTrainedTokenizer,
transformers.PreTrainedTokenizerFast,
]
] = None,
truncation: Optional[bool] = False,
tokenizer: str
| transformers.PreTrainedTokenizer
| transformers.PreTrainedTokenizerFast
| None = None,
truncation: bool | None = False,
logits_cache: bool = True,
max_length: Optional[int] = None,
device: Optional[str] = "cuda",
dtype: Optional[Union[str, torch.dtype]] = "auto",
softmax_dtype: Optional[Union[str, torch.dtype]] = None,
mixed_precision_dtype: Optional[Union[str, torch.dtype]] = None,
batch_size: Optional[Union[int, str]] = 1,
max_batch_size: Optional[int] = 64,
trust_remote_code: Optional[bool] = False,
use_fast_tokenizer: Optional[bool] = True,
add_bos_token: Optional[bool] = False,
prefix_token_id: Optional[int] = None,
max_length: int | None = None,
device: str | None = "cuda",
dtype: str | torch.dtype | None = "auto",
softmax_dtype: str | torch.dtype | None = None,
mixed_precision_dtype: str | torch.dtype | None = None,
batch_size: int | str | None = 1,
max_batch_size: int | None = 64,
trust_remote_code: bool | None = False,
use_fast_tokenizer: bool | None = True,
add_bos_token: bool | None = False,
prefix_token_id: int | None = None,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
parallelize: Optional[bool] = False,
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[Union[str, os.PathLike]] = "./offload",
parallelize: bool | None = False,
max_memory_per_gpu: int | str | None = None,
max_cpu_memory: int | str | None = None,
offload_folder: str | os.PathLike | None = "./offload",
# PEFT, delta weights and quantization options
peft: Optional[str] = None,
delta: Optional[str] = None,
autogptq: Optional[Union[bool, str]] = False,
gptqmodel: Optional[bool] = False,
gguf_file: Optional[str] = None,
peft: str | None = None,
delta: str | None = None,
autogptq: bool | str | None = False,
gptqmodel: bool | None = False,
gguf_file: str | None = None,
# end token for thinking, either the string or int token id.
# splits to get response after this token (if provided).
think_end_token: Union[str, int, None] = None,
think_end_token: str | int | None = None,
enable_thinking: bool | None = None,
chat_template_args: Optional[dict[str, Any]] = None,
chat_template_args: dict[str, Any] | None = None,
**kwargs,
) -> None:
super().__init__()
......@@ -280,18 +280,19 @@ class HFLM(TemplateLM):
self.batch_size_per_gpu = int(batch_size)
if isinstance(pretrained, str):
if gpus >= 1 or str(self.device) == "mps":
if (gpus >= 1 or str(self.device) == "mps") and not (
parallelize or autogptq or hasattr(self, "accelerator")
):
# TODO: can remove this whole snippet except in the mps case, perhaps?
if not (parallelize or autogptq or hasattr(self, "accelerator")):
# place model onto device requested manually,
# if not using HF Accelerate or device_map
# or any other option that preloads model onto device
try:
self.model.to(self.device)
except ValueError:
eval_logger.debug(
"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
)
# place model onto device requested manually,
# if not using HF Accelerate or device_map
# or any other option that preloads model onto device
try:
self.model.to(self.device)
except ValueError:
eval_logger.debug(
"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
)
# multigpu data-parallel support when launched with accelerate
if gpus > 1:
if accelerator.num_processes > 1:
......@@ -336,12 +337,12 @@ class HFLM(TemplateLM):
def _get_accelerate_args(
self,
parallelize: Optional[bool] = None,
device_map: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
gpus: Optional[int] = None,
parallelize: bool | None = None,
device_map: str | None = "auto",
max_memory_per_gpu: int | str | None = None,
max_cpu_memory: int | str | None = None,
offload_folder: str | None = "./offload",
gpus: int | None = None,
) -> dict:
"""Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
......@@ -379,13 +380,8 @@ class HFLM(TemplateLM):
}
else: # Estimating the possible memory requirements
max_memory_all_gpus = get_max_memory()
if "cpu" in max_memory_all_gpus:
del max_memory_all_gpus["cpu"]
if not hasattr(self, "accelerator"):
max_memory_per_gpu_map = {
k: v for k, v in max_memory_all_gpus.items()
}
else:
max_memory_all_gpus.pop("cpu", None)
if hasattr(self, "accelerator"):
# use only 1 / num_processes of the GPUs if we are running under accelerate launch
max_memory_per_gpu_map = {
k: v
......@@ -393,6 +389,9 @@ class HFLM(TemplateLM):
if k % num_local_processes
== (self.accelerator.process_index % num_local_processes)
}
else:
max_memory_per_gpu_map = max_memory_all_gpus
args["max_memory"] = max_memory_per_gpu_map
args["device_map"] = "auto" if device_map is None else device_map
eval_logger.info(
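As an aside, the sharding logic above can be pictured with hypothetical numbers: under `accelerate launch`, each local process keeps only its slice of the per-GPU memory map returned by `get_max_memory()`.

# Illustrative sketch with made-up values (not the library's actual output).
max_memory_all_gpus = {0: "70GiB", 1: "70GiB", 2: "70GiB", 3: "70GiB"}
num_local_processes = 2
process_index = 1  # stand-in for self.accelerator.process_index
max_memory_per_gpu_map = {
    k: v
    for k, v in max_memory_all_gpus.items()
    if k % num_local_processes == (process_index % num_local_processes)
}
print(max_memory_per_gpu_map)  # {1: '70GiB', 3: '70GiB'}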
......@@ -436,12 +435,12 @@ class HFLM(TemplateLM):
return self._model
@property
def eot_token_id(self):
def eot_token_id(self) -> int:
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def prefix_token_id(self):
def prefix_token_id(self) -> int:
# it is used as prefix for loglikelihood
if self.custom_prefix_token_id is not None:
return self.custom_prefix_token_id
......@@ -450,7 +449,7 @@ class HFLM(TemplateLM):
return self.tokenizer.eos_token_id
@property
def max_length(self):
def max_length(self) -> int:
if self._max_length: # if max length manually set, return it
return self._max_length
seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
......@@ -458,7 +457,7 @@ class HFLM(TemplateLM):
if hasattr(self.model.config, attr):
return getattr(self.model.config, attr)
if hasattr(self.tokenizer, "model_max_length"):
if self.tokenizer.model_max_length == 1000000000000000019884624838656:
if self.tokenizer.model_max_length == TOKENIZER_INFINITY:
return self._DEFAULT_MAX_LENGTH
return self.tokenizer.model_max_length
return self._DEFAULT_MAX_LENGTH
......@@ -489,12 +488,12 @@ class HFLM(TemplateLM):
def _get_backend(
self,
config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
config: transformers.PretrainedConfig | transformers.AutoConfig,
backend: Literal["default", "causal", "seq2seq"] = "default",
trust_remote_code: Optional[bool] = False,
trust_remote_code: bool | None = False,
) -> None:
"""
Helper method during initialization.
"""Helper method during initialization.
Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) model type to be used.
sets `self.AUTO_MODEL_CLASS` appropriately if not already set.
......@@ -506,9 +505,7 @@ class HFLM(TemplateLM):
if backend != "default":
# if we've settled on non-default backend, use that manually
if backend == "causal":
self.backend = backend
elif backend == "seq2seq":
if backend in ["causal", "seq2seq"]:
self.backend = backend
eval_logger.info(
f"Overrode HF model backend type, and using type '{self.backend}'"
......@@ -516,7 +513,7 @@ class HFLM(TemplateLM):
else:
# determine and use the default HF backend for this model, based on its config + metadata.
if (
getattr(config, "model_type")
getattr(config, "model_type", None)
in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
):
# first check if model type is listed under seq2seq models, since some
......@@ -525,7 +522,7 @@ class HFLM(TemplateLM):
self.backend = "seq2seq"
eval_logger.debug(f"Using model type '{self.backend}'")
elif (
getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
getattr(config, "model_type", None) in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
):
self.backend = "causal"
eval_logger.debug(f"Using model type '{self.backend}'")
......@@ -554,10 +551,10 @@ class HFLM(TemplateLM):
pretrained: str,
revision: str = "main",
trust_remote_code: bool = False,
gguf_file: Optional[str] = None,
gguf_file: str | None = None,
subfolder: str = "",
) -> None:
"""Return the model config for HuggingFace models"""
"""Return the model config for HuggingFace models."""
self._config = transformers.AutoConfig.from_pretrained(
pretrained,
revision=revision,
......@@ -569,29 +566,28 @@ class HFLM(TemplateLM):
def _create_model(
self,
pretrained: str,
revision: Optional[str] = "main",
dtype: Optional[Union[str, torch.dtype]] = "auto",
trust_remote_code: Optional[bool] = False,
revision: str | None = "main",
dtype: str | torch.dtype | None = "auto",
trust_remote_code: bool | None = False,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
# (accelerate naive PP (device_map) options)
parallelize: Optional[bool] = False,
gpus: Optional[int] = None,
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
parallelize: bool | None = False,
gpus: int | None = None,
max_memory_per_gpu: int | str | None = None,
max_cpu_memory: int | str | None = None,
offload_folder: str | None = "./offload",
# PEFT, delta weights and quantization options
peft: Optional[str] = None,
delta: Optional[str] = None,
autogptq: Optional[Union[bool, str]] = False,
gptqmodel: Optional[bool] = False,
gguf_file: Optional[str] = None,
quantization_config: Optional["AutoQuantizationConfig"] = None,
peft: str | None = None,
delta: str | None = None,
autogptq: bool | str | None = False,
gptqmodel: bool | None = False,
gguf_file: str | None = None,
quantization_config: AutoQuantizationConfig | None = None,
subfolder: str = "",
**kwargs,
) -> None:
"""
Initializes an HF or HF-compatible PreTrainedModel from scratch
"""Initializes an HF or HF-compatible PreTrainedModel from scratch
inside HFLM, using the kwargs passed into self.__init__().
Also handles functionality such as AutoGPTQ usage and PEFT wrapping.
......@@ -602,12 +598,12 @@ class HFLM(TemplateLM):
please consider subclassing HFLM and overriding this and other methods as needed.
"""
model_kwargs = kwargs if kwargs else {}
model_kwargs = kwargs or {}
model_kwargs.update(
self._get_accelerate_args(
parallelize=parallelize,
device_map=kwargs.get("device_map", None),
device_map=kwargs.get("device_map"),
max_memory_per_gpu=max_memory_per_gpu,
max_cpu_memory=max_cpu_memory,
offload_folder=offload_folder,
......@@ -616,16 +612,12 @@ class HFLM(TemplateLM):
)
if not autogptq and not gptqmodel:
if model_kwargs.get("load_in_4bit", None):
assert transformers.__version__ >= "4.30.0", (
if model_kwargs.get("load_in_4bit"):
assert vparse(transformers.__version__) >= vparse("4.30.0"), (
"load_in_4bit requires transformers >= 4.30.0"
)
if transformers.__version__ >= "4.30.0":
if model_kwargs.get("load_in_4bit", None):
if model_kwargs.get("bnb_4bit_compute_dtype", None):
model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(
model_kwargs["bnb_4bit_compute_dtype"]
)
if compute_dtype := model_kwargs.get("bnb_4bit_compute_dtype"):
model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(compute_dtype)
self._model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained,
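The switch from comparing raw version strings to `vparse` in the hunk above matters because string comparison is lexicographic, not numeric; a quick illustration:

# Lexicographic comparison gets multi-digit components wrong.
from packaging.version import parse as vparse

print("4.9.0" >= "4.30.0")                   # True  (misleading: '9' > '3')
print(vparse("4.9.0") >= vparse("4.30.0"))   # False (4.9.0 is older than 4.30.0)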
......@@ -650,7 +642,7 @@ class HFLM(TemplateLM):
raise type(exception)(
"Tried to load auto_gptq, but auto-gptq is not installed ",
"please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
)
) from exception
self._model = AutoGPTQForCausalLM.from_quantized(
pretrained,
......@@ -669,7 +661,7 @@ class HFLM(TemplateLM):
raise type(exception)(
"Tried to load gptqmodel, but gptqmodel is not installed ",
"please install gptqmodel via `pip install gptqmodel --no-build-isolation` or `pip install lm-eval[gptqmodel] --no-build-isolation`",
)
) from exception
self._model = GPTQModel.from_quantized(
pretrained, trust_remote_code=trust_remote_code, **model_kwargs
......@@ -684,9 +676,10 @@ class HFLM(TemplateLM):
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
if model_kwargs.get("load_in_4bit", None):
if version.parse(PEFT_VERSION) < version.parse("0.4.0"):
raise AssertionError("load_in_4bit requires peft >= 0.4.0")
if model_kwargs.get("load_in_4bit") and vparse(PEFT_VERSION) < vparse(
"0.4.0"
):
raise AssertionError("load_in_4bit requires peft >= 0.4.0")
if self._model.config.vocab_size != len(self.tokenizer):
# resize model for LoRAs with added tokens
eval_logger.info(
......@@ -711,36 +704,32 @@ class HFLM(TemplateLM):
for name, param in self._model.state_dict().items():
try:
param.data += _model_delta.state_dict()[name]
except KeyError:
raise KeyError(f"Delta model is missing weights for layer: {name}")
except KeyError as e:
raise KeyError(
f"Delta model is missing weights for layer: {name}"
) from e
except Exception as e:
raise RuntimeError(
f"Failed to add delta weights to layer {name}. Error: {e}"
)
) from e
del _model_delta
return None
def _create_tokenizer(
self,
pretrained: Union[str, transformers.PreTrainedModel],
tokenizer: Optional[
Union[
str,
transformers.PreTrainedTokenizer,
transformers.PreTrainedTokenizerFast,
]
],
revision: Optional[str] = "main",
trust_remote_code: Optional[bool] = False,
use_fast_tokenizer: Optional[bool] = True,
gguf_file: Optional[str] = None,
add_bos_token: Optional[bool] = False,
subfolder: Optional[str] = "",
pretrained: str | transformers.PreTrainedModel,
tokenizer: str
| transformers.PreTrainedTokenizer
| transformers.PreTrainedTokenizerFast
| None,
revision: str | None = "main",
trust_remote_code: bool | None = False,
use_fast_tokenizer: bool | None = True,
gguf_file: str | None = None,
add_bos_token: bool | None = False,
subfolder: str | None = "",
) -> None:
"""
Helper method during initialization.
"""Helper method during initialization.
Create a tokenizer object corresponding to the correct
tokenizer for value of `pretrained`, or use the pre-initialized tokenizer passed.
......@@ -769,8 +758,12 @@ class HFLM(TemplateLM):
)
else:
assert isinstance(
tokenizer, transformers.PreTrainedTokenizer
) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
tokenizer,
(
transformers.PreTrainedTokenizer,
transformers.PreTrainedTokenizerFast,
),
)
self.tokenizer = tokenizer
else:
# Get tokenizer based on 'pretrained'
......@@ -782,9 +775,8 @@ class HFLM(TemplateLM):
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
model_name, **kwargs
)
return None
def _detect_batch_size(self, requests=None, pos: int = 0):
def _detect_batch_size(self, requests: Sequence | None = None, pos: int = 0):
if requests:
_, context_enc, continuation_enc = requests[pos]
max_length = len(
......@@ -799,7 +791,7 @@ class HFLM(TemplateLM):
# if OOM, then halves batch_size and tries again
@find_executable_batch_size(starting_batch_size=self.max_batch_size)
def forward_batch(batch_size):
def forward_batch(batch_size: int):
if self.backend == "seq2seq":
length = max(max_context_enc, max_cont_enc)
batched_conts = torch.ones(
......@@ -846,8 +838,11 @@ class HFLM(TemplateLM):
return batch_size
def tok_encode(
self, string: str, left_truncate_len=None, add_special_tokens=None
) -> List[int]:
self,
string: str,
left_truncate_len: int | None = None,
add_special_tokens: bool | None = None,
) -> list[int]:
""" """
# default for None - empty dict, use predefined tokenizer param
# used for all models except for CausalLM or predefined value
......@@ -873,11 +868,11 @@ class HFLM(TemplateLM):
def tok_batch_encode(
self,
strings: List[str],
strings: list[str],
padding_side: str = "left",
left_truncate_len: int = None,
left_truncate_len: int | None = None,
truncation: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor]:
# encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
old_padding_side = self.tokenizer.padding_side
self.tokenizer.padding_side = padding_side
......@@ -896,7 +891,7 @@ class HFLM(TemplateLM):
if left_truncate_len:
original_lengths = encoding["input_ids"].size(1)
if original_lengths > left_truncate_len:
eval_logger.warn(
eval_logger.warning(
f"Left truncation applied. Original sequence length was {original_lengths}, "
f"truncating to last {left_truncate_len} tokens. Some content will be lost.",
)
......@@ -908,11 +903,17 @@ class HFLM(TemplateLM):
return encoding["input_ids"], encoding["attention_mask"]
def tok_decode(self, tokens, skip_special_tokens=True):
def tok_decode(self, tokens: Iterator[list[str]], skip_special_tokens: bool = True):
return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
def _model_call(self, inps, attn_mask=None, labels=None):
def _model_call(
self,
inps: torch.Tensor,
attn_mask: torch.Tensor | None = None,
labels: torch.Tensor | None = None,
) -> torch.Tensor:
"""
:param inps: torch.Tensor
A torch tensor of shape [batch, (sequence_ctx + sequence_cont)] or of shape
[batch, sequence_ctx]. the size of sequence may vary from call to call
......@@ -926,32 +927,40 @@ class HFLM(TemplateLM):
A torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model's decoder
"""
with torch.no_grad():
with torch.autocast(
with (
torch.no_grad(),
torch.autocast(
device_type=self.device.type,
dtype=self.mixed_precision_dtype,
enabled=self.mixed_precision_dtype is not None,
):
if attn_mask is not None or labels is not None:
assert attn_mask is not None and labels is not None
assert self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM
return self.model(
input_ids=inps, attention_mask=attn_mask, labels=labels
).logits
else:
assert self.AUTO_MODEL_CLASS in (
transformers.AutoModelForCausalLM,
transformers.AutoModelForVision2Seq,
)
return self.model(inps).logits
),
):
if attn_mask is not None or labels is not None:
assert attn_mask is not None and labels is not None
assert transformers.AutoModelForSeq2SeqLM == self.AUTO_MODEL_CLASS
return self.model(
input_ids=inps, attention_mask=attn_mask, labels=labels
).logits
assert self.AUTO_MODEL_CLASS in (
transformers.AutoModelForCausalLM,
transformers.AutoModelForVision2Seq,
)
return self.model(inps).logits
def _model_generate(self, context, max_length, stop, **generation_kwargs):
def _model_generate(
self,
context,
max_length: int,
stop: list[str],
**generation_kwargs: dict[str, Any],
) -> torch.Tensor:
# temperature = 0.0 if not set
# if do_sample is false and temp==0.0:
# remove temperature, as do_sample=False takes care of this
# and we don't want a warning from HF
generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
do_sample = generation_kwargs.get("do_sample", None)
do_sample = generation_kwargs.get("do_sample")
# The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
......@@ -978,7 +987,10 @@ class HFLM(TemplateLM):
)
def _select_cont_toks(
self, logits: torch.Tensor, contlen: int = None, inplen: int = None
self,
logits: torch.Tensor,
contlen: int | None = None,
inplen: int | None = None,
) -> torch.Tensor:
if self.backend == "causal":
assert contlen and inplen, (
......@@ -998,8 +1010,8 @@ class HFLM(TemplateLM):
return logits
def loglikelihood_rolling(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[float]:
self, requests: list[Instance], disable_tqdm: bool = False
) -> list[float]:
adaptive_batch_size = None
if self.batch_size == "auto":
# using rolling window with maximum context
......@@ -1018,7 +1030,7 @@ class HFLM(TemplateLM):
disable=(disable_tqdm or (self.rank != 0)),
)
):
rolling_token_windows: List[Tuple[List[int], List[int]]] = list(
rolling_token_windows: list[tuple[list[int], list[int]]] = list(
map(
utils.make_disjoint_window,
utils.get_rolling_token_windows(
......@@ -1102,15 +1114,15 @@ class HFLM(TemplateLM):
def _loglikelihood_tokens(
self,
requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
requests: list[tuple[tuple[str, str], list[int], list[int]]],
disable_tqdm: bool = False,
override_bs: int = None,
) -> List[Tuple[float, bool]]:
override_bs: int | None = None,
) -> list[tuple[float, bool]]:
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]):
"""Defines the key for the sorted method"""
def _collate(req: tuple[tuple[str, str], list[int], list[int]]):
"""Defines the key for the sorted method."""
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
......@@ -1121,8 +1133,8 @@ class HFLM(TemplateLM):
toks = req[1] + req[2]
return -len(toks), tuple(toks)
def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]):
"""Defines the key to group and lookup one-token continuations"""
def _lookup_one_token_cont(req: tuple[tuple[str, str], list[int], list[int]]):
"""Defines the key to group and lookup one-token continuations."""
# Use with group_by="contexts" (optional)"
# allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
# speeds up some multiple-choice tasks proportionally to the number of choices.
......@@ -1295,7 +1307,7 @@ class HFLM(TemplateLM):
# original args. Otherwise, expands the logits batch dimension and yields each
# batch along with matching continuation tokens and prompt strings.
# logits -> [1, seq, vocab]
for request_str, cont_toks, logits in re_ord.get_cache(
for request_str, cont_toks, logits in re_ord.get_cache( # noqa
req_str=request_str,
cxt_toks=ctx_tokens,
cont_toks=cont_toks,
......@@ -1336,11 +1348,11 @@ class HFLM(TemplateLM):
return re_ord.get_original(res)
def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
self, requests: list[Instance], disable_tqdm: bool = False
) -> list[str]:
res = []
def _collate(req: Tuple[str, dict]):
def _collate(req: tuple[str, dict]):
"""Defines the key for the sorted method"""
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
......@@ -1400,10 +1412,10 @@ class HFLM(TemplateLM):
# add EOS token to stop sequences
until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
else:
raise ValueError(
raise TypeError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
if "max_gen_toks" in kwargs.keys():
if "max_gen_toks" in kwargs:
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
......@@ -1481,11 +1493,9 @@ class HFLM(TemplateLM):
return res
def apply_chat_template(
self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
self, chat_history: list[dict[str, str]], add_generation_prompt: bool = True
) -> str:
"""
Method to apply a chat template to a list of chat history between user and model.
"""
"""Method to apply a chat template to a list of chat history between user and model."""
try:
chat_templated = self.tokenizer.apply_chat_template(
chat_history,
......@@ -1510,9 +1520,7 @@ class HFLM(TemplateLM):
return chat_templated
def get_model_info(self) -> dict:
"""
Method to get Hugging Face model information for experiment reproducibility.
"""
"""Method to get Hugging Face model information for experiment reproducibility."""
def get_model_num_params(model) -> int:
if hasattr(model, "num_parameters"):
......
tag:
- multiple_choice
task: hellaswag
dataset_path: hellaswag
dataset_path: Rowan/hellaswag
dataset_name: null
output_type: multiple_choice
training_split: train
......
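A minimal sanity check of the new dataset path, assuming access to the Hugging Face Hub (the field name follows the Rowan/hellaswag dataset card); newer `datasets` releases resolve namespaced repo ids rather than bare script names like "hellaswag".

# Sketch: load the relocated dataset by its Hub repo id.
from datasets import load_dataset

ds = load_dataset("Rowan/hellaswag", split="validation")
print(ds[0]["ctx"])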
......@@ -21,7 +21,7 @@ license = { "text" = "MIT" }
dependencies = [
"accelerate>=0.26.0",
"evaluate",
"datasets>=2.16.0",
"datasets>=2.16.0,<4.0",
"evaluate>=0.4.0",
"jsonlines",
"numexpr",
......
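For reference, a small check of what the tightened constraint admits, using the `packaging` library (version numbers are illustrative):

# The pin keeps 2.16 <= datasets < 4.0; 4.x releases are excluded.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=2.16.0,<4.0")
print(Version("3.6.0") in spec)  # True: still allowed
print(Version("4.0.0") in spec)  # False: excluded by the pin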