Pin datasets < 4.0.0 (#3172)

* Fix: pin datasets < 4.0
* fix
* update type hints in HF
* fix hellaswag path

Pin datasets < 4.0.0 (#3172)
* Fix: pin datasets < 4.0
* fix
* update type hints in HF
* fix hellaswag path
904bba12 · Baber Abbasi · GitHub · 2eea3f50 · 904bba12 · 904bba12
Unverified commit 904bba12, authored Jul 23, 2025 by Baber Abbasi; committed by GitHub on Jul 23, 2025.
5 changed files
--- a/lm_eval/__main__.py
+++ b/lm_eval/__main__.py
@@ -433,8 +433,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
        # because it's already been determined based on the prior env var before launching our
        # script--`datasets` gets imported by lm_eval internally before these lines can update the env.
        import datasets
+        from packaging.version import parse as vparse

-        datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
+        if vparse(datasets.__version__) < vparse("4.0.0"):
+            datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True

        if isinstance(args.model_args, dict):
            args.model_args["trust_remote_code"] = True

--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -981,6 +981,10 @@ class ConfigurableTask(Task):
    def download(
        self, dataset_kwargs: Optional[Dict[str, Any]] = None, **kwargs
    ) -> None:
+        from packaging.version import parse as vparse
+
+        if dataset_kwargs and vparse(datasets.__version__) >= vparse("4.0.0"):
+            dataset_kwargs.pop("trust_remote_code", None)
        if isinstance(self.config.custom_dataset, Callable):
            eval_logger.warning(
                f"{self.config.task}: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager."

--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
--- a/lm_eval/tasks/hellaswag/hellaswag.yaml
+++ b/lm_eval/tasks/hellaswag/hellaswag.yaml
 tag:
  - multiple_choice
 task: hellaswag
-dataset_path: hellaswag
+dataset_path: Rowan/hellaswag
 dataset_name: null
 output_type: multiple_choice
 training_split: train

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,7 @@ license = { "text" = "MIT" }
 dependencies = [
    "accelerate>=0.26.0",
    "evaluate",
-    "datasets>=2.16.0",
+    "datasets>=2.16.0,<4.0",
    "evaluate>=0.4.0",
    "jsonlines",
    "numexpr",