gaoqiong / lm-evaluation-harness · Commits

Commit 28001d29 (unverified): "remove all; reformat table (#3107)"
Authored Jul 05, 2025 by Baber Abbasi; committed by GitHub, Jul 05, 2025
Parent commit: 71d0289d

Showing 6 changed files, with 65 additions and 89 deletions (+65, -89):

- .github/workflows/new_tasks.yml  (+2, -2)
- .github/workflows/unit_tests.yml (+2, -2)
- .pre-commit-config.yaml          (+2, -2)
- README.md                        (+18, -31)
- lm_eval/api/model.py             (+38, -29)
- pyproject.toml                   (+3, -23)
.github/workflows/new_tasks.yml (view file @ 28001d29)

@@ -50,12 +50,12 @@ jobs:
       with:
         python-version: 3.9
         cache: 'pip'
-        cache-dependency-path: setup.py
+        cache-dependency-path: pyproject.toml
     - name: Install dependencies
       if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
       run: |
         python -m pip install --upgrade pip
-        pip install -e '.[dev,ifeval]' --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install -e '.[dev,ifeval,unitxt]' --extra-index-url https://download.pytorch.org/whl/cpu
         # Install optional git dependencies
         # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
         # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
.github/workflows/unit_tests.yml (view file @ 28001d29)

@@ -53,7 +53,7 @@ jobs:
       # Cache HuggingFace cache directory for CPU tests
       - name: Cache HuggingFace cache (CPU tests)
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         id: cache-hf-cpu
         with:
           path: ~/.cache/huggingface

@@ -64,7 +64,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install -e '.[dev,unitxt]' --extra-index-url https://download.pytorch.org/whl/cpu
           pip install hf_xet
       - name: Test with pytest
.pre-commit-config.yaml (view file @ 28001d29)

@@ -29,7 +29,7 @@ repos:
       - id: mixed-line-ending
         args: [--fix=lf]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.10
+    rev: v0.12.2
     hooks:
       # Run the linter.
       - id: ruff

@@ -47,7 +47,7 @@ repos:
         )$
         args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
   - repo: https://github.com/jackdewinter/pymarkdown
-    rev: v0.9.29
+    rev: v0.9.30
     hooks:
       - id: pymarkdown
         exclude: ^(lm_eval/tasks/.*|docs/footguns\.md)$
README.md (view file @ 28001d29)

@@ -599,37 +599,24 @@ The best way to get support is to open an issue on this repo or join the [Eleuth

 Extras dependencies can be installed via `pip install -e ".[NAME]"`

-| Name                 | Use                                                    |
-| -------------------- | ------------------------------------------------------ |
-| api                  | For using api models (Anthropic, OpenAI API)           |
-| audiolm_qwen         | For running Qwen2 audio models                         |
-| deepsparse           | For running NM's DeepSparse models                     |
-| dev                  | For linting PRs and contributions                      |
-| gptq                 | For loading models with AutoGPTQ                       |
-| gptqmodel            | For loading models with GPTQModel                      |
-| hf_transfer          | For speeding up HF Hub file downloads                  |
-| ibm_watsonx_ai       | For using IBM watsonx.ai model apis                    |
-| ifeval               | For running the IFEval task                            |
-| ipex                 | For running on optimum-intel ipex backend              |
-| japanese_leaderboard | For running Japanese LLM Leaderboard tasks             |
-| longbench            | For running LongBench tasks                            |
-| mamba                | For loading Mamba SSM models                           |
-| math                 | For running math task answer checking                  |
-| multilingual         | For multilingual tokenizers                            |
-| neuronx              | For running on AWS inf2 instances                      |
-| optimum              | For running Intel OpenVINO models                      |
-| promptsource         | For using PromptSource prompts                         |
-| ruler                | For running RULER tasks                                |
-| sae_lens             | For using SAELens to steer models                      |
-| sentencepiece        | For using the sentencepiece tokenizer                  |
-| sparseml             | For using NM's SparseML models                         |
-| sparsify             | For using Sparsify to steer models                     |
-| testing              | For running library test suite                         |
-| vllm                 | For loading models with vLLM                           |
-| wandb                | For integration with `Weights and Biases` platform     |
-| zeno                 | For visualizing results with Zeno                      |
-| -------------------- | ------------------------------------------------------ |
-| all                  | Loads all extras (not recommended)                     |
+| NAME                 | Description                    | NAME           | Description                           |
+|----------------------|--------------------------------|----------------|---------------------------------------|
+| tasks                | All task-specific dependencies | api            | API models (Anthropic, OpenAI, local) |
+| acpbench             | ACP Bench tasks                | audiolm_qwen   | Qwen2 audio models                    |
+| ifeval               | IFEval task                    | deepsparse     | DeepSparse models (CPU)               |
+| japanese_leaderboard | Japanese LLM tasks             | gptq           | AutoGPTQ models                       |
+| longbench            | LongBench tasks                | gptqmodel      | GPTQModel models                      |
+| math                 | Math answer checking           | hf_transfer    | Speed up HF downloads                 |
+| multilingual         | Multilingual tokenizers        | ibm_watsonx_ai | IBM watsonx.ai models                 |
+| ruler                | RULER tasks                    | ipex           | Intel IPEX backend                    |
+|                      |                                |                |                                       |
+| dev                  | Linting & contributions        | mamba          | Mamba SSM models                      |
+| promptsource         | PromptSource prompts           | neuronx        | AWS inf2 instances                    |
+| sentencepiece        | Sentencepiece tokenizer        | optimum        | Intel OpenVINO models                 |
+| testing              | Run test suite                 | sae_lens       | SAELens model steering                |
+| unitxt               | Run unitxt tasks               | sparseml       | SparseML models (CPU)                 |
+| wandb                | Weights & Biases               | sparsify       | Sparsify model steering               |
+| zeno                 | Result visualization           | vllm           | vLLM models                           |

 ## Cite as
lm_eval/api/model.py (view file @ 28001d29)

@@ -3,15 +3,19 @@ import hashlib
 import json
 import logging
 import os
-from typing import Dict, List, Optional, Tuple, Type, TypeVar, Union
+from typing import TYPE_CHECKING, Any, Iterable, Optional, Type, TypeVar, Union

-import transformers
-from sqlitedict import SqliteDict
 from tqdm import tqdm

 from lm_eval import utils

+if TYPE_CHECKING:
+    from sqlitedict import SqliteDict
+
+    from lm_eval.api.instance import Instance
+

 eval_logger = logging.getLogger(__name__)

 T = TypeVar("T", bound="LM")
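The import changes move SqliteDict behind a TYPE_CHECKING guard: the name stays visible to static type checkers, but sqlitedict is no longer imported when the module loads. A minimal sketch of the pattern (open_cache is a hypothetical helper, not part of this file):

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Seen only by type checkers (mypy, pyright); never executed at runtime,
        # so sqlitedict stops being an import-time dependency.
        from sqlitedict import SqliteDict


    def open_cache(path: str) -> "SqliteDict":
        # The quoted annotation is resolved lazily; the real import happens
        # only when the function is actually called.
        from sqlitedict import SqliteDict

        return SqliteDict(path, autocommit=True)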
@@ -27,10 +31,10 @@ class LM(abc.ABC):
         # set rank and world size to a single process, by default.
         self._rank = 0
         self._world_size = 1
-        self.cache_hook = CacheHook(None)
+        self.cache_hook: "CacheHook" = CacheHook(None)

     @abc.abstractmethod
-    def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
+    def loglikelihood(self, requests) -> list[tuple[float, bool]]:
         """Compute log-likelihood of generating a continuation from a context.
         Downstream tasks should attempt to use loglikelihood instead of other
         LM calls whenever possible.
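The annotations switch from typing.List/typing.Tuple to the builtin generics list/tuple (PEP 585), which are valid at runtime on Python 3.9+, the same floor the workflow above pins. A hypothetical minimal subclass, only to show the updated signatures (the constant return values are placeholders, not real model output):

    from lm_eval.api.model import LM


    class StubLM(LM):
        # Hypothetical stub so the abstract class is instantiable.
        def loglikelihood(self, requests) -> list[tuple[float, bool]]:
            return [(0.0, True) for _ in requests]

        def loglikelihood_rolling(self, requests) -> list[float]:
            return [0.0 for _ in requests]

        def generate_until(self, requests) -> list[str]:
            return ["" for _ in requests]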
@@ -55,7 +59,7 @@
         pass

     @abc.abstractmethod
-    def loglikelihood_rolling(self, requests) -> List[float]:
+    def loglikelihood_rolling(self, requests) -> list[float]:
         """Compute full log-likelihood of a string, with no truncation, for perplexity computation
         - We will use the full max context length of the model.
         - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
@@ -97,7 +101,7 @@
     # TODO: Add an optional max length
     @abc.abstractmethod
-    def generate_until(self, requests) -> List[str]:
+    def generate_until(self, requests) -> list[str]:
         """Generate greedily until a stopping sequence

         :param requests: list[Instance]
@@ -114,7 +118,7 @@
         pass

     def apply_chat_template(
-        self, chat_history: List[Dict[str, str]], add_generation_prompt=True
+        self, chat_history: list[dict[str, str]], add_generation_prompt=True
     ) -> str:
         """
         Defines how to transform few-shot examples provided as chat history into a format that can be used as input to the LM.
@@ -165,8 +169,7 @@
         - Instance of the LM class.
         """
-        additional_config = {} if additional_config is None else additional_config
+        additional_config = additional_config or {}
         additional_config = {
             k: v for k, v in additional_config.items() if v is not None
         }
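`additional_config or {}` collapses None into an empty dict, after which the comprehension drops keys whose value is None; behavior is unchanged from the old ternary None check. A standalone check of that logic:

    def normalize(additional_config=None):
        # Mirrors the diff: fall back to {} on None, then drop None-valued keys.
        additional_config = additional_config or {}
        return {k: v for k, v in additional_config.items() if v is not None}


    assert normalize(None) == {}
    assert normalize({"device": "cuda", "batch_size": None}) == {"device": "cuda"}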
@@ -204,25 +207,25 @@
         return ""

-    def set_cache_hook(self, cache_hook) -> None:
+    def set_cache_hook(self, cache_hook: "CacheHook") -> None:
         self.cache_hook = cache_hook


 ### SQLite-based caching of LM responses


-def hash_args(attr, args):
+def hash_args(attr: str, args: Iterable[Any]) -> str:
     dat = json.dumps([attr] + list(args))
     return hashlib.sha256(dat.encode("utf-8")).hexdigest()


 class CacheHook:
-    def __init__(self, cachinglm) -> None:
+    def __init__(self, cachinglm: Optional["CachingLM"]) -> None:
         if cachinglm is None:
-            self.dbdict = None
+            self.dbdict: Optional["SqliteDict"] = None
             return

         self.dbdict = cachinglm.dbdict

-    def add_partial(self, attr, req, res) -> None:
+    def add_partial(self, attr: str, req: Iterable[Any], res: Any) -> None:
         if self.dbdict is None:
             return
         hsh = hash_args(attr, req)
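hash_args is unchanged in behavior: it JSON-encodes the method name plus its arguments and hashes the result, so identical requests always map to the same SQLite key across runs. The recipe in isolation:

    import hashlib
    import json


    def hash_args(attr: str, args) -> str:
        # JSON gives a stable, canonical encoding; SHA-256 turns it into a fixed-size key.
        dat = json.dumps([attr] + list(args))
        return hashlib.sha256(dat.encode("utf-8")).hexdigest()


    # Equal (attr, args) pairs land on the same cache key; different ones do not.
    assert hash_args("loglikelihood", ["ctx", "cont"]) == hash_args("loglikelihood", ["ctx", "cont"])
    assert hash_args("generate_until", ["ctx"]) != hash_args("loglikelihood", ["ctx"])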
@@ -230,7 +233,7 @@ class CacheHook:
 class CachingLM:
-    def __init__(self, lm, cache_db) -> None:
+    def __init__(self, lm: LM, cache_db: str) -> None:
         """LM wrapper that returns cached results if they exist, and uses the underlying LM if not.

         :param lm: LM
@@ -238,8 +241,10 @@ class CachingLM:
         :param cache_db: str
             Path to cache db
         """
-        self.lm = lm
-        self.cache_db = cache_db
+        from sqlitedict import SqliteDict
+
+        self.lm: LM = lm
+        self.cache_db: str = cache_db
         if os.path.dirname(cache_db):
             os.makedirs(os.path.dirname(cache_db), exist_ok=True)
         self.dbdict = SqliteDict(cache_db, autocommit=True)
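With the module-level import gone, SqliteDict is now imported inside CachingLM.__init__, so sqlitedict is only needed when response caching is actually requested. A rough sketch of the same lazy-import pattern with an explicit error (the helper and message are illustrative, not from the diff):

    import os


    def open_cache_db(cache_db: str):
        # Deferred import: the dependency is touched only on this code path.
        try:
            from sqlitedict import SqliteDict
        except ImportError as e:  # hypothetical guard, not in the diff
            raise ImportError("response caching requires `pip install sqlitedict`") from e

        if os.path.dirname(cache_db):
            os.makedirs(os.path.dirname(cache_db), exist_ok=True)
        return SqliteDict(cache_db, autocommit=True)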
@@ -247,13 +252,13 @@
         # add hook to lm
         lm.set_cache_hook(self.get_cache_hook())

-    def __getattr__(self, attr: str):
+    def __getattr__(self, attr: str) -> Any:
         lm_attr = getattr(self.lm, attr)
         if attr not in ["loglikelihood", "loglikelihood_rolling", "generate_until"]:
             eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM")
             return lm_attr

-        def fn(requests):
+        def _fn(requests: list["Instance"]) -> list["Instance"]:
             res = []
             remaining_reqs = []
             warned = False
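Only the three request-level methods are intercepted; every other attribute lookup falls straight through to the wrapped model. A toy illustration of that __getattr__ dispatch (ToyLM and its members are invented for the example):

    class ToyLM:
        tokenizer = "toy-tokenizer"

        def generate_until(self, requests):
            return [f"gen:{r}" for r in requests]


    class ToyCachingLM:
        def __init__(self, lm):
            self.lm = lm
            self.cache = {}

        def __getattr__(self, attr):
            lm_attr = getattr(self.lm, attr)
            if attr != "generate_until":
                return lm_attr  # pass through untouched

            def _fn(requests):
                # Serve hits from the cache; delegate misses to the wrapped LM.
                missing = [r for r in requests if r not in self.cache]
                for req, out in zip(missing, lm_attr(missing)):
                    self.cache[req] = out
                return [self.cache[r] for r in requests]

            return _fn


    lm = ToyCachingLM(ToyLM())
    assert lm.tokenizer == "toy-tokenizer"        # passed through
    assert lm.generate_until(["a"]) == ["gen:a"]  # cached path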
@@ -306,9 +311,9 @@
             return res

-        return fn
+        return _fn

-    def get_cache_hook(self):
+    def get_cache_hook(self) -> "CacheHook":
         return CacheHook(self)
@@ -331,19 +336,23 @@ class TemplateLM(LM):
         return self.eot_token_id

     @abc.abstractmethod
-    def tok_encode(self, string: str, **kwargs) -> List[int]:
+    def tok_encode(self, string: str, **kwargs) -> list[int]:
         """
         Tokenize a string using the model's tokenizer and return a list of token IDs.
         """
         pass

     @abc.abstractmethod
-    def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
+    def _loglikelihood_tokens(
+        self, requests: list["Instance"], **kwargs
+    ) -> list[tuple[float, bool]]:
         pass

     def _encode_pair(
         self, context: str, continuation: str
-    ) -> Tuple[List[int], List[int]]:
+    ) -> tuple[list[int], list[int]]:
+        import transformers
+
         n_spaces = len(context) - len(context.rstrip())
         if n_spaces > 0:
             continuation = context[-n_spaces:] + continuation
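The _encode_pair change itself is type-only, but the whitespace handling it wraps is worth a gloss: trailing whitespace is moved from the context onto the continuation before tokenizing, so the split point never falls inside whitespace the tokenizer would otherwise merge across the boundary. The rule in isolation (a standalone restatement, not the method itself):

    def shift_trailing_spaces(context: str, continuation: str) -> tuple[str, str]:
        # Trailing whitespace migrates from context to continuation, as in _encode_pair.
        n_spaces = len(context) - len(context.rstrip())
        if n_spaces > 0:
            continuation = context[-n_spaces:] + continuation
            context = context[:-n_spaces]
        return context, continuation


    assert shift_trailing_spaces("Question: ", "Paris") == ("Question:", " Paris")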
@@ -364,8 +373,8 @@ class TemplateLM(LM):
         return context_enc, continuation_enc

     def loglikelihood(
-        self, requests, disable_tqdm: bool = False
-    ) -> List[Tuple[float, bool]]:
+        self, requests: list["Instance"], disable_tqdm: bool = False
+    ) -> list[tuple[float, bool]]:
         new_reqs = []
         for context, continuation in [req.args for req in requests]:
             if context == "":
@@ -384,11 +393,11 @@ class TemplateLM(LM):
     @abc.abstractmethod
     def loglikelihood_rolling(
         self, requests, disable_tqdm: bool = False
-    ) -> List[float]:
+    ) -> list[float]:
         pass

     @abc.abstractmethod
-    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
+    def generate_until(self, requests, disable_tqdm: bool = False) -> list[str]:
         pass

     def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
pyproject.toml (view file @ 28001d29)

@@ -61,7 +61,7 @@ acpbench = ["lark>=1.1.9", "tarski[clingo]==0.8.2", "pddl==0.4.2", "kstar-planne
 api = ["requests", "aiohttp", "tenacity", "tqdm", "tiktoken"]
 audiolm_qwen = ["librosa", "soundfile"]
 deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
-dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy", "unitxt==1.22.0", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
+dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
 gptq = ["auto-gptq[triton]>=0.6.0"]
 gptqmodel = ["gptqmodel>=1.0.9"]
 hf_transfer = ["hf_transfer"]
@@ -82,38 +82,18 @@ sentencepiece = ["sentencepiece>=0.1.98"]
 sparseml = ["sparseml-nightly[llm]>=1.8.0.20240404"]
 sparsify = ["sparsify"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
+unitxt = ["unitxt==1.22.0"]
 vllm = ["vllm>=0.4.2"]
 wandb = ["wandb>=0.16.3", "pandas", "numpy"]
 zeno = ["pandas", "zeno-client"]
-all = [
+tasks = [
     "lm_eval[acpbench]",
-    "lm_eval[api]",
-    "lm_eval[audiolm_qwen]",
-    "lm_eval[deepsparse]",
-    "lm_eval[dev]",
-    "lm_eval[gptq]",
-    "lm_eval[gptqmodel]",
-    "lm_eval[hf_transfer]",
-    "lm_eval[ibm_watsonx_ai]",
     "lm_eval[ifeval]",
-    "lm_eval[ipex]",
     "lm_eval[japanese_leaderboard]",
     "lm_eval[longbench]",
-    "lm_eval[mamba]",
     "lm_eval[math]",
     "lm_eval[multilingual]",
-    "lm_eval[neuronx]",
-    "lm_eval[optimum]",
-    "lm_eval[promptsource]",
     "lm_eval[ruler]",
-    "lm_eval[sae_lens]",
-    "lm_eval[sentencepiece]",
-    "lm_eval[sparseml]",
-    "lm_eval[sparsify]",
-    "lm_eval[testing]",
-    "lm_eval[vllm]",
-    "lm_eval[wandb]",
-    "lm_eval[zeno]",
 ]

 [tool.pymarkdown]