Unverified commit 28001d29, authored by Baber Abbasi and committed by GitHub

remove all; reformat table (#3107)

parent 71d0289d
@@ -50,12 +50,12 @@ jobs:
         with:
           python-version: 3.9
           cache: 'pip'
-          cache-dependency-path: setup.py
+          cache-dependency-path: pyproject.toml
       - name: Install dependencies
         if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
         run: |
           python -m pip install --upgrade pip
-          pip install -e '.[dev,ifeval]' --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install -e '.[dev,ifeval,unitxt]' --extra-index-url https://download.pytorch.org/whl/cpu
           # Install optional git dependencies
           # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
           # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
@@ -53,7 +53,7 @@ jobs:
       # Cache HuggingFace cache directory for CPU tests
       - name: Cache HuggingFace cache (CPU tests)
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         id: cache-hf-cpu
         with:
           path: ~/.cache/huggingface
@@ -64,7 +64,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install -e '.[dev,unitxt]' --extra-index-url https://download.pytorch.org/whl/cpu
           pip install hf_xet
       - name: Test with pytest
@@ -29,7 +29,7 @@ repos:
       - id: mixed-line-ending
         args: [--fix=lf]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.10
+    rev: v0.12.2
     hooks:
       # Run the linter.
       - id: ruff
@@ -47,7 +47,7 @@ repos:
         )$
         args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
   - repo: https://github.com/jackdewinter/pymarkdown
-    rev: v0.9.29
+    rev: v0.9.30
     hooks:
       - id: pymarkdown
         exclude: ^(lm_eval/tasks/.*|docs/footguns\.md)$
@@ -599,37 +599,24 @@ The best way to get support is to open an issue on this repo or join the [Eleuth
 Extras dependencies can be installed via `pip install -e ".[NAME]"`
 
-| Name                 | Use                                                 |
-| -------------------- | --------------------------------------------------- |
-| api                  | For using api models (Anthropic, OpenAI API)        |
-| audiolm_qwen         | For running Qwen2 audio models                      |
-| deepsparse           | For running NM's DeepSparse models                  |
-| dev                  | For linting PRs and contributions                   |
-| gptq                 | For loading models with AutoGPTQ                    |
-| gptqmodel            | For loading models with GPTQModel                   |
-| hf_transfer          | For speeding up HF Hub file downloads               |
-| ibm_watsonx_ai       | For using IBM watsonx.ai model apis                 |
-| ifeval               | For running the IFEval task                         |
-| ipex                 | For running on optimum-intel ipex backend           |
-| japanese_leaderboard | For running Japanese LLM Leaderboard tasks          |
-| longbench            | For running LongBench tasks                         |
-| mamba                | For loading Mamba SSM models                        |
-| math                 | For running math task answer checking               |
-| multilingual         | For multilingual tokenizers                         |
-| neuronx              | For running on AWS inf2 instances                   |
-| optimum              | For running Intel OpenVINO models                   |
-| promptsource         | For using PromptSource prompts                      |
-| ruler                | For running RULER tasks                             |
-| sae_lens             | For using SAELens to steer models                   |
-| sentencepiece        | For using the sentencepiece tokenizer               |
-| sparseml             | For using NM's SparseML models                      |
-| sparsify             | For using Sparsify to steer models                  |
-| testing              | For running library test suite                      |
-| vllm                 | For loading models with vLLM                        |
-| wandb                | For integration with `Weights and Biases` platform  |
-| zeno                 | For visualizing results with Zeno                   |
-| -------------------- | --------------------------------------------------- |
-| all                  | Loads all extras (not recommended)                  |
+| NAME                 | Description                     | NAME           | Description                           |
+|----------------------|---------------------------------|----------------|---------------------------------------|
+| tasks                | All task-specific dependencies  | api            | API models (Anthropic, OpenAI, local) |
+| acpbench             | ACP Bench tasks                 | audiolm_qwen   | Qwen2 audio models                    |
+| ifeval               | IFEval task                     | deepsparse     | DeepSparse models (CPU)               |
+| japanese_leaderboard | Japanese LLM tasks              | gptq           | AutoGPTQ models                       |
+| longbench            | LongBench tasks                 | gptqmodel      | GPTQModel models                      |
+| math                 | Math answer checking            | hf_transfer    | Speed up HF downloads                 |
+| multilingual         | Multilingual tokenizers         | ibm_watsonx_ai | IBM watsonx.ai models                 |
+| ruler                | RULER tasks                     | ipex           | Intel IPEX backend                    |
+|                      |                                 |                |                                       |
+| dev                  | Linting & contributions         | mamba          | Mamba SSM models                      |
+| promptsource         | PromptSource prompts            | neuronx        | AWS inf2 instances                    |
+| sentencepiece        | Sentencepiece tokenizer         | optimum        | Intel OpenVINO models                 |
+| testing              | Run test suite                  | sae_lens       | SAELens model steering                |
+| unitxt               | Run unitxt tasks                | sparseml       | SparseML models (CPU)                 |
+| wandb                | Weights & Biases                | sparsify       | Sparsify model steering               |
+| zeno                 | Result visualization            | vllm           | vLLM models                           |
 
 ## Cite as
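Note that several extras can be combined in a single install, e.g. `pip install -e ".[dev,ifeval,unitxt]"`, which is exactly what the CI workflow change above now uses.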
@@ -3,15 +3,19 @@ import hashlib
 import json
 import logging
 import os
-from typing import Dict, List, Optional, Tuple, Type, TypeVar, Union
+from typing import TYPE_CHECKING, Any, Iterable, Optional, Type, TypeVar, Union
 
-import transformers
-from sqlitedict import SqliteDict
 from tqdm import tqdm
 
 from lm_eval import utils
 
+if TYPE_CHECKING:
+    from sqlitedict import SqliteDict
+
+    from lm_eval.api.instance import Instance
+
 eval_logger = logging.getLogger(__name__)
 
 T = TypeVar("T", bound="LM")
@@ -27,10 +31,10 @@ class LM(abc.ABC):
         # set rank and world size to a single process, by default.
         self._rank = 0
         self._world_size = 1
-        self.cache_hook = CacheHook(None)
+        self.cache_hook: "CacheHook" = CacheHook(None)
 
     @abc.abstractmethod
-    def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
+    def loglikelihood(self, requests) -> list[tuple[float, bool]]:
         """Compute log-likelihood of generating a continuation from a context.
         Downstream tasks should attempt to use loglikelihood instead of other
         LM calls whenever possible.
@@ -55,7 +59,7 @@ class LM(abc.ABC):
         pass
 
     @abc.abstractmethod
-    def loglikelihood_rolling(self, requests) -> List[float]:
+    def loglikelihood_rolling(self, requests) -> list[float]:
         """Compute full log-likelihood of a string, with no truncation, for perplexity computation
         - We will use the full max context length of the model.
         - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
@@ -97,7 +101,7 @@ class LM(abc.ABC):
     # TODO: Add an optional max length
     @abc.abstractmethod
-    def generate_until(self, requests) -> List[str]:
+    def generate_until(self, requests) -> list[str]:
         """Generate greedily until a stopping sequence
 
         :param requests: list[Instance]
@@ -114,7 +118,7 @@ class LM(abc.ABC):
         pass
 
     def apply_chat_template(
-        self, chat_history: List[Dict[str, str]], add_generation_prompt=True
+        self, chat_history: list[dict[str, str]], add_generation_prompt=True
     ) -> str:
         """
         Defines how to transform few-shot examples provided as chat history into a format that can be used as input to the LM.
@@ -165,8 +169,7 @@ class LM(abc.ABC):
             - Instance of the LM class.
         """
-        additional_config = {} if additional_config is None else additional_config
-        additional_config = {
+        additional_config = additional_config or {} | {
             k: v for k, v in additional_config.items() if v is not None
         }
@@ -204,25 +207,25 @@ class LM(abc.ABC):
         return ""
 
-    def set_cache_hook(self, cache_hook) -> None:
+    def set_cache_hook(self, cache_hook: "CacheHook") -> None:
         self.cache_hook = cache_hook
 
 
 ### SQLite-based caching of LM responses
 
-def hash_args(attr, args):
+def hash_args(attr: str, args: Iterable[Any]) -> str:
     dat = json.dumps([attr] + list(args))
     return hashlib.sha256(dat.encode("utf-8")).hexdigest()
 
 
 class CacheHook:
-    def __init__(self, cachinglm) -> None:
+    def __init__(self, cachinglm: Optional["CachingLM"]) -> None:
         if cachinglm is None:
-            self.dbdict = None
+            self.dbdict: Optional["SqliteDict"] = None
             return
 
         self.dbdict = cachinglm.dbdict
 
-    def add_partial(self, attr, req, res) -> None:
+    def add_partial(self, attr: str, req: Iterable[Any], res: Any) -> None:
         if self.dbdict is None:
             return
         hsh = hash_args(attr, req)
@@ -230,7 +233,7 @@ class CacheHook:
 
 
 class CachingLM:
-    def __init__(self, lm, cache_db) -> None:
+    def __init__(self, lm: LM, cache_db: str) -> None:
         """LM wrapper that returns cached results if they exist, and uses the underlying LM if not.
 
         :param lm: LM
@@ -238,8 +241,10 @@ class CachingLM:
         :param cache_db: str
             Path to cache db
         """
-        self.lm = lm
-        self.cache_db = cache_db
+        from sqlitedict import SqliteDict
+
+        self.lm: LM = lm
+        self.cache_db: str = cache_db
         if os.path.dirname(cache_db):
             os.makedirs(os.path.dirname(cache_db), exist_ok=True)
         self.dbdict = SqliteDict(cache_db, autocommit=True)
@@ -247,13 +252,13 @@ class CachingLM:
         # add hook to lm
         lm.set_cache_hook(self.get_cache_hook())
 
-    def __getattr__(self, attr: str):
+    def __getattr__(self, attr: str) -> Any:
         lm_attr = getattr(self.lm, attr)
         if attr not in ["loglikelihood", "loglikelihood_rolling", "generate_until"]:
             eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM")
             return lm_attr
 
-        def fn(requests):
+        def _fn(requests: list["Instance"]) -> list["Instance"]:
             res = []
             remaining_reqs = []
             warned = False
@@ -306,9 +311,9 @@ class CachingLM:
 
             return res
 
-        return fn
+        return _fn
 
-    def get_cache_hook(self):
+    def get_cache_hook(self) -> "CacheHook":
         return CacheHook(self)
@@ -331,19 +336,23 @@ class TemplateLM(LM):
         return self.eot_token_id
 
     @abc.abstractmethod
-    def tok_encode(self, string: str, **kwargs) -> List[int]:
+    def tok_encode(self, string: str, **kwargs) -> list[int]:
         """
         Tokenize a string using the model's tokenizer and return a list of token IDs.
         """
         pass
 
     @abc.abstractmethod
-    def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
+    def _loglikelihood_tokens(
+        self, requests: list["Instance"], **kwargs
+    ) -> list[tuple[float, bool]]:
         pass
 
     def _encode_pair(
         self, context: str, continuation: str
-    ) -> Tuple[List[int], List[int]]:
+    ) -> tuple[list[int], list[int]]:
+        import transformers
+
         n_spaces = len(context) - len(context.rstrip())
         if n_spaces > 0:
             continuation = context[-n_spaces:] + continuation
@@ -364,8 +373,8 @@ class TemplateLM(LM):
         return context_enc, continuation_enc
 
     def loglikelihood(
-        self, requests, disable_tqdm: bool = False
-    ) -> List[Tuple[float, bool]]:
+        self, requests: list["Instance"], disable_tqdm: bool = False
+    ) -> list[tuple[float, bool]]:
         new_reqs = []
         for context, continuation in [req.args for req in requests]:
             if context == "":
@@ -384,11 +393,11 @@ class TemplateLM(LM):
 
     @abc.abstractmethod
     def loglikelihood_rolling(
         self, requests, disable_tqdm: bool = False
-    ) -> List[float]:
+    ) -> list[float]:
         pass
 
     @abc.abstractmethod
-    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
+    def generate_until(self, requests, disable_tqdm: bool = False) -> list[str]:
         pass
 
     def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
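To make the caching path touched above concrete, here is a minimal, self-contained sketch of the same idea (not the library's API surface): a request is keyed by hashing the method name together with its JSON-serialized arguments, and responses live in an on-disk SqliteDict so that repeat requests can be answered from the cache. The database filename and the example request below are hypothetical, for illustration only.

import hashlib
import json

from sqlitedict import SqliteDict


def hash_args(attr, args):
    # Same keying scheme as hash_args in the diff above: method name plus
    # arguments, JSON-encoded and hashed with SHA-256.
    dat = json.dumps([attr] + list(args))
    return hashlib.sha256(dat.encode("utf-8")).hexdigest()


# Hypothetical cache file path, for illustration only.
db = SqliteDict("lm_cache_example.sqlite", autocommit=True)

key = hash_args("generate_until", ["2 + 2 =", {"until": ["\n"]}])
if key not in db:
    db[key] = "4"  # stand-in for the underlying LM's real response
print(db[key])  # an identical later request is served from the cache
db.close()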
@@ -61,7 +61,7 @@ acpbench = ["lark>=1.1.9", "tarski[clingo]==0.8.2", "pddl==0.4.2", "kstar-planne
 api = ["requests", "aiohttp", "tenacity", "tqdm", "tiktoken"]
 audiolm_qwen = ["librosa", "soundfile"]
 deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
-dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy", "unitxt==1.22.0", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
+dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
 gptq = ["auto-gptq[triton]>=0.6.0"]
 gptqmodel = ["gptqmodel>=1.0.9"]
 hf_transfer = ["hf_transfer"]
@@ -82,38 +82,18 @@ sentencepiece = ["sentencepiece>=0.1.98"]
 sparseml = ["sparseml-nightly[llm]>=1.8.0.20240404"]
 sparsify = ["sparsify"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
+unitxt = ["unitxt==1.22.0"]
 vllm = ["vllm>=0.4.2"]
 wandb = ["wandb>=0.16.3", "pandas", "numpy"]
 zeno = ["pandas", "zeno-client"]
 
-all = [
+tasks = [
     "lm_eval[acpbench]",
-    "lm_eval[api]",
-    "lm_eval[audiolm_qwen]",
-    "lm_eval[deepsparse]",
-    "lm_eval[dev]",
-    "lm_eval[gptq]",
-    "lm_eval[gptqmodel]",
-    "lm_eval[hf_transfer]",
-    "lm_eval[ibm_watsonx_ai]",
     "lm_eval[ifeval]",
-    "lm_eval[ipex]",
     "lm_eval[japanese_leaderboard]",
     "lm_eval[longbench]",
-    "lm_eval[mamba]",
     "lm_eval[math]",
     "lm_eval[multilingual]",
-    "lm_eval[neuronx]",
-    "lm_eval[optimum]",
-    "lm_eval[promptsource]",
     "lm_eval[ruler]",
-    "lm_eval[sae_lens]",
-    "lm_eval[sentencepiece]",
-    "lm_eval[sparseml]",
-    "lm_eval[sparsify]",
-    "lm_eval[testing]",
-    "lm_eval[vllm]",
-    "lm_eval[wandb]",
-    "lm_eval[zeno]",
 ]
 [tool.pymarkdown]
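In practice this means the catch-all `all` extra is gone: task-specific dependencies are now grouped under the new `tasks` extra (e.g. `pip install -e ".[tasks]"`), while backend extras such as `vllm`, `mamba`, or `gptq` must still be requested explicitly.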