Commit 84d02f77 authored by Baber

Merge branch 'main' into feature/eval_from_config

parents 15ce554c fcddf195
[run]
# tasks that aren't wired up.
omit =
lm_eval/tasks/quac.py
lm_eval/tasks/storycloze.py
lm_eval/tasks/cbt.py
lm_eval/tasks/sat.py
lm_eval/tasks/triviaqa.py
lm_eval/tasks/naturalqs.py
lm_eval/models/dummy.py
[report]
exclude_lines =
# Skip any pass lines such as may be used for @abstractmethod
pass
# Have to re-enable the standard pragma
pragma: no cover
# Don't complain about missing debug-only code:
def __repr__
if self\.debug
# Don't complain if tests don't hit defensive assertion code:
raise AssertionError
raise NotImplementedError
return NotImplemented
[flake8]
ignore = E203, E266, E501, W503, F403, F401, C901
max-line-length = 127
max-complexity = 10
select = B,C,E,F,W,T4,B9
@@ -50,12 +50,12 @@ jobs:
        with:
          python-version: 3.9
          cache: 'pip'
-         cache-dependency-path: setup.py
+         cache-dependency-path: pyproject.toml
      - name: Install dependencies
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
          python -m pip install --upgrade pip
-         pip install -e '.[dev,ifeval]' --extra-index-url https://download.pytorch.org/whl/cpu
+         pip install -e '.[dev,ifeval,unitxt]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
......
@@ -53,7 +53,7 @@ jobs:
      # Cache HuggingFace cache directory for CPU tests
      - name: Cache HuggingFace cache (CPU tests)
-       uses: actions/cache@v3
+       uses: actions/cache@v4
        id: cache-hf-cpu
        with:
          path: ~/.cache/huggingface
@@ -64,11 +64,11 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-         pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu
+         pip install -e '.[dev,unitxt]' --extra-index-url https://download.pytorch.org/whl/cpu
          pip install hf_xet
      - name: Test with pytest
-       run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py --ignore=tests/models/test_hf_steered.py
+       run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_openvino.py --ignore=tests/models/test_hf_steered.py
        continue-on-error: true # Continue workflow even if tests fail
      # Save test artifacts
@@ -106,7 +106,7 @@ jobs:
      # - name: Install dependencies
      #   run: |
      #     python -m pip install --upgrade pip
-     #     pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu
+     #     pip install -e '.[dev,optimum,api]' --extra-index-url https://download.pytorch.org/whl/cpu
      #     pip install -U transformers peft accelerate
      #
      # - name: Test with pytest
......
@@ -29,7 +29,7 @@ repos:
      - id: mixed-line-ending
        args: [--fix=lf]
  - repo: https://github.com/astral-sh/ruff-pre-commit
-   rev: v0.11.10
+   rev: v0.12.2
    hooks:
      # Run the linter.
      - id: ruff
@@ -47,7 +47,7 @@ repos:
        )$
        args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
  - repo: https://github.com/jackdewinter/pymarkdown
-   rev: v0.9.29
+   rev: v0.9.30
    hooks:
      - id: pymarkdown
        exclude: ^(lm_eval/tasks/.*|docs/footguns\.md)$
......
@@ -364,7 +364,7 @@ lm_eval --model local-completions --tasks gsm8k --model_args model=facebook/opt-
Note that for externally hosted models, configs such as `--device` which relate to where to place a local model should not be used and do not function. Just like you can use `--model_args` to pass arbitrary arguments to the model constructor for local models, you can use it to pass arbitrary arguments to the model API for hosted models. See the documentation of the hosting service for information on what arguments they support.

| API or Inference Server | Implemented? | `--model <xxx>` name | Models supported: | Request Types: |
|-------------------------|--------------|----------------------|-------------------|----------------|
| OpenAI Completions | :heavy_check_mark: | `openai-completions`, `local-completions` | All OpenAI Completions API models | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| OpenAI ChatCompletions | :heavy_check_mark: | `openai-chat-completions`, `local-chat-completions` | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt) | `generate_until` (no logprobs) |
| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) |
@@ -377,8 +377,6 @@ Note that for externally hosted models, configs such as `--device` which relate
| Huggingface Optimum (Causal LMs) | :heavy_check_mark: | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Huggingface Optimum-intel IPEX (Causal LMs) | :heavy_check_mark: | `ipex` | Any decoder-only AutoModelForCausalLM | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Neuron via AWS Inf2 (Causal LMs) | :heavy_check_mark: | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
-| [Neural Magic DeepSparse](https://github.com/neuralmagic/deepsparse) | :heavy_check_mark: | `deepsparse` | Any LM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub with the "deepsparse" tag](https://huggingface.co/models?other=deepsparse) | `generate_until`, `loglikelihood` |
-| [Neural Magic SparseML](https://github.com/neuralmagic/sparseml) | :heavy_check_mark: | `sparseml` | Any decoder-only AutoModelForCausalLM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub](https://huggingface.co/neuralmagic). Especially useful for models with quantization like [`zoo:llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized`](https://sparsezoo.neuralmagic.com/models/llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| NVIDIA NeMo | :heavy_check_mark: | `nemo_lm` | [All supported models](https://docs.nvidia.com/nemo-framework/user-guide/24.09/nemotoolkit/core/core.html#nemo-models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Watsonx.ai | :heavy_check_mark: | `watsonx_llm` | [Supported Watsonx.ai Engines](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx) | `generate_until` `loglikelihood` |
| [Your local inference server!](docs/API_guide.md) | :heavy_check_mark: | `local-completions` or `local-chat-completions` | Support for OpenAI API-compatible servers, with easy customization for other APIs. | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
@@ -572,9 +570,19 @@ lm_eval \
In the stdout, you will find the link to the W&B run page as well as link to the generated report. You can find an example of this workflow in [examples/visualize-wandb.ipynb](examples/visualize-wandb.ipynb), and an example of how to integrate it beyond the CLI.

-## How to Contribute or Learn More?
+## Contributing

-For more information on the library and how everything fits together, check out all of our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs)! We plan to post a larger roadmap of desired + planned library improvements soon, with more information on how contributors can help.
+Check out our [open issues](https://github.com/EleutherAI/lm-evaluation-harness/issues) and feel free to submit pull requests!
+
+For more information on the library and how everything fits together, see our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs).
+
+To get started with development, first clone the repository and install the dev dependencies:
+
+```bash
+git clone https://github.com/EleutherAI/lm-evaluation-harness
+cd lm-evaluation-harness
+pip install -e ".[dev]"
+```

### Implementing new tasks
@@ -599,37 +607,24 @@ The best way to get support is to open an issue on this repo or join the [Eleuth
Extras dependencies can be installed via `pip install -e ".[NAME]"`

-| Name | Use |
-| -------------------- | ----------------------------------------------------- |
-| api | For using api models (Anthropic, OpenAI API) |
-| audiolm_qwen | For running Qwen2 audio models |
-| deepsparse | For running NM's DeepSparse models |
-| dev | For linting PRs and contributions |
-| gptq | For loading models with AutoGPTQ |
-| gptqmodel | For loading models with GPTQModel |
-| hf_transfer | For speeding up HF Hub file downloads |
-| ibm_watsonx_ai | For using IBM watsonx.ai model apis |
-| ifeval | For running the IFEval task |
-| ipex | For running on optimum-intel ipex backend |
-| japanese_leaderboard | For running Japanese LLM Leaderboard tasks |
-| longbench | For running LongBench tasks |
-| mamba | For loading Mamba SSM models |
-| math | For running math task answer checking |
-| multilingual | For multilingual tokenizers |
-| neuronx | For running on AWS inf2 instances |
-| optimum | For running Intel OpenVINO models |
-| promptsource | For using PromptSource prompts |
-| ruler | For running RULER tasks |
-| sae_lens | For using SAELens to steer models |
-| sentencepiece | For using the sentencepiece tokenizer |
-| sparseml | For using NM's SparseML models |
-| sparsify | For using Sparsify to steer models |
-| testing | For running library test suite |
-| vllm | For loading models with vLLM |
-| wandb | For integration with `Weights and Biases` platform |
-| zeno | For visualizing results with Zeno |
-| -------------------- | ----------------------------------------------------- |
-| all | Loads all extras (not recommended) |
+| NAME | Description | NAME | Description |
+|----------------------|--------------------------------|----------------|---------------------------------------|
+| tasks | All task-specific dependencies | api | API models (Anthropic, OpenAI, local) |
+| acpbench | ACP Bench tasks | audiolm_qwen | Qwen2 audio models |
+| ifeval | IFEval task | | |
+| japanese_leaderboard | Japanese LLM tasks | gptq | AutoGPTQ models |
+| longbench | LongBench tasks | gptqmodel | GPTQModel models |
+| math | Math answer checking | hf_transfer | Speed up HF downloads |
+| multilingual | Multilingual tokenizers | ibm_watsonx_ai | IBM watsonx.ai models |
+| ruler | RULER tasks | ipex | Intel IPEX backend |
+| | | | |
+| dev | Linting & contributions | mamba | Mamba SSM models |
+| promptsource | PromptSource prompts | neuronx | AWS inf2 instances |
+| sentencepiece | Sentencepiece tokenizer | optimum | Intel OpenVINO models |
+| testing | Run test suite | sae_lens | SAELens model steering |
+| unitxt | Run unitxt tasks | | |
+| wandb | Weights & Biases | sparsify | Sparsify model steering |
+| zeno | Result visualization | vllm | vLLM models |
## Cite as
......
@@ -30,7 +30,7 @@ in order to ensure linters and other checks will be run upon committing.
We use [pytest](https://docs.pytest.org/en/latest/) for running unit tests. All library unit tests can be run via:

```bash
-python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py
+python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_openvino.py
```

## Contributor License Agreement
......
@@ -8,6 +8,8 @@ from functools import partial
from lm_eval._cli.subcommand import SubCommand
from lm_eval._cli.utils import (
    _int_or_none_list_arg_type,
+   key_val_to_dict,
+   merge_dicts,
    request_caching_arg_to_dict,
    try_parse_json,
)
@@ -26,13 +28,13 @@ class Run(SubCommand):
            epilog=textwrap.dedent("""
                examples:
                  # Basic evaluation with HuggingFace model
-                 $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag
+                 $ lm-eval run --model hf --model_args pretrained=gpt2 dtype=float32 --tasks hellaswag
                  # Evaluate on multiple tasks with few-shot examples
-                 $ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy,arc_challenge --num_fewshot 5
+                 $ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy arc_challenge --num_fewshot 5
                  # Evaluation with custom generation parameters
-                 $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs "temperature=0.8,top_p=0.95"
+                 $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs temperature=0.8 top_p=0.95
                  # Use configuration file
                  $ lm-eval run --config my_config.yaml --tasks mmlu
@@ -73,9 +75,10 @@ class Run(SubCommand):
            "-t",
            default=None,
            type=str,
-           metavar="TASK1,TASK2",
+           nargs="*",
+           metavar="TASK1 TASK2",
            help=textwrap.dedent("""
-               Comma-separated list of task names or groupings.
+               Space or Comma-separated list of task names or groupings.
                Use 'lm-eval list tasks' to see all available tasks.
            """).strip(),
        )
@@ -83,9 +86,10 @@ class Run(SubCommand):
            "--model_args",
            "-a",
            default=None,
-           type=try_parse_json,
+           nargs="*",
+           type=key_val_to_dict,
            metavar="ARGS",
-           help="Model arguments as 'key=val,key2=val2' or JSON string",
+           help="Model arguments as 'key=val,key2=val2' or `key=val` `key2=val2`",
        )

        # Evaluation Settings
@@ -124,10 +128,13 @@ class Run(SubCommand):
        )
        eval_group.add_argument(
            "--gen_kwargs",
-           type=try_parse_json,
+           type=key_val_to_dict,
            default=None,
+           nargs="*",
            metavar="KWARGS",
-           help="Generation arguments as 'key=val,key2=val2' or JSON string",
+           help=textwrap.dedent(
+               'Generation arguments as `temperature=0,stop=["stop"]` or `key=val` `key2=val2`. Values should be parsable with ast.literal_eval.'
+           ),
        )

        # Data and Output
@@ -250,24 +257,24 @@ class Run(SubCommand):
        )
        logging_group.add_argument(
            "--wandb_args",
-           type=str,
+           type=key_val_to_dict,
            default=argparse.SUPPRESS,
            metavar="ARGS",
-           help="Weights & Biases init arguments (key=val,key2=val2)",
+           help="Weights & Biases init arguments key=val key2=val2",
        )
        logging_group.add_argument(
            "--wandb_config_args",
-           type=str,
+           type=key_val_to_dict,
            default=argparse.SUPPRESS,
            metavar="ARGS",
-           help="Weights & Biases config arguments (key=val,key2=val2)",
+           help="Weights & Biases config arguments key=val key2=val2",
        )
        logging_group.add_argument(
            "--hf_hub_log_args",
-           type=str,
+           type=key_val_to_dict,
            default=argparse.SUPPRESS,
            metavar="ARGS",
-           help="Hugging Face Hub logging arguments (key=val,key2=val2)",
+           help="Hugging Face Hub logging arguments key=val key2=val2",
        )

        # Advanced Options
@@ -313,9 +320,21 @@ class Run(SubCommand):
            ),
        )

-   def _execute(self, args: argparse.Namespace) -> None:
+   @staticmethod
+   def _execute(args: argparse.Namespace) -> None:
        """Runs the evaluation harness with the provided arguments."""
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+       MERGE_ARGS_DICTS = [
+           "model_args",
+           "gen_kwargs",
+           "wandb_args",
+           "wandb_config_args",
+           "hf_hub_log_args",
+       ]
+       for arg_name in MERGE_ARGS_DICTS:
+           if current_value := getattr(args, arg_name, None):
+               setattr(args, arg_name, merge_dicts(*current_value))

        from lm_eval.config.evaluate_config import EvaluatorConfig

        eval_logger = logging.getLogger(__name__)
......
import argparse
+import ast
import json
import logging
-from typing import Optional, Union
+from typing import Any, Optional, Union


def try_parse_json(value: Union[str, dict, None]) -> Union[str, dict, None]:
@@ -81,3 +82,35 @@ def check_argument_types(parser: argparse.ArgumentParser) -> None:
            raise ValueError(f"Argument '{action.dest}' doesn't have a type specified.")
        else:
            continue


def handle_cli_value_string(arg: str) -> Any:
    if arg.lower() == "true":
        return True
    elif arg.lower() == "false":
        return False
    elif arg.isnumeric():
        return int(arg)
    try:
        return float(arg)
    except ValueError:
        try:
            return ast.literal_eval(arg)
        except (ValueError, SyntaxError):
            return arg


def key_val_to_dict(args: str) -> dict:
    """Parse model arguments from a string into a dictionary."""
    return (
        {
            k: handle_cli_value_string(v)
            for k, v in (item.split("=") for item in args.split(","))
        }
        if args
        else {}
    )


def merge_dicts(*dicts):
    return {k: v for d in dicts for k, v in d.items()}
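
As a quick illustration of how these helpers combine (a sketch; it assumes they are exposed from `lm_eval._cli.utils`, matching the import block in the `run` subcommand above):

```python
import argparse

# Assumed import path, mirroring `from lm_eval._cli.utils import ...` above.
from lm_eval._cli.utils import key_val_to_dict, merge_dicts

parser = argparse.ArgumentParser()
# nargs="*" collects space-separated tokens; argparse applies type= to each token.
parser.add_argument("--model_args", nargs="*", type=key_val_to_dict, default=None)

ns = parser.parse_args(
    ["--model_args", "pretrained=gpt2", "dtype=float32,trust_remote_code=True"]
)
# Each token became its own dict; merge_dicts flattens them, as in Run._execute.
print(merge_dicts(*ns.model_args))
# -> {'pretrained': 'gpt2', 'dtype': 'float32', 'trust_remote_code': True}
```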
import logging
import math
+import os
import random
import re
import string
from collections.abc import Iterable
-from typing import List
+from typing import Callable, List, Optional, Sequence, TypeVar

import numpy as np
import sacrebleu
@@ -12,6 +13,8 @@ import sacrebleu
from lm_eval.api.registry import register_aggregation, register_metric

+T = TypeVar("T")
+
eval_logger = logging.getLogger(__name__)
@@ -287,7 +290,7 @@ def pop_stddev(arr):
    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr))


-def sample_stddev(arr):
+def sample_stddev(arr: Sequence[T]) -> float:
    mu = mean(arr)
    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))
@@ -449,11 +452,16 @@ def _sacreformat(refs, preds):
class _bootstrap_internal:
-   def __init__(self, f, n) -> None:
+   """
+   Pool worker: `(i, xs)` → `n` bootstrap replicates
+   of `f(xs)` using an RNG seeded with `i`.
+   """
+
+   def __init__(self, f: Callable[[Sequence[T]], float], n: int) -> None:
        self.f = f
        self.n = n

-   def __call__(self, v):
+   def __call__(self, v: tuple[int, Sequence[T]]) -> list[float]:
        i, xs = v
        rnd = random.Random()
        rnd.seed(i)
@@ -463,36 +471,81 @@ class _bootstrap_internal:
        return res


-def bootstrap_stderr(f, xs, iters):
-   import multiprocessing as mp
-
-   pool = mp.Pool(mp.cpu_count())
-   # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something
-   # equivalent to stderr calculated without Bessel's correction in the stddev.
-   # Unfortunately, I haven't been able to figure out what the right correction is
-   # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but
-   # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator)
-   # Thankfully, shouldn't matter because our samples are pretty big usually anyways
-   res = []
-   chunk_size = min(1000, iters)
-
-   from tqdm import tqdm
-
-   print("bootstrapping for stddev:", f.__name__)
-   for bootstrap in tqdm(
-       pool.imap(
-           _bootstrap_internal(f, chunk_size),
-           [(i, xs) for i in range(iters // chunk_size)],
-       ),
-       total=iters // chunk_size,
-   ):
-       # sample w replacement
-       res.extend(bootstrap)
-   pool.close()
+def _bootstrap_internal_no_mp(
+   f: Callable[[Sequence[T]], float], xs: Sequence[T], iters: int
+) -> list[float]:
+   """
+   Single-process fallback: compute `iters` bootstrap replicates
+   of statistic `f(xs)`, chunked (≤ 1000 draws).
+   """
+   res = []
+   chunk_size = min(1000, iters)
+
+   from tqdm import tqdm
+
+   print(f"bootstrapping for stddev: {f.__name__}")
+
+   # A single loop replaces the multiprocessing pool.
+   for i in tqdm(range(iters // chunk_size)):
+       rnd = random.Random(i)
+       for _ in range(chunk_size):
+           res.append(f(rnd.choices(xs, k=len(xs))))
+
+   return res
+
+
+def bootstrap_stderr(
+   f: Callable[[Sequence[T]], float], xs: Sequence[T], iters: int
+) -> float:
+   """
+   Bootstrap estimate of the standard error of statistic `f(xs)`
+   using up to `iters` resamples, chunked (≤ 1000 draws).
+   Executes in parallel unless the env-var `DISABLE_MULTIPROC` is set.
+   """
+   if not os.getenv("DISABLE_MULTIPROC"):
+       import multiprocessing as mp
+
+       pool = mp.Pool(mp.cpu_count())
+       # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something
+       # equivalent to stderr calculated without Bessel's correction in the stddev.
+       # Unfortunately, I haven't been able to figure out what the right correction is
+       # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but
+       # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator)
+       # Thankfully, shouldn't matter because our samples are pretty big usually anyways
+       res = []
+       chunk_size = min(1000, iters)
+
+       from tqdm import tqdm
+
+       print("bootstrapping for stddev:", f.__name__)
+       for bootstrap in tqdm(
+           pool.imap(
+               _bootstrap_internal(f, chunk_size),
+               [(i, xs) for i in range(iters // chunk_size)],
+           ),
+           total=iters // chunk_size,
+       ):
+           # sample w replacement
+           res.extend(bootstrap)
+       pool.close()
+   else:
+       res = _bootstrap_internal_no_mp(f, xs, iters)

    return sample_stddev(res)


-def stderr_for_metric(metric, bootstrap_iters: int):
+def stderr_for_metric(
+   metric: Callable[[Sequence[T]], float], bootstrap_iters: int
+) -> Optional[Callable[[Sequence[T]], float]]:
+   """
+   Return a function that estimates the standard error of `metric(xs)`.
+
+   * If `bootstrap_iters > 0` and the metric is in the pre-approved
+     bootstrappable list, use `bootstrap_stderr` with that many draws.
+   * If the metric has a closed-form SE (e.g. `mean`, `acc_all`), use it.
+   * Otherwise, return `None`.
+   """
    if bootstrap_iters <= 0:
        # return no function (don't compute stderr) if bootstrap iters = 0
        return None
......
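
A quick way to exercise both paths (a sketch; it assumes `mean`, `bootstrap_stderr`, and `stderr_for_metric` are importable from `lm_eval.api.metrics`, as in this file):

```python
import os

# Force the single-process fallback instead of the multiprocessing pool.
os.environ["DISABLE_MULTIPROC"] = "1"

from lm_eval.api.metrics import bootstrap_stderr, mean, stderr_for_metric

xs = [0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0]
print(bootstrap_stderr(mean, xs, 1000))  # bootstrap estimate of the SE of the mean

# stderr_for_metric returns a closed-form or bootstrap estimator, or None.
se_fn = stderr_for_metric(mean, bootstrap_iters=1000)
if se_fn is not None:
    print(se_fn(xs))
```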
@@ -3,15 +3,19 @@ import hashlib
import json
import logging
import os
-from typing import Dict, List, Optional, Tuple, Type, TypeVar, Union
+from typing import TYPE_CHECKING, Any, Iterable, Optional, Type, TypeVar, Union

-import transformers
-from sqlitedict import SqliteDict
from tqdm import tqdm

from lm_eval import utils

+if TYPE_CHECKING:
+   from sqlitedict import SqliteDict
+
+   from lm_eval.api.instance import Instance
+

eval_logger = logging.getLogger(__name__)

T = TypeVar("T", bound="LM")
@@ -27,10 +31,10 @@ class LM(abc.ABC):
        # set rank and world size to a single process, by default.
        self._rank = 0
        self._world_size = 1
-       self.cache_hook = CacheHook(None)
+       self.cache_hook: "CacheHook" = CacheHook(None)

    @abc.abstractmethod
-   def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
+   def loglikelihood(self, requests) -> list[tuple[float, bool]]:
        """Compute log-likelihood of generating a continuation from a context.
        Downstream tasks should attempt to use loglikelihood instead of other
        LM calls whenever possible.
@@ -55,7 +59,7 @@ class LM(abc.ABC):
        pass

    @abc.abstractmethod
-   def loglikelihood_rolling(self, requests) -> List[float]:
+   def loglikelihood_rolling(self, requests) -> list[float]:
        """Compute full log-likelihood of a string, with no truncation, for perplexity computation
        - We will use the full max context length of the model.
        - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
@@ -97,7 +101,7 @@ class LM(abc.ABC):
    # TODO: Add an optional max length
    @abc.abstractmethod
-   def generate_until(self, requests) -> List[str]:
+   def generate_until(self, requests) -> list[str]:
        """Generate greedily until a stopping sequence

        :param requests: list[Instance]
@@ -114,7 +118,7 @@ class LM(abc.ABC):
        pass

    def apply_chat_template(
-       self, chat_history: List[Dict[str, str]], add_generation_prompt=True
+       self, chat_history: list[dict[str, str]], add_generation_prompt=True
    ) -> str:
        """
        Defines how to transform few-shot examples provided as chat history into a format that can be used as input to the LM.
@@ -165,8 +169,7 @@ class LM(abc.ABC):
        - Instance of the LM class.
        """
-       additional_config = {} if additional_config is None else additional_config
-       additional_config = {
+       additional_config = additional_config or {} | {
            k: v for k, v in additional_config.items() if v is not None
        }
@@ -204,25 +207,25 @@ class LM(abc.ABC):
        return ""

-   def set_cache_hook(self, cache_hook) -> None:
+   def set_cache_hook(self, cache_hook: "CacheHook") -> None:
        self.cache_hook = cache_hook


### SQLite-based caching of LM responses


-def hash_args(attr, args):
+def hash_args(attr: str, args: Iterable[Any]) -> str:
    dat = json.dumps([attr] + list(args))
    return hashlib.sha256(dat.encode("utf-8")).hexdigest()


class CacheHook:
-   def __init__(self, cachinglm) -> None:
+   def __init__(self, cachinglm: Optional["CachingLM"]) -> None:
        if cachinglm is None:
-           self.dbdict = None
+           self.dbdict: Optional["SqliteDict"] = None
            return

        self.dbdict = cachinglm.dbdict

-   def add_partial(self, attr, req, res) -> None:
+   def add_partial(self, attr: str, req: Iterable[Any], res: Any) -> None:
        if self.dbdict is None:
            return
        hsh = hash_args(attr, req)
@@ -230,7 +233,7 @@ class CacheHook:


class CachingLM:
-   def __init__(self, lm, cache_db) -> None:
+   def __init__(self, lm: LM, cache_db: str) -> None:
        """LM wrapper that returns cached results if they exist, and uses the underlying LM if not.

        :param lm: LM
@@ -238,8 +241,10 @@ class CachingLM:
        :param cache_db: str
            Path to cache db
        """
-       self.lm = lm
-       self.cache_db = cache_db
+       from sqlitedict import SqliteDict
+
+       self.lm: LM = lm
+       self.cache_db: str = cache_db
        if os.path.dirname(cache_db):
            os.makedirs(os.path.dirname(cache_db), exist_ok=True)
        self.dbdict = SqliteDict(cache_db, autocommit=True)
@@ -247,13 +252,13 @@ class CachingLM:
        # add hook to lm
        lm.set_cache_hook(self.get_cache_hook())

-   def __getattr__(self, attr: str):
+   def __getattr__(self, attr: str) -> Any:
        lm_attr = getattr(self.lm, attr)
        if attr not in ["loglikelihood", "loglikelihood_rolling", "generate_until"]:
            eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM")
            return lm_attr

-       def fn(requests):
+       def _fn(requests: list["Instance"]) -> list["Instance"]:
            res = []
            remaining_reqs = []
            warned = False
@@ -306,9 +311,9 @@ class CachingLM:

            return res

-       return fn
+       return _fn

-   def get_cache_hook(self):
+   def get_cache_hook(self) -> "CacheHook":
        return CacheHook(self)
@@ -331,19 +336,23 @@ class TemplateLM(LM):
        return self.eot_token_id

    @abc.abstractmethod
-   def tok_encode(self, string: str, **kwargs) -> List[int]:
+   def tok_encode(self, string: str, **kwargs) -> list[int]:
        """
        Tokenize a string using the model's tokenizer and return a list of token IDs.
        """
        pass

    @abc.abstractmethod
-   def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
+   def _loglikelihood_tokens(
+       self, requests: list["Instance"], **kwargs
+   ) -> list[tuple[float, bool]]:
        pass

    def _encode_pair(
        self, context: str, continuation: str
-   ) -> Tuple[List[int], List[int]]:
+   ) -> tuple[list[int], list[int]]:
+       import transformers
+
        n_spaces = len(context) - len(context.rstrip())
        if n_spaces > 0:
            continuation = context[-n_spaces:] + continuation
@@ -364,8 +373,8 @@ class TemplateLM(LM):
        return context_enc, continuation_enc

    def loglikelihood(
-       self, requests, disable_tqdm: bool = False
-   ) -> List[Tuple[float, bool]]:
+       self, requests: list["Instance"], disable_tqdm: bool = False
+   ) -> list[tuple[float, bool]]:
        new_reqs = []
        for context, continuation in [req.args for req in requests]:
            if context == "":
@@ -384,11 +393,11 @@ class TemplateLM(LM):
    @abc.abstractmethod
    def loglikelihood_rolling(
        self, requests, disable_tqdm: bool = False
-   ) -> List[float]:
+   ) -> list[float]:
        pass

    @abc.abstractmethod
-   def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
+   def generate_until(self, requests, disable_tqdm: bool = False) -> list[str]:
        pass

    def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
......
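
For context, a minimal sketch of how `CachingLM` wraps a model (assumes a HuggingFace `gpt2` checkpoint is reachable and `sqlitedict` is installed):

```python
from lm_eval.api.model import CachingLM
from lm_eval.models.huggingface import HFLM

lm = HFLM(pretrained="gpt2", batch_size=8)
cached = CachingLM(lm, "lm_cache/gpt2.db")  # parent directory is created if missing

# Only loglikelihood / loglikelihood_rolling / generate_until are wrapped with the
# cache; any other attribute passes straight through __getattr__ to the inner LM.
print(type(cached.tokenizer))
```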
@@ -63,9 +63,9 @@ class EvaluatorConfig:
    model_args: dict = field(
        default_factory=dict, metadata={"help": "Arguments for model initialization"}
    )
-   tasks: Union[str, list[str]] = field(
+   tasks: list[str] = field(
        default_factory=list,
-       metadata={"help": "Comma-separated list of task names to evaluate"},
+       metadata={"help": "List of task names to evaluate"},
    )

    # Few-shot and batching
@@ -212,9 +212,9 @@ class EvaluatorConfig:
        # Create instance and validate
        instance = cls(**config)
+       instance.configure()
        if used_config:
            print(textwrap.dedent(f"""{instance}"""))
-       instance.configure()
        return instance
......
import itertools
import json
import logging
+import os
import random
import time
from collections import defaultdict
@@ -30,6 +31,7 @@ from lm_eval.tasks import TaskManager, get_task_dict
from lm_eval.utils import (
    get_logger,
    handle_non_serializable,
+   hash_dict_images,
    hash_string,
    positional_deprecated,
    simple_parse_args_string,
@@ -140,7 +142,6 @@ def simple_evaluate(
        Random seed for fewshot sampler random generator. If set to None, the seed of generator will be set to None.
    :param metadata: dict
        Additional metadata to be added to the task manager. Will get passed to the download function of the task.
    return
        Dictionary of results
    """
@@ -153,15 +154,23 @@
            "Either 'limit' or 'samples' must be None, but both are not None."
        )

+   _NEEDS_CHAT_TEMPLATE = ("inst", "chat")
    if (
-       (isinstance(model_args, str) and "inst" in model_args.lower())
+       (
+           isinstance(model_args, str)
+           and any(kw in model_args.lower() for kw in _NEEDS_CHAT_TEMPLATE)
+       )
        or (
            isinstance(model_args, dict)
-           and any("inst" in str(v).lower() for v in model_args.values())
+           and any(
+               any(kw in str(v).lower() for kw in _NEEDS_CHAT_TEMPLATE)
+               for v in model_args.values()
+           )
        )
    ) and not apply_chat_template:
        eval_logger.warning(
-           "Model appears to be an instruct variant but chat template is not applied. Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)."
+           "Model appears to be an instruct or chat variant but chat template is not applied. "
+           "Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)."
        )

    if delete_requests_cache:
@@ -745,6 +754,13 @@ def evaluate(
        },
    }
    if log_samples:
+       # default: hash images
+       samples = (
+           hash_dict_images(samples)
+           if os.environ.get("LMEVAL_HASHMM", "1") != "0"
+           and (hasattr(lm, "MULTIMODAL"))
+           else samples
+       )
        results_dict["samples"] = dict(samples)

    return results_dict
......
@@ -141,7 +141,7 @@ class MultiChoiceRegexFilter(RegexFilter):
    """
    regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
-       - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
+       - step 2 : We parse the choice with regex: r'\s*([A-?])', where ? varies by number of choices.
    group_select: Selects the (group_select)th match from the findall result.
    ignore_case: Ignores the case during step 1 matching
    ignore_punctuation: Remove the punctuation during step 1 matching
......
@@ -10,7 +10,6 @@ from . import (
    ibm_watsonx_ai,
    mamba_lm,
    nemo_lm,
-   neuralmagic,
    neuron_optimum,
    openai_completions,
    optimum_ipex,
......
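# Deleted file (lm_eval/models/neuralmagic.py, matching the `neuralmagic` import removed above): the DeepSparse and SparseML model integrations, whose former contents follow.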
import copy
import logging
from typing import List, Optional, Tuple, Union
import numpy
import transformers
from tqdm import tqdm
import lm_eval.models.utils
from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
eval_logger = logging.getLogger(__name__)
@register_model("sparseml")
class SparseMLLM(HFLM):
"""
SparseML is an open-source model optimization toolkit that enables you to create
inference-optimized sparse models using pruning, quantization, and distillation
algorithms. Models optimized with SparseML can then be exported to the ONNX format and
deployed with DeepSparse for GPU-class performance on CPU hardware.
This class is a wrapper around the HuggingFace LM class to enable SparseML
integration with the lm-evaluation-harness.
"""
def _create_model(
self,
pretrained: str,
revision: Optional[str] = "main",
dtype: Optional[str] = "auto",
trust_remote_code: Optional[bool] = False,
**kwargs,
) -> None:
try:
from sparseml.transformers import SparseAutoModelForCausalLM
except ModuleNotFoundError as exception:
raise type(exception)(
"Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`"
)
model_kwargs = kwargs if kwargs else {}
if "device_map" not in model_kwargs:
# set a device_map to initialize model on the right GPU.
# this is needed because it seems that the default behavior
# for quantized models now seems to be device_map="auto"
# which breaks data-parallel mode.
if hasattr(self, "accelerator"):
model_kwargs.update(
{"device_map": {"": f"cuda:{self.accelerator.local_process_index}"}}
)
else:
model_kwargs.update({"device_map": {"": str(self.device)}})
relevant_kwarg_names = [
"offload_folder",
"device_map",
]
relevant_kwargs = {
k: v for k, v in model_kwargs.items() if k in relevant_kwarg_names
}
# Log the difference between model_kwargs and relevant_kwargs so we can see
# what is being ignored
ignored_kwargs = {}
for k, v in model_kwargs.items():
if k not in relevant_kwargs.keys():
ignored_kwargs[k] = v
eval_logger.warning(
f"The sparseml integration is ignoring the following kwargs that are specified: {ignored_kwargs}"
)
model = SparseAutoModelForCausalLM.from_pretrained(
pretrained,
revision=revision,
torch_dtype=lm_eval.models.utils.get_dtype(dtype),
trust_remote_code=trust_remote_code,
**relevant_kwargs,
)
self._model = model
def _get_config(self, pretrained: str, **kwargs) -> None:
try:
from sparseml.transformers import SparseAutoConfig
except ModuleNotFoundError as exception:
raise type(exception)(
"Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`"
)
self._config = SparseAutoConfig.from_pretrained(
pretrained_model_name_or_path=pretrained, **kwargs
)
def _create_tokenizer(
self,
pretrained: Union[str, transformers.PreTrainedModel],
tokenizer: Optional[
Union[
str,
transformers.PreTrainedTokenizer,
transformers.PreTrainedTokenizerFast,
]
],
**kwargs,
) -> None:
try:
from sparseml.transformers import SparseAutoTokenizer
except ModuleNotFoundError as exception:
raise type(exception)(
"Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`"
)
if tokenizer:
if isinstance(tokenizer, str):
self.tokenizer = SparseAutoTokenizer.from_pretrained(
tokenizer,
**kwargs,
)
else:
assert isinstance(
tokenizer, transformers.PreTrainedTokenizer
) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
self.tokenizer = tokenizer
else:
# Get tokenizer based on 'pretrained'
if isinstance(pretrained, str):
model_name = pretrained
else:
# get the HF hub name via accessor on model
model_name = self.model.name_or_path
self.tokenizer = SparseAutoTokenizer.from_pretrained(
model_name,
**kwargs,
)
return None
@register_model("deepsparse")
class DeepSparseLM(LM):
"""
Wrapper around DeepSparse, a sparsity-aware deep learning
inference runtime for CPUs, to make it compatible with the
lm-evaluation-harness.
"""
_DEFAULT_MAX_LENGTH = 2048
def __init__(
self,
pretrained: str,
tokenizer: Optional[
Union[
str,
transformers.PreTrainedTokenizer,
transformers.PreTrainedTokenizerFast,
]
] = None,
batch_size: Optional[Union[int, str]] = 1,
max_gen_toks: Optional[int] = 256,
max_length: Optional[int] = None,
):
super().__init__()
try:
import deepsparse
except ModuleNotFoundError as exception:
raise type(exception)(
"Package `deepsparse` is not installed. "
"Please install it via `pip install deepsparse[transformers]`"
)
if isinstance(batch_size, str) and not batch_size.isdigit():
eval_logger.warning(
f"batch_size={batch_size} is not valid for deepsparse because it is not an integer. "
"Ignoring and using the default of 1."
)
batch_size = 1
self.batch_size = int(batch_size)
self._max_length = max_length if max_length else self._DEFAULT_MAX_LENGTH
self._max_gen_toks = max_gen_toks
self.batch_sizes = {}
# Initialize new model and tokenizer instances
self.model = deepsparse.TextGeneration(
model_path=pretrained,
sequence_length=self._max_length,
batch_size=batch_size,
)
self.tokenizer = tokenizer if tokenizer else self.model.tokenizer
self.config = self.model.config
def tok_encode(self, string: str) -> List[int]:
return self.tokenizer.encode(string)
def tok_decode(self, tokens: List[int]) -> str:
return self.tokenizer.decode(tokens)
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def prefix_token_id(self):
# it is used as prefix for loglikelihood
if self.tokenizer.bos_token_id is not None:
return self.tokenizer.bos_token_id
return self.tokenizer.eos_token_id
@property
def max_length(self) -> int:
return self._max_length
@property
def max_gen_toks(self) -> int:
return self._max_gen_toks
def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
"""
Copied directly from
https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py
"""
new_reqs = []
for context, continuation in [req.args for req in requests]:
if context == "":
raise NotImplementedError(
"Implementing empty context is not supported yet"
)
context_enc, continuation_enc = self._encode_pair(context, continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def _loglikelihood_tokens(
self,
requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
disable_tqdm: bool = False,
) -> List[Tuple[float, bool]]:
"""
The function to compute the loglikelihood of the continuation
tokens given the context tokens.
This function is an adapted version of the original function from
https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py
"""
res = []
def _collate(x):
"""Defines the key for the sorted method"""
toks = x[1] + x[2]
return -len(toks), tuple(toks)
re_ord = utils.Reorderer(requests, _collate)
for chunk in tqdm(
list(lm_eval.models.utils.chunks(re_ord.get_reordered(), self.batch_size)),
disable=disable_tqdm,
):
batch_inp = []
batch_cache_key = []
batch_continuation_enc = []
# len(chunk) is the batch_size
for cache_key, context_enc, continuation_enc in chunk:
# how this all works (illustrated on a causal decoder-only setup):
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# model \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the
# cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice # noqa: E501
inp = (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1]
batch_inp.append(self.tokenizer.decode(inp))
batch_cache_key.append(cache_key)
batch_continuation_enc.append(continuation_enc)
response = self.model(
prompt=batch_inp,
max_new_tokens=0,
output_scores=True,
include_prompt_logits=True,
)
for resp, continuation_enc, cache_key in zip(
response.generations, batch_continuation_enc, batch_cache_key
):
# (seq_len, vocab_size)
multi_scores = resp.score
from deepsparse.utils.data import numpy_log_softmax
# (seq_len, vocab_size) but with softmax applied
multi_logits = numpy_log_softmax(multi_scores, axis=1)
# toss out the context half of the sequence
# (cont_len, vocab_size)
continuation_multi_logits = multi_logits[-len(continuation_enc) :]
# pick out the logits for the continuation tokens
# (cont_len,)
continuation_logits = continuation_multi_logits[
numpy.arange(len(continuation_enc)), continuation_enc
]
# check if the tokens generated greedily are the same
# as the expected continuation
greedy_tokens = continuation_multi_logits.argmax(axis=1)
max_equal = greedy_tokens.tolist() == continuation_enc
# Answer: (log prob, is-exact-match)
answer = (float(continuation_logits.sum()), bool(max_equal))
res.append(answer)
if cache_key is not None:
# special case: loglikelihood_rolling produces a number of loglikelihood requests
# all with cache key None. instead do add_partial on the per-example level
# in the loglikelihood_rolling() function for those.
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res)
def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
raise NotImplementedError(
"The method not required by any of our current task integrations so far"
)
def generate_until(self, requests: List[Instance]) -> List[str]:
"""
The function to generate a certain number of new tokens
given a context.
This function is an adapted version of the original function from
https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py
"""
if not requests:
return []
res = []
requests = [req.args for req in requests]
def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]
re_ord = utils.Reorderer(requests, _collate)
def sameuntil_chunks(xs, size):
ret = []
lastuntil = xs[0][1]
for x in xs:
if len(ret) >= size or x[1] != lastuntil:
yield ret, lastuntil
ret = []
lastuntil = x[1]
ret.append(x)
if ret:
yield ret, lastuntil
pbar = tqdm(total=len(requests))
for chunk, request_args in tqdm(
list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size))
):
inps = []
# make a deepcopy since we are changing arguments
request_args = copy.deepcopy(request_args)
self._max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks)
for context, _ in chunk:
# add context (prompts) to the list
inps.append(context)
until = request_args.pop("until", ["<|endoftext|>"])
request_args.pop("do_sample", None)
request_args["temperature"] = request_args.get("temperature", 0)
# run inference (generate max_gen_toks tokens)
out = self.model(
sequences=inps,
max_new_tokens=self.max_gen_toks - 1,
stop=until,
**request_args,
)
for resp, (context, args_) in zip(out.generations, chunk):
text = resp.text
until_ = until
# split the text at the first occurrence of any of the until tokens
for term in until_:
if len(term) > 0:
text = text.split(term)[0]
res.append(text)
self.cache_hook.add_partial(
"generate_until", (context, {"until": until_}), text
)
pbar.update(1)
pbar.close()
return re_ord.get_original(res)
def _encode_pair(
self, context: str, continuation: str
) -> Tuple[List[int], List[int]]:
"""
Copied directly from
https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py
"""
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc
@@ -613,3 +613,59 @@ def weighted_f1_score(items):
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore


def convert_pil_to_hash(value):
    from io import BytesIO

    img_bytes = BytesIO()
    value.save(img_bytes, format="PNG")
    return hashlib.sha256(str(img_bytes).encode()).hexdigest()


def convert_bytes_to_hash(value):
    return hashlib.sha256(str(value).encode()).hexdigest()


def hash_dict_images(data_dict):
    """
    Create a deep copy of `data_dict` where all bytes and PIL.Image.Image values
    are replaced by their respective hashes using the provided converter functions.

    Parameters:
        data_dict (dict): The input dictionary with arbitrary nesting of dicts and lists.

    Returns:
        dict: A new dictionary with the same structure as `data_dict`, but with all
        bytes and PIL.Image.Image objects replaced by their hashes.
    """

    def _process_value(value):
        # Bytes -> hash
        from PIL import Image

        if isinstance(value, (bytes, bytearray)):
            return convert_bytes_to_hash(value)
        # PIL Image -> hash
        if isinstance(value, Image.Image):
            return convert_pil_to_hash(value)
        # Nested dictionary -> recurse
        if isinstance(value, dict):
            return {k: _process_value(v) for k, v in value.items()}
        # List or tuple -> recurse, preserving type
        if isinstance(value, list):
            return [_process_value(v) for v in value]
        if isinstance(value, tuple):
            return tuple(_process_value(v) for v in value)
        # Other types remain unchanged
        return value

    # Ensure the top-level is a dict
    if not isinstance(data_dict, dict):
        raise TypeError("Input must be a dictionary")

    return (
        {key: _process_value(val) for key, val in data_dict.items()}
        if importlib.util.find_spec("PIL")
        else data_dict
    )
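
A minimal usage sketch (hedged: it assumes `hash_dict_images` is importable from `lm_eval.utils`, as in the evaluator import block above, and that Pillow is installed):

```python
from PIL import Image

from lm_eval.utils import hash_dict_images

sample = {
    "doc": {"question": "What is shown?", "image": Image.new("RGB", (4, 4))},
    "resps": [b"raw-bytes"],
}
hashed = hash_dict_images(sample)
print(hashed["doc"]["image"])  # hex digest instead of a PIL.Image object
print(hashed["resps"][0])      # hex digest instead of raw bytes
```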