Commit bc4b922c authored by Baber

Merge branch 'main' into llama

# Conflicts:
#	lm_eval/tasks/llama3/README.md
parents 748eb47e b2c090cc
......@@ -22,10 +22,10 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Set up Python 3.8
- name: Set up Python 3.9
uses: actions/setup-python@v5
with:
python-version: 3.8
python-version: 3.9
cache: pip
cache-dependency-path: pyproject.toml
- name: Pre-Commit
......@@ -42,7 +42,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
python-version: ["3.9", "3.10", "3.11", "3.12" ]
timeout-minutes: 30
steps:
- name: Checkout Code
......@@ -75,15 +75,16 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Set up Python 3.8
- name: Set up Python 3.9
uses: actions/setup-python@v5
with:
python-version: 3.8
python-version: 3.9
cache: pip
cache-dependency-path: pyproject.toml
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu
pip install -U transformers peft
- name: Test with pytest
run: python -m pytest tests/models --showlocals -s -vv
......@@ -29,7 +29,7 @@ repos:
- id: mixed-line-ending
args: [--fix=lf]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.7.4
rev: v0.9.2
hooks:
# Run the linter.
- id: ruff
......
......@@ -270,6 +270,7 @@ Note that for externally hosted models, configs such as `--device` which relate
| vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Huggingface Optimum (Causal LMs) | ✔️ | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
| Huggingface Optimum-intel IPEX (Causal LMs) | ✔️ | `ipex` | Any decoder-only AutoModelForCausalLM | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
| Neuron via AWS Inf2 (Causal LMs) | ✔️ | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
| [Neural Magic DeepSparse](https://github.com/neuralmagic/deepsparse) | ✔️ | `deepsparse` | Any LM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub with the "deepsparse" tag](https://huggingface.co/models?other=deepsparse) | `generate_until`, `loglikelihood` | ... |
| [Neural Magic SparseML](https://github.com/neuralmagic/sparseml) | ✔️ | `sparseml` | Any decoder-only AutoModelForCausalLM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub](https://huggingface.co/neuralmagic). Especially useful for models with quantization like [`zoo:llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized`](https://sparsezoo.neuralmagic.com/models/llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
......@@ -492,6 +493,7 @@ Extras dependencies can be installed via `pip install -e ".[NAME]"`
| hf_transfer | For speeding up HF Hub file downloads |
| ifeval | For running the IFEval task |
| ibm_watsonx_ai | For using IBM watsonx.ai model apis |
| ipex | For running on optimum-intel ipex backend |
| neuronx | For running on AWS inf2 instances |
| mamba | For loading Mamba SSM models |
| math | For running math task answer checking |
......
......@@ -58,7 +58,7 @@ This mode supports a number of command-line arguments, the details of which can
* `--seed`: Set the seed for Python's `random`, NumPy, and PyTorch. Accepts a comma-separated list of three values for the `random`, NumPy, and PyTorch seeds, respectively, or a single integer to set the same seed for all three. Each value is either an integer or `None` to leave that seed unset. Default is `0,1234,1234` (for backward compatibility). E.g., `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`; NumPy's seed is not set because the second value is `None`. E.g., `--seed 42` sets all three seeds to 42 (see the sketch after this hunk).
* `--wandb_args`: Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., ```--wandb_args project=test-project,name=test-run```
* `--wandb_args`: Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., ```--wandb_args project=test-project,name=test-run```. Also allows for the passing of the step to log things at (passed to `wandb.run.log`), e.g., `--wandb_args step=123`.
* `--hf_hub_log_args` : Logs evaluation results to Hugging Face Hub. Accepts a string with the arguments separated by commas. Available arguments:
* `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`. If not provided, the results will be pushed to the owner of the Hugging Face token,
......
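A minimal sketch of what the `--seed` values above control, following the `--seed 0,None,8` example (illustrative only; the harness applies these seeds internally):

```python
import random

import torch

# Roughly what `--seed 0,None,8` amounts to: Python's `random` is seeded with 0,
# NumPy is left unseeded because the second value is None, and torch gets 8.
random.seed(0)
# numpy.random.seed(...) is skipped here since the second value is None
torch.manual_seed(8)
```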
......@@ -190,7 +190,8 @@ doc_to_target: "{{answer}}"
```
**Important**: we now add `target_delimiter` between input and target which defaults to " ", such that the full input-output string is `doc_to_target(doc) + target_delimiter + doc_to_text(doc)`. `doc_to_text` and `doc_to_target` should not contain trailing right or left whitespace, respectively.
> [!WARNING]
> We add `target_delimiter` between input and target which defaults to " ", such that the full input-output string is `doc_to_text(doc) + target_delimiter + doc_to_target(doc)`. `doc_to_text` and `doc_to_target` should not contain trailing right or left whitespace, respectively. For multiple choice the target will be each choice index concatenated with the delimiter.
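As a small illustration of the assembly described in the warning above (the strings below are made up; real values come from the task's `doc_to_text`/`doc_to_target`):

```python
# Illustrative only: how a single labeled example is assembled.
doc_to_text = "Question: Which is the largest planet?\nAnswer:"
target_delimiter = " "  # the default
doc_to_target = "Jupiter"

full_example = doc_to_text + target_delimiter + doc_to_target
# -> "Question: Which is the largest planet?\nAnswer: Jupiter"
```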
#### Multiple choice format
......@@ -206,7 +207,7 @@ doc_to_choice: "{{[distractor1, distractor2, distractor3, correct_answer]}}"
```
Task implementers are thus able to decide what the answer choices should be for a document, and what prompt format to use.
The label index can also be sourced from a feature directly. For example in `superglue/boolq`, the label index if defined in the feature `label`. We can set `doc_to_target` as simply `label`. The options or verbalizers can be written in a the form of a list `["no", "yes"]` that will correspond to the label index.
The label index can also be sourced from a feature directly. For example in `superglue/boolq`, the label index is defined in the feature `label`. We can set `doc_to_target` to simply `label`. The options or verbalizers can be written in the form of a list `["no", "yes"]` that will correspond to the label index.
```yaml
doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
......
......@@ -37,6 +37,7 @@ Prompting / in-context formatting options:
- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
- **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples.
- **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.
- **assistant_prefill** (`str`, *optional*) — String to append after the `<|assistant|>` token. For example, if the task asks the model to answer a question, `assistant_prefill` could be `"The answer is: "` to steer the model into beginning its answer with that text (see the sketch below). If a chat template is not used, this string is appended to the end of the prompt.
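A hedged sketch of how a prefilled assistant turn ends up in the chat history (message contents are made up; the key point is that the final turn is a partial assistant message, so no generation prompt is appended and the model continues from the prefill):

```python
chat_history = [
    {"role": "user", "content": "Question: What is 2 + 2?\nAnswer:"},
    # the prefill becomes a partial assistant turn; the model continues from here
    {"role": "assistant", "content": "The answer is: "},
]
```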
Runtime configuration options:
- **num_fewshot** (`int`, *optional*, defaults to 0) — Number of few-shot examples before the input.
......
......@@ -257,6 +257,11 @@ def setup_parser() -> argparse.ArgumentParser:
action="store_true",
help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
)
parser.add_argument(
"--confirm_run_unsafe_code",
action="store_true",
help="Confirm that you understand the risks of running unsafe code for tasks that require it",
)
return parser
......@@ -404,6 +409,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
numpy_random_seed=args.seed[1],
torch_random_seed=args.seed[2],
fewshot_random_seed=args.seed[3],
confirm_run_unsafe_code=args.confirm_run_unsafe_code,
**request_caching_args,
)
......
......@@ -112,6 +112,4 @@ class ConfigurableGroup(abc.ABC):
return self._config.group
def __repr__(self):
return (
f"ConfigurableGroup(group={self.group}," f"group_alias={self.group_alias})"
)
return f"ConfigurableGroup(group={self.group},group_alias={self.group_alias})"
......@@ -527,9 +527,9 @@ def pooled_sample_stderr(stderrs: List[float], sizes: List[int]):
def combined_sample_stderr(stderrs: List[float], sizes: List[int], metrics=None):
assert (
metrics is not None
), "Need to pass a list of each subtask's metric for this stderr aggregation"
assert metrics is not None, (
"Need to pass a list of each subtask's metric for this stderr aggregation"
)
assert len(stderrs) == len(sizes) and len(sizes) == len(metrics)
# See https://github.com/EleutherAI/lm-evaluation-harness/pull/1390 for more documentation.
......
......@@ -113,13 +113,17 @@ class LM(abc.ABC):
"""
pass
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
def apply_chat_template(
self, chat_history: List[Dict[str, str]], add_generation_prompt=True
) -> str:
"""
Defines how to transform few-shot examples provided as chat history into a format that can be used as input to the LM.
:param chat_history: list[dict[str, str]]
A list of dictionaries with keys 'role' and 'content'.
Values are strings representing the role name and the content of the message, respectively.
:param add_generation_prompt: bool
Whether to append an assistant generation prefix (e.g. `<|assistant|>`) to the end of the chat history. Set to False when prefilling an assistant message.
:return: str
A string representing the chat history in a format that can be used as input to the LM.
"""
......
......@@ -17,13 +17,13 @@ def register_model(*names):
def decorate(cls):
for name in names:
assert issubclass(
cls, LM
), f"Model '{name}' ({cls.__name__}) must extend LM class"
assert issubclass(cls, LM), (
f"Model '{name}' ({cls.__name__}) must extend LM class"
)
assert (
name not in MODEL_REGISTRY
), f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead."
assert name not in MODEL_REGISTRY, (
f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead."
)
MODEL_REGISTRY[name] = cls
return cls
......@@ -48,9 +48,9 @@ func2task_index = {}
def register_task(name):
def decorate(fn):
assert (
name not in TASK_REGISTRY
), f"task named '{name}' conflicts with existing registered task!"
assert name not in TASK_REGISTRY, (
f"task named '{name}' conflicts with existing registered task!"
)
TASK_REGISTRY[name] = fn
ALL_TASKS.add(name)
......@@ -104,9 +104,9 @@ def register_metric(**args):
]:
if key in args:
value = args[key]
assert (
value not in registry
), f"{key} named '{value}' conflicts with existing registered {key}!"
assert value not in registry, (
f"{key} named '{value}' conflicts with existing registered {key}!"
)
if key == "metric":
registry[name] = fn
......@@ -140,9 +140,9 @@ def get_metric(name: str, hf_evaluate_metric=False) -> Callable:
def register_aggregation(name: str):
def decorate(fn):
assert (
name not in AGGREGATION_REGISTRY
), f"aggregation named '{name}' conflicts with existing registered aggregation!"
assert name not in AGGREGATION_REGISTRY, (
f"aggregation named '{name}' conflicts with existing registered aggregation!"
)
AGGREGATION_REGISTRY[name] = fn
return fn
......
from functools import partial
from typing import TYPE_CHECKING, Iterable, Optional, Union
import datasets
if TYPE_CHECKING:
from random import Random
from lm_eval.api.task import ConfigurableTask, Task
class ContextSampler:
def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
def __init__(
self,
docs: list[dict],
task: Union["Task", "ConfigurableTask"],
fewshot_indices: Optional[Iterable] = None,
rnd: Optional["Random"] = None,
) -> None:
self.rnd = rnd
if not self.rnd:
raise ValueError(
......@@ -58,8 +71,9 @@ class ContextSampler:
)
self.docs = self.docs.select(fewshot_indices)
def get_context(self, doc, num_fewshot):
def get_context(self, doc: dict, num_fewshot: int, gen_prefix: str = None):
# draw an extra fewshot sample if using same split as evaluating on
prefix = gen_prefix + " " if gen_prefix else ""
n_samples = (
num_fewshot + 1
if self.config.fewshot_split == self.config.test_split
......@@ -77,14 +91,14 @@ class ContextSampler:
for doc in selected_docs:
doc_content = self.doc_to_text(doc)
doc_target = self.doc_to_target(doc)
labeled_examples += (
doc_content
if self.config.doc_to_choice is None or isinstance(doc_content, str)
else self.doc_to_choice(doc)[doc_content]
)
if self.config.doc_to_choice is None or isinstance(doc_content, str):
labeled_examples += doc_content
else:
labeled_examples += self.doc_to_choice(doc)[doc_content]
if doc_target != "":
labeled_examples += self.target_delimiter
labeled_examples += prefix
labeled_examples += (
str(doc_target[0])
if isinstance(doc_target, list)
......@@ -98,10 +112,13 @@ class ContextSampler:
def get_chat_context(
self,
doc,
num_fewshot,
doc: dict,
num_fewshot: int,
fewshot_as_multiturn: bool = False,
gen_prefix: Optional[str] = None,
):
# TODO: Do we need any other delimiter
prefix = gen_prefix + " " if gen_prefix else ""
chat_history = []
# draw an extra fewshot sample if using same split as evaluating on
n_samples = (
......@@ -132,23 +149,28 @@ class ContextSampler:
chat_history.append(
{
"role": "assistant",
"content": str(doc_target[0])
"content": prefix + str(doc_target[0])
if isinstance(doc_target, list)
else doc_target
else prefix + doc_target
if self.config.doc_to_choice is None
or isinstance(doc_target, str)
else str(self.doc_to_choice(doc)[doc_target]),
else prefix + str(self.doc_to_choice(doc)[doc_target]),
}
)
else:
# get fewshot context as one user turn
chat_history.append(
{"role": "user", "content": self.get_context(doc, num_fewshot)}
{
"role": "user",
"content": self.get_context(
doc, num_fewshot, gen_prefix=gen_prefix
),
}
)
return chat_history
def sample(self, n):
def sample(self, n: int):
"""
Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
"""
......@@ -157,19 +179,19 @@ class ContextSampler:
class FirstNSampler(ContextSampler):
def sample(self, n) -> None:
def sample(self, n: int) -> None:
"""
Draw the first `n` samples in order from the specified split.
Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU.
"""
assert (
n <= len(self.docs)
), f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available."
assert n <= len(self.docs), (
f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available."
)
return self.docs[:n]
class BalancedSampler(ContextSampler):
def sample(self, n) -> None:
def sample(self, n: int) -> None:
"""
TODO: this should return approximately class-balanced samples from our fewshot examples.
TODO: what order should they be in? maybe random?
......@@ -179,7 +201,7 @@ class BalancedSampler(ContextSampler):
class ManualSampler(ContextSampler):
def sample(self, n) -> None:
def sample(self, n: int) -> None:
""" """
pass
......@@ -190,7 +212,7 @@ SAMPLER_REGISTRY = {
}
def get_sampler(name):
def get_sampler(name: str):
try:
return SAMPLER_REGISTRY[name]
except KeyError:
......
......@@ -75,6 +75,7 @@ class TaskConfig(dict):
doc_to_text: Optional[Union[Callable, str]] = None
doc_to_target: Optional[Union[Callable, str]] = None
doc_to_image: Union[Callable, str] = None
unsafe_code: bool = False
doc_to_choice: Optional[Union[Callable, str, dict, list]] = None
process_results: Optional[Union[Callable, str]] = None
use_prompt: Optional[str] = None
......@@ -92,6 +93,7 @@ class TaskConfig(dict):
filter_list: Optional[Union[str, list]] = None
should_decontaminate: bool = False
doc_to_decontamination_query: Optional[str] = None
gen_prefix: Optional[str] = None
metadata: Optional[dict] = (
None # by default, not used in the code. allows for users to pass arbitrary info to tasks
)
......@@ -369,6 +371,9 @@ class Task(abc.ABC):
def doc_to_image(self, doc):
raise NotImplementedError
def doc_to_prefix(self, doc):
return ""
def build_all_requests(
self,
*,
......@@ -398,7 +403,7 @@ class Task(abc.ABC):
)
cache_key += f"-tokenizer{tokenizer_name}"
cached_instances = load_from_cache(file_name=cache_key)
cached_instances = load_from_cache(file_name=cache_key, cache=cache_requests)
if cache_requests and cached_instances and not rewrite_requests_cache:
cached_instances = cached_instances[:limit]
......@@ -442,6 +447,7 @@ class Task(abc.ABC):
apply_chat_template,
fewshot_as_multiturn,
chat_template,
gen_prefix=self.doc_to_prefix(doc),
)
# TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
......@@ -541,13 +547,7 @@ class Task(abc.ABC):
return len(re.split(r"\s+", doc))
@utils.positional_deprecated
def fewshot_context(
self,
doc,
num_fewshot,
rnd=None,
description=None,
):
def fewshot_context(self, doc, num_fewshot, rnd=None, description=None, **kwargs):
"""Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example.
......@@ -732,6 +732,9 @@ class ConfigurableTask(Task):
# mark the task as requiring multimodality.
self.MULTIMODAL = True
if self.config.unsafe_code is not False:
self.UNSAFE_CODE = True
if self.config.dataset_path is not None:
self.DATASET_PATH = self.config.dataset_path
......@@ -1000,6 +1003,7 @@ class ConfigurableTask(Task):
labeled_examples: List[Dict[str, str]],
question: str,
fewshot_as_multiturn: bool = False,
gen_prefix: Optional[str] = None,
) -> None:
"""Adds a target question to the labeled examples list.
If fewshot_as_multiturn is True, or labeled_examples is empty, or the last entry is a system turn, appends the question as a new user entry.
......@@ -1015,17 +1019,20 @@ class ConfigurableTask(Task):
else:
# if fewshot_as_multiturn is True, append as next user entry (last is always assistant)
labeled_examples.append({"role": "user", "content": question})
if gen_prefix:
labeled_examples.append({"role": "assistant", "content": gen_prefix})
@utils.positional_deprecated
def fewshot_context(
self,
doc: str,
doc: dict,
num_fewshot: int,
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
chat_template: Optional[Callable] = None,
) -> str:
gen_prefix: Optional[str] = None,
) -> Union[str, List[str]]:
"""Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example.
......@@ -1044,7 +1051,6 @@ class ConfigurableTask(Task):
:returns: str
The fewshot context.
"""
if apply_chat_template:
labeled_examples = []
else:
......@@ -1072,25 +1078,33 @@ class ConfigurableTask(Task):
labeled_examples.append({"role": "system", "content": system_prompt})
else:
labeled_examples = system_prompt
# if few-shot - append examples after the system prompt
if num_fewshot > 0:
if apply_chat_template:
labeled_examples.extend(
self.sampler.get_chat_context(
doc, num_fewshot, fewshot_as_multiturn
doc,
num_fewshot,
fewshot_as_multiturn,
gen_prefix=gen_prefix,
)
)
else:
labeled_examples += self.sampler.get_context(doc, num_fewshot)
labeled_examples += self.sampler.get_context(
doc, num_fewshot, gen_prefix=gen_prefix
)
example = self.doc_to_text(doc)
if apply_chat_template:
if self.multiple_input:
# TODO: append prefill?
return chat_template(labeled_examples)
if isinstance(example, str):
self.append_target_question(
labeled_examples, example, fewshot_as_multiturn
labeled_examples,
example,
fewshot_as_multiturn,
gen_prefix=gen_prefix,
)
# for loglikelihood create a list of questions with appended choices
elif isinstance(example, list):
......@@ -1098,37 +1112,62 @@ class ConfigurableTask(Task):
# copy chat history for each example and append the answer
for ex in example:
chat = deepcopy(labeled_examples)
self.append_target_question(chat, ex, fewshot_as_multiturn)
labeled_examples_list.append(chat_template(chat))
self.append_target_question(
chat,
ex,
fewshot_as_multiturn,
gen_prefix=gen_prefix,
)
# TODO: append prefill?
labeled_examples_list.append(
chat_template(
chat,
add_generation_prompt=False if gen_prefix else True,
)
)
return labeled_examples_list
# if example is an integer, append the choice or convert to string
elif isinstance(example, int):
if self.config.doc_to_choice is not None:
choices = self.doc_to_choice(doc)
self.append_target_question(
labeled_examples, choices[example], fewshot_as_multiturn
labeled_examples,
choices[example],
fewshot_as_multiturn,
gen_prefix=gen_prefix,
)
else:
self.append_target_question(
labeled_examples, str(example), fewshot_as_multiturn
labeled_examples,
str(example),
fewshot_as_multiturn,
gen_prefix=gen_prefix,
)
# return lm.apply_chat_template(labeled_examples)
return chat_template(labeled_examples)
return chat_template(
labeled_examples,
add_generation_prompt=False if gen_prefix else True,
)
else:
prefix = (
self.config.target_delimiter + gen_prefix
if gen_prefix is not None
else ""
)
if self.multiple_input:
return labeled_examples
if isinstance(example, str):
return labeled_examples + example
return labeled_examples + example + prefix
elif isinstance(example, list):
return [labeled_examples + ex for ex in example]
return [labeled_examples + ex + prefix for ex in example]
elif isinstance(example, int):
if self.config.doc_to_choice is not None:
choices = self.doc_to_choice(doc)
return labeled_examples + choices[example]
return labeled_examples + choices[example] + prefix
else:
return labeled_examples + str(example)
return labeled_examples + str(example) + prefix
def apply_filters(self):
def apply_filters(self) -> Optional[List[Instance]]:
"""Iterates over FilterEnsembles and applies them to instances"""
if hasattr(self, "_filters"):
for f in self._filters:
......@@ -1140,7 +1179,7 @@ class ConfigurableTask(Task):
def should_decontaminate(self):
return self.config.should_decontaminate
def doc_to_decontamination_query(self, doc):
def doc_to_decontamination_query(self, doc: dict):
if self.config.should_decontaminate:
if self.config.doc_to_decontamination_query is None:
return self.doc_to_text(doc)
......@@ -1299,6 +1338,14 @@ class ConfigurableTask(Task):
else:
return None
def doc_to_prefix(self, doc):
if (gen_prefix := self.config.gen_prefix) is not None:
if gen_prefix in self.features:
return doc[gen_prefix]
else:
return utils.apply_template(gen_prefix, doc)
return None
def construct_requests(
self, doc: dict, ctx: str, **kwargs
) -> Union[List[Instance], Instance]:
......@@ -1503,9 +1550,9 @@ class ConfigurableTask(Task):
# we expect multiple_targets to be a list.
elif self.multiple_target:
gold = list(gold)
elif (
type(gold) is not type(result)
and "bypass" not in self._metric_fn_list.keys()
# TODO: handle this better
elif type(gold) is not type(result) and not (
"bypass" in self._metric_fn_list.keys() or isinstance(result, list)
):
# cast gold to the same type as result
gold = type(result)(gold)
......@@ -1561,7 +1608,10 @@ class ConfigurableTask(Task):
result_score = self._metric_fn_list[metric]([gold, result])
if isinstance(result_score, dict):
# TODO: this handles the case where HF evaluate returns a dict.
result_score = result_score[metric]
# This allows for multiple metrics to be returned from the same function
for k, v in result_score.items():
result_dict[k] = v
return result_dict
result_dict[metric] = result_score
else:
raise ValueError(
......
......@@ -21,7 +21,9 @@ HASH_PREFIX = hashlib.sha256(HASH_INPUT.encode("utf-8")).hexdigest()
FILE_SUFFIX = f".{HASH_PREFIX}.pickle"
def load_from_cache(file_name):
def load_from_cache(file_name: str, cache: bool = False):
if not cache:
return
try:
path = f"{PATH}/{file_name}{FILE_SUFFIX}"
......
......@@ -110,12 +110,15 @@ class TextReader:
def read_tqdm(self, update_frequency: int = 10000):
current_file_position = 0
line_counter = 0
with open(self.file_path, "r", encoding="utf-8") as fh, tqdm.tqdm(
total=os.path.getsize(self.file_path),
dynamic_ncols=True,
unit="byte",
unit_scale=1,
) as progress:
with (
open(self.file_path, "r", encoding="utf-8") as fh,
tqdm.tqdm(
total=os.path.getsize(self.file_path),
dynamic_ncols=True,
unit="byte",
unit_scale=1,
) as progress,
):
with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
for line in iter(mmap_obj.readline, b""):
line = line.decode("utf-8")
......
......@@ -151,7 +151,7 @@ def get_train_overlap(docs_by_task_set: dict, ngrams_path: str, limit: int) -> d
elapsed = time.perf_counter() - start
print(f"Read took {elapsed:0.5f} seconds.")
print(f"Speed: {(os.path.getsize(file)/1000000.0)/elapsed}MB/second")
print(f"Speed: {(os.path.getsize(file) / 1000000.0) / elapsed}MB/second")
print(duplicates)
......
......@@ -74,6 +74,7 @@ def simple_evaluate(
numpy_random_seed: int = 1234,
torch_random_seed: int = 1234,
fewshot_random_seed: int = 1234,
confirm_run_unsafe_code: bool = False,
):
"""Instantiate and evaluate a model on a list of tasks.
......@@ -313,6 +314,7 @@ def simple_evaluate(
apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn,
verbosity=verbosity,
confirm_run_unsafe_code=confirm_run_unsafe_code,
)
if lm.rank == 0:
......@@ -372,6 +374,7 @@ def evaluate(
apply_chat_template: Union[bool, str] = False,
fewshot_as_multiturn: bool = False,
verbosity: str = "INFO",
confirm_run_unsafe_code: bool = False,
):
"""Instantiate and evaluate a model on a list of tasks.
......@@ -381,6 +384,10 @@ def evaluate(
Dictionary of tasks. Tasks will be taken to have name type(task).config.task .
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param cache_requests: bool, optional
Speed up evaluation by caching the building of dataset requests.
:param rewrite_requests_cache: bool, optional
Rewrites all the request cache if set to `True`.
:param bootstrap_iters:
Number of iterations for bootstrap statistics, used when calculating stderr. Set to 0 for skipping all stderr calculations.
:param write_out: bool
......@@ -396,6 +403,10 @@ def evaluate(
Defaults to False (no chat template applied).
:param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param verbosity: str
Verbosity level for logging
:param confirm_run_unsafe_code: bool
Whether to confirm running tasks marked as unsafe.
:return
Dictionary of results
"""
......@@ -422,13 +433,19 @@ def evaluate(
):
raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
# validation check: are we running multimodal task <-> non-multimodal model class, or vice-versa.
# validation checks:
# 1. are we running a multimodal task <-> non-multimodal model class, or vice-versa.
# 2. are we running code that is marked as unsafe.
incompatible_tasks = []
for task_output in eval_tasks:
task: Task = task_output.task
if getattr(lm, "MULTIMODAL", False) != getattr(task, "MULTIMODAL", False):
incompatible_tasks.append(task_output.task_name)
elif getattr(task, "UNSAFE_CODE", False) and not confirm_run_unsafe_code:
raise ValueError(
f"Attempted to run task: {task_output.task_name} which is marked as unsafe. Set confirm_run_unsafe_code=True to run this task."
)
if len(incompatible_tasks) > 0:
if not getattr(lm, "MULTIMODAL", False):
raise ValueError(
......@@ -438,7 +455,7 @@ def evaluate(
raise ValueError(
f"Attempted to run tasks: {incompatible_tasks} which are text-only, but used a model type which only currently supports multimodal tasks."
)
# end multimodality validation check
# end validation check
# Cache the limit arg.
limit_arg = limit
......
......@@ -7,6 +7,7 @@ from typing import List, Optional, Tuple, Union
from lm_eval.api.group import ConfigurableGroup
from lm_eval.api.metrics import (
aggregate_subtask_metrics,
mean,
pooled_sample_stderr,
stderr_for_metric,
)
......@@ -99,7 +100,12 @@ class TaskOutput:
def calculate_aggregate_metric(self, bootstrap_iters=100000) -> None:
for (metric, filter_key), items in self.sample_metrics.items():
agg_fn = self.task.aggregation()[metric]
try:
agg_fn = self.task.aggregation()[metric]
except KeyError:
# This is the case when process_results returns an arbitrary metric.
# TODO: Handle this better and allow aggregate functions other than mean.
agg_fn = mean
metric_key = f"{metric},{filter_key}"
self.agg_metrics[metric_key] = agg_fn(items)
self.sample_len = len(items) # TODO: same sample size for each metric?
......
......@@ -4,7 +4,7 @@ from typing import List
from lm_eval.api.filter import FilterEnsemble
from lm_eval.api.registry import get_filter
from . import extraction, selection, transformation
from . import custom, extraction, selection, transformation
def build_filter_ensemble(
......
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("custom")
class CustomFilter(Filter):
"""
Custom filter that applies a custom, user-defined function to the model responses.
"""
def __init__(self, **kwargs) -> None:
self.filter_fn = kwargs.pop("filter_fn")
super().__init__(**kwargs)
def apply(self, resps, docs):
return self.filter_fn(resps, docs)
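A minimal usage sketch for this new `custom` filter (the import path and the response/doc shapes are assumptions inferred from this diff, not documented API):

```python
from lm_eval.filters.custom import CustomFilter  # path assumed from this diff


def strip_responses(resps, docs):
    # resps: one list of model responses per document; docs: the matching documents
    return [[r.strip() for r in doc_resps] for doc_resps in resps]


flt = CustomFilter(filter_fn=strip_responses)
print(flt.apply([["  4 \n"], [" yes"]], docs=[{}, {}]))
# -> [['4'], ['yes']]
```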