Commit 173b2bc3 authored by Baber's avatar Baber
Browse files

Merge branch 'main' into humaneval

# Conflicts:
#	lm_eval/api/task.py
parents 74344829 bb098f13
...@@ -3,7 +3,7 @@ import hashlib ...@@ -3,7 +3,7 @@ import hashlib
import json import json
import logging import logging
import os import os
from typing import Dict, List, Optional, Tuple, Type, TypeVar from typing import Dict, List, Optional, Tuple, Type, TypeVar, Union
import transformers import transformers
from sqlitedict import SqliteDict from sqlitedict import SqliteDict
...@@ -55,7 +55,7 @@ class LM(abc.ABC): ...@@ -55,7 +55,7 @@ class LM(abc.ABC):
pass pass
@abc.abstractmethod @abc.abstractmethod
def loglikelihood_rolling(self, requests) -> List[Tuple[float]]: def loglikelihood_rolling(self, requests) -> List[float]:
"""Compute full log-likelihood of a string, with no truncation, for perplexity computation """Compute full log-likelihood of a string, with no truncation, for perplexity computation
- We will use the full max context length of the model. - We will use the full max context length of the model.
- For inputs that exceed the max context length, we divide the tokenized string into chunks of up to - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
...@@ -101,14 +101,13 @@ class LM(abc.ABC): ...@@ -101,14 +101,13 @@ class LM(abc.ABC):
"""Generate greedily until a stopping sequence """Generate greedily until a stopping sequence
:param requests: list[Instance] :param requests: list[Instance]
A list of Instance objects with property `args` which returns a tuple (context, until). A list of Instance objects with property `args` which returns a tuple (context, gen_kwargs).
context: str context: str
Context string Context string
until: [str] gen_kwargs: dict
The string sequences to generate until. These string sequences A dictionary of keyword arguments to pass to the generation function e.g. top_k, until, etc.
may each span across multiple tokens, or may be part of one token.
:return: list[str] :return: list[str]
A list of strings continuation A list of model generated continuations.
continuation: str continuation: str
The generated continuation. The generated continuation.
""" """
...@@ -193,15 +192,13 @@ class LM(abc.ABC): ...@@ -193,15 +192,13 @@ class LM(abc.ABC):
"To use this model with chat templates, please implement the 'tokenizer_name' property." "To use this model with chat templates, please implement the 'tokenizer_name' property."
) )
@property def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
def chat_template(self) -> str: """Returns the chat template structure for user/assistant messages if a template is provided.
"""Must be defined for LM subclasses that implement Chat Templating. This method is intended to be overridden in a subclass to define a specific chat template format.
Should return the structure of the chat template applied to user/assistant messages. For models that do not support chat templates, this method returns None by default.
This is used only to save in the experiment results for reproducibility.
""" """
raise NotImplementedError(
"To use this model with chat templates, please implement the 'chat_template' property." return ""
)
def set_cache_hook(self, cache_hook) -> None: def set_cache_hook(self, cache_hook) -> None:
self.cache_hook = cache_hook self.cache_hook = cache_hook
...@@ -246,9 +243,10 @@ class CachingLM: ...@@ -246,9 +243,10 @@ class CachingLM:
# add hook to lm # add hook to lm
lm.set_cache_hook(self.get_cache_hook()) lm.set_cache_hook(self.get_cache_hook())
def __getattr__(self, attr): def __getattr__(self, attr: str):
lm_attr = getattr(self.lm, attr) lm_attr = getattr(self.lm, attr)
if not callable(lm_attr): if attr not in ["loglikelihood", "loglikelihood_rolling", "generate_until"]:
eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM")
return lm_attr return lm_attr
def fn(requests): def fn(requests):
...@@ -283,8 +281,11 @@ class CachingLM: ...@@ -283,8 +281,11 @@ class CachingLM:
eval_logger.info( eval_logger.info(
f"Cached requests: {len(requests) - len(remaining_reqs)}, Requests remaining: {len(remaining_reqs)}" f"Cached requests: {len(requests) - len(remaining_reqs)}, Requests remaining: {len(remaining_reqs)}"
) )
# actually run the LM on the requests that do not have cached results if remaining_reqs:
rem_res = getattr(self.lm, attr)(remaining_reqs) # actually run the LM on the requests that do not have cached results
rem_res = getattr(self.lm, attr)(remaining_reqs)
else:
rem_res = []
# stick the new ones back into the list and also cache any of the new ones # stick the new ones back into the list and also cache any of the new ones
resptr = 0 resptr = 0
...@@ -313,6 +314,8 @@ class TemplateLM(LM): ...@@ -313,6 +314,8 @@ class TemplateLM(LM):
and boilerplate often included in other LM subclasses. and boilerplate often included in other LM subclasses.
""" """
tokenizer = None
@property @property
@abc.abstractmethod @abc.abstractmethod
def eot_token_id(self): def eot_token_id(self):
...@@ -324,14 +327,19 @@ class TemplateLM(LM): ...@@ -324,14 +327,19 @@ class TemplateLM(LM):
return self.eot_token_id return self.eot_token_id
@abc.abstractmethod @abc.abstractmethod
def tok_encode(self, string: str, **kwargs): def tok_encode(self, string: str, **kwargs) -> List[int]:
"""
Tokenize a string using the model's tokenizer and return a list of token IDs.
"""
pass pass
@abc.abstractmethod @abc.abstractmethod
def _loglikelihood_tokens(self, requests, **kwargs): def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
pass pass
def _encode_pair(self, context, continuation): def _encode_pair(
self, context: str, continuation: str
) -> Tuple[List[int], List[int]]:
n_spaces = len(context) - len(context.rstrip()) n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0: if n_spaces > 0:
continuation = context[-n_spaces:] + continuation continuation = context[-n_spaces:] + continuation
...@@ -372,9 +380,110 @@ class TemplateLM(LM): ...@@ -372,9 +380,110 @@ class TemplateLM(LM):
@abc.abstractmethod @abc.abstractmethod
def loglikelihood_rolling( def loglikelihood_rolling(
self, requests, disable_tqdm: bool = False self, requests, disable_tqdm: bool = False
) -> List[Tuple[float, bool]]: ) -> List[float]:
pass pass
@abc.abstractmethod @abc.abstractmethod
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
pass pass
def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
"""
Set and get the appropriate chat template for the model.
This method sets the tokenizer's chat_template and returns the template string for reproducibility.
The template selection logic is adapted from the Transformers library's `apply_chat_template`
method in the Tokenizer class. The original implementation can be found at:
https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1687
This method ensures that the right template is chosen based on the following:
0. If the model has no 'tokenizer' attribute: assumes that there is only a single possible chat template, handled on the model provider side internally. Returns the empty string.
1. If the model's tokenizer has multiple templates:
a. Use the specified template if it exists in the dictionary.
b. Use the default template from the list if no specific template is provided.
c. Raise an error if no default template exists and no specific template is provided.
2. If the model's tokenizer has a single template or no template:
a. Use the tokenizer's chat template if available.
b. Fall back to the default chat template if no tokenizer chat template exists.
Args:
chat_template (Union[bool, str]): Specifies the chat template to use.
- If False or None, no template is applied.
- If True, the default or only available template is used.
- If a string, the template with the matching name is used.
Returns:
Optional[str]: The selected chat template, or None if no template is applied.
"""
if self.tokenizer is None:
return ""
if chat_template is False or chat_template is None:
eval_logger.warning(
"model.chat_template was called with the chat_template set to False or None. "
"Therefore no chat template will be applied. Make sure this is an intended behavior."
)
return None
# Convert boolean chat_template to None to ensure compatibility with the adapted logic
if isinstance(chat_template, bool):
chat_template = None
using_default_template = False
# First, handle the cases when the model has a dict of multiple templates
try:
template = (
self.tokenizer.chat_template or self.tokenizer.default_chat_template
)
except AttributeError:
return None
if isinstance(template, dict):
using_default_dict = self.tokenizer.chat_template is None
if chat_template is not None:
if chat_template in template:
selected_template = template[chat_template]
if using_default_dict:
using_default_template = True
else:
raise ValueError(
f"The specified chat template '{chat_template}' is not available. "
f"Available template names are {sorted(template.keys())}."
)
else:
# If user didn't pass a chat template, use the default template from the dict
if "default" in template:
selected_template = template["default"]
using_default_template = True
else:
raise ValueError(
"This model has multiple chat templates with no default specified! Please either pass a chat "
"template or the name of the template you wish to use to the `chat_template` argument. Available "
f"template names are {sorted(template.keys())}."
)
# Cases when the model has a single template or no template
else:
# priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template
if isinstance(chat_template, str):
eval_logger.warning(
"Chat template name provided, but the tokenizer's chat template is not a dictionary. "
"Using the tokenizer's chat template or the default template instead."
)
if self.tokenizer.chat_template is not None:
selected_template = self.tokenizer.chat_template
else:
selected_template = self.tokenizer.default_chat_template
using_default_template = True
if using_default_template:
eval_logger.warning(
"No chat template is set for this tokenizer, falling back to a default class-level template. This is "
"very error-prone, because models are often trained with templates different from the class default! "
"Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
"point any code depending on them will stop working. We recommend setting a valid chat template before "
"then to ensure that this model continues working without issues."
)
return selected_template
import logging import logging
from typing import Callable, Dict from typing import Callable, Dict, Union
import evaluate as hf_evaluate import evaluate as hf_evaluate
...@@ -185,8 +185,12 @@ def register_filter(name): ...@@ -185,8 +185,12 @@ def register_filter(name):
return decorate return decorate
def get_filter(filter_name: str) -> type: def get_filter(filter_name: Union[str, Callable]) -> Callable:
try: try:
return FILTER_REGISTRY[filter_name] return FILTER_REGISTRY[filter_name]
except KeyError: except KeyError as e:
eval_logger.warning(f"filter `{filter_name}` is not registered!") if callable(filter_name):
return filter_name
else:
eval_logger.warning(f"filter `{filter_name}` is not registered!")
raise e
from functools import partial
import datasets import datasets
...@@ -15,9 +17,38 @@ class ContextSampler: ...@@ -15,9 +17,38 @@ class ContextSampler:
self.target_delimiter = self.config.target_delimiter self.target_delimiter = self.config.target_delimiter
self.fewshot_delimiter = self.config.fewshot_delimiter self.fewshot_delimiter = self.config.fewshot_delimiter
self.doc_to_text = self.task.doc_to_text if (
self.doc_to_target = self.task.doc_to_target self.config.fewshot_config is not None
self.doc_to_choice = self.task.doc_to_choice and self.config.fewshot_config.get("doc_to_text", None) is not None
):
self.doc_to_text = partial(
self.task.doc_to_text,
doc_to_text=self.config.fewshot_config.get("doc_to_text", None),
)
else:
self.doc_to_text = self.task.doc_to_text
if (
self.config.fewshot_config is not None
and self.config.fewshot_config.get("doc_to_target", None) is not None
):
self.doc_to_target = partial(
self.task.doc_to_target,
doc_to_target=self.config.fewshot_config.get("doc_to_target", None),
)
else:
self.doc_to_target = self.task.doc_to_target
if (
self.config.fewshot_config is not None
and self.config.fewshot_config.get("doc_to_choice", None) is not None
):
self.doc_to_choice = partial(
self.task.doc_to_choice,
doc_to_choice=self.config.fewshot_config.get("doc_to_choice", None),
)
else:
self.doc_to_choice = self.task.doc_to_choice
self.docs = docs # HF dataset split, provided by task._fewshot_docs() self.docs = docs # HF dataset split, provided by task._fewshot_docs()
if fewshot_indices: # subset few-shot docs from if fewshot_indices: # subset few-shot docs from
...@@ -51,15 +82,17 @@ class ContextSampler: ...@@ -51,15 +82,17 @@ class ContextSampler:
if self.config.doc_to_choice is None or isinstance(doc_content, str) if self.config.doc_to_choice is None or isinstance(doc_content, str)
else self.doc_to_choice(doc)[doc_content] else self.doc_to_choice(doc)[doc_content]
) )
labeled_examples += self.target_delimiter
labeled_examples += ( if doc_target != "":
str(doc_target[0]) labeled_examples += self.target_delimiter
if isinstance(doc_target, list) labeled_examples += (
else doc_target str(doc_target[0])
if self.config.doc_to_choice is None or isinstance(doc_target, str) if isinstance(doc_target, list)
else str(self.doc_to_choice(doc)[doc_target]) else doc_target
) if self.config.doc_to_choice is None or isinstance(doc_target, str)
labeled_examples += self.fewshot_delimiter else str(self.doc_to_choice(doc)[doc_target])
)
labeled_examples += self.fewshot_delimiter
return labeled_examples return labeled_examples
......
...@@ -56,8 +56,7 @@ class TaskConfig(dict): ...@@ -56,8 +56,7 @@ class TaskConfig(dict):
# task naming/registry # task naming/registry
task: Optional[str] = None task: Optional[str] = None
task_alias: Optional[str] = None task_alias: Optional[str] = None
group: Optional[Union[str, list]] = None tag: Optional[Union[str, list]] = None
group_alias: Optional[Union[str, list]] = None
# HF dataset options. # HF dataset options.
# which dataset to use, # which dataset to use,
# and what splits for what purpose # and what splits for what purpose
...@@ -68,13 +67,14 @@ class TaskConfig(dict): ...@@ -68,13 +67,14 @@ class TaskConfig(dict):
validation_split: Optional[str] = None validation_split: Optional[str] = None
test_split: Optional[str] = None test_split: Optional[str] = None
fewshot_split: Optional[str] = ( fewshot_split: Optional[str] = (
None # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?) None # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaluating (?)
) )
# formatting / prompting options. # formatting / prompting options.
# see docs/advanced_task_guide.md for more info # see docs/advanced_task_guide.md for more info
process_docs: Optional[Callable] = None process_docs: Optional[Callable] = None
doc_to_text: Optional[Union[Callable, str]] = None doc_to_text: Optional[Union[Callable, str]] = None
doc_to_target: Optional[Union[Callable, str]] = None doc_to_target: Optional[Union[Callable, str]] = None
doc_to_image: Union[Callable, str] = None
doc_to_choice: Optional[Union[Callable, str, dict, list]] = None doc_to_choice: Optional[Union[Callable, str, dict, list]] = None
process_results: Optional[Union[Callable, str]] = None process_results: Optional[Union[Callable, str]] = None
use_prompt: Optional[str] = None use_prompt: Optional[str] = None
...@@ -365,18 +365,23 @@ class Task(abc.ABC): ...@@ -365,18 +365,23 @@ class Task(abc.ABC):
def doc_to_target(self, doc): def doc_to_target(self, doc):
pass pass
# not an abstractmethod because not every language-only task has to implement this
def doc_to_image(self, doc):
raise NotImplementedError
def build_all_requests( def build_all_requests(
self, self,
*, *,
limit=None, limit: Union[int, None] = None,
rank=None, rank: int = 0,
world_size=None, world_size: int = 1,
cache_requests=False, cache_requests: bool = False,
rewrite_requests_cache=False, rewrite_requests_cache: bool = False,
system_instruction=None, system_instruction: Optional[str] = None,
apply_chat_template=False, apply_chat_template: bool = False,
fewshot_as_multiturn=False, fewshot_as_multiturn: bool = False,
lm=None, chat_template: Optional[Callable] = None,
tokenizer_name: str = "",
) -> None: ) -> None:
"""Build a set of Instances for a task, and store them in task.instances""" """Build a set of Instances for a task, and store them in task.instances"""
...@@ -391,9 +396,9 @@ class Task(abc.ABC): ...@@ -391,9 +396,9 @@ class Task(abc.ABC):
if system_instruction is not None if system_instruction is not None
else "" else ""
) )
cache_key += f"-tokenizer{lm.tokenizer_name}" if apply_chat_template else "" cache_key += f"-tokenizer{tokenizer_name}"
cached_instances = load_from_cache(file_name=cache_key) cached_instances = load_from_cache(file_name=cache_key, cache=cache_requests)
if cache_requests and cached_instances and not rewrite_requests_cache: if cache_requests and cached_instances and not rewrite_requests_cache:
cached_instances = cached_instances[:limit] cached_instances = cached_instances[:limit]
...@@ -436,7 +441,7 @@ class Task(abc.ABC): ...@@ -436,7 +441,7 @@ class Task(abc.ABC):
system_instruction, system_instruction,
apply_chat_template, apply_chat_template,
fewshot_as_multiturn, fewshot_as_multiturn,
lm, chat_template,
) )
# TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
...@@ -444,6 +449,7 @@ class Task(abc.ABC): ...@@ -444,6 +449,7 @@ class Task(abc.ABC):
doc=doc, doc=doc,
ctx=fewshot_ctx, ctx=fewshot_ctx,
metadata=(self.config["task"], doc_id, self.config.repeats), metadata=(self.config["task"], doc_id, self.config.repeats),
apply_chat_template=apply_chat_template,
) )
if not isinstance(inst, list): if not isinstance(inst, list):
...@@ -722,6 +728,10 @@ class ConfigurableTask(Task): ...@@ -722,6 +728,10 @@ class ConfigurableTask(Task):
) )
self.OUTPUT_TYPE = self.config.output_type self.OUTPUT_TYPE = self.config.output_type
if self.config.doc_to_image is not None:
# mark the task as requiring multimodality.
self.MULTIMODAL = True
if self.config.dataset_path is not None: if self.config.dataset_path is not None:
self.DATASET_PATH = self.config.dataset_path self.DATASET_PATH = self.config.dataset_path
...@@ -979,7 +989,7 @@ class ConfigurableTask(Task): ...@@ -979,7 +989,7 @@ class ConfigurableTask(Task):
else: else:
if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0): if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0):
eval_logger.warning( eval_logger.warning(
f"Task '{self.config.task}': " f"[Task: {self.config.task}] "
"num_fewshot > 0 but fewshot_split is None. " "num_fewshot > 0 but fewshot_split is None. "
"using preconfigured rule." "using preconfigured rule."
) )
...@@ -1014,7 +1024,7 @@ class ConfigurableTask(Task): ...@@ -1014,7 +1024,7 @@ class ConfigurableTask(Task):
system_instruction: Optional[str] = None, system_instruction: Optional[str] = None,
apply_chat_template: bool = False, apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False, fewshot_as_multiturn: bool = False,
lm=None, chat_template: Optional[Callable] = None,
) -> str: ) -> str:
"""Returns a fewshot context string that is made up of a prepended description """Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example. (if provided), the `num_fewshot` number of examples, and an appended prompt example.
...@@ -1029,8 +1039,8 @@ class ConfigurableTask(Task): ...@@ -1029,8 +1039,8 @@ class ConfigurableTask(Task):
Whether to apply the chat template to the fewshot context. Whether to apply the chat template to the fewshot context.
:param fewshot_as_multiturn: bool :param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn. Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param lm: :param chat_template:
Language model with definition of the tokenizer/function to use for applying the chat template. callable (from lm.apply_chat_template) that takes in a list[Dict] chat transcript and renders it into a string.
:returns: str :returns: str
The fewshot context. The fewshot context.
""" """
...@@ -1077,7 +1087,7 @@ class ConfigurableTask(Task): ...@@ -1077,7 +1087,7 @@ class ConfigurableTask(Task):
example = self.doc_to_text(doc) example = self.doc_to_text(doc)
if apply_chat_template: if apply_chat_template:
if self.multiple_input: if self.multiple_input:
return lm.apply_chat_template(labeled_examples) return chat_template(labeled_examples)
if isinstance(example, str): if isinstance(example, str):
self.append_target_question( self.append_target_question(
labeled_examples, example, fewshot_as_multiturn labeled_examples, example, fewshot_as_multiturn
...@@ -1089,7 +1099,7 @@ class ConfigurableTask(Task): ...@@ -1089,7 +1099,7 @@ class ConfigurableTask(Task):
for ex in example: for ex in example:
chat = deepcopy(labeled_examples) chat = deepcopy(labeled_examples)
self.append_target_question(chat, ex, fewshot_as_multiturn) self.append_target_question(chat, ex, fewshot_as_multiturn)
labeled_examples_list.append(lm.apply_chat_template(chat)) labeled_examples_list.append(chat_template(chat))
return labeled_examples_list return labeled_examples_list
# if example is an integer, append the choice or convert to string # if example is an integer, append the choice or convert to string
elif isinstance(example, int): elif isinstance(example, int):
...@@ -1103,7 +1113,7 @@ class ConfigurableTask(Task): ...@@ -1103,7 +1113,7 @@ class ConfigurableTask(Task):
labeled_examples, str(example), fewshot_as_multiturn labeled_examples, str(example), fewshot_as_multiturn
) )
# return lm.apply_chat_template(labeled_examples) # return lm.apply_chat_template(labeled_examples)
return lm.apply_chat_template(labeled_examples) return chat_template(labeled_examples)
else: else:
if self.multiple_input: if self.multiple_input:
return labeled_examples return labeled_examples
...@@ -1158,9 +1168,11 @@ class ConfigurableTask(Task): ...@@ -1158,9 +1168,11 @@ class ConfigurableTask(Task):
""" """
return doc return doc
def doc_to_text(self, doc): def doc_to_text(self, doc, doc_to_text=None):
if self.prompt is not None: if self.prompt is not None:
doc_to_text = self.prompt doc_to_text = self.prompt
elif doc_to_text is not None:
doc_to_text = doc_to_text
else: else:
doc_to_text = self.config.doc_to_text doc_to_text = self.config.doc_to_text
...@@ -1192,9 +1204,11 @@ class ConfigurableTask(Task): ...@@ -1192,9 +1204,11 @@ class ConfigurableTask(Task):
print(type(doc_to_text)) print(type(doc_to_text))
raise TypeError raise TypeError
def doc_to_target(self, doc: Mapping) -> Union[int, str, list]: def doc_to_target(self, doc: Mapping, doc_to_target=None) -> Union[int, str, list]:
if self.prompt is not None: if self.prompt is not None:
doc_to_target = self.prompt doc_to_target = self.prompt
elif doc_to_target is not None:
doc_to_target = doc_to_target
else: else:
doc_to_target = self.config.doc_to_target doc_to_target = self.config.doc_to_target
...@@ -1236,9 +1250,11 @@ class ConfigurableTask(Task): ...@@ -1236,9 +1250,11 @@ class ConfigurableTask(Task):
else: else:
raise TypeError raise TypeError
def doc_to_choice(self, doc: Any) -> List[str]: def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]:
if self.prompt is not None: if self.prompt is not None:
doc_to_choice = self.prompt doc_to_choice = self.prompt
elif doc_to_choice is not None:
doc_to_choice = doc_to_choice
elif self.config.doc_to_choice is None: elif self.config.doc_to_choice is None:
eval_logger.error("doc_to_choice was called but not set in config") eval_logger.error("doc_to_choice was called but not set in config")
else: else:
...@@ -1260,9 +1276,36 @@ class ConfigurableTask(Task): ...@@ -1260,9 +1276,36 @@ class ConfigurableTask(Task):
else: else:
raise TypeError raise TypeError
def doc_to_image(self, doc: Any, doc_to_image=None) -> Union[int, str, list]:
if doc_to_image is not None:
doc_to_image = doc_to_image
elif self.config.doc_to_image is not None:
doc_to_image = self.config.doc_to_image
else:
return None
if isinstance(doc_to_image, list):
image_feature = [
self.doc_to_image(doc, feature) for feature in doc_to_image
]
return [feature for feature in image_feature if feature is not None]
elif isinstance(doc_to_image, str):
if doc_to_image in self.features:
return doc[doc_to_image]
else:
return ast.literal_eval(utils.apply_template(doc_to_image, doc))
elif callable(doc_to_image):
return doc_to_image(doc)
else:
return None
def construct_requests( def construct_requests(
self, doc: dict, ctx: str, **kwargs self, doc: dict, ctx: str, **kwargs
) -> Union[List[Instance], Instance]: ) -> Union[List[Instance], Instance]:
apply_chat_template = kwargs.pop("apply_chat_template", False)
aux_arguments = None
if self.OUTPUT_TYPE == "loglikelihood": if self.OUTPUT_TYPE == "loglikelihood":
arguments = (ctx, self.doc_to_target(doc)) arguments = (ctx, self.doc_to_target(doc))
elif self.OUTPUT_TYPE == "loglikelihood_rolling": elif self.OUTPUT_TYPE == "loglikelihood_rolling":
...@@ -1270,6 +1313,8 @@ class ConfigurableTask(Task): ...@@ -1270,6 +1313,8 @@ class ConfigurableTask(Task):
elif self.OUTPUT_TYPE == "multiple_choice": elif self.OUTPUT_TYPE == "multiple_choice":
choices = self.doc_to_choice(doc) choices = self.doc_to_choice(doc)
target_delimiter = self.config.target_delimiter target_delimiter = self.config.target_delimiter
if apply_chat_template:
target_delimiter = ""
if self.multiple_input: if self.multiple_input:
# If there are multiple inputs, choices are placed in the ctx # If there are multiple inputs, choices are placed in the ctx
cont = self.doc_to_target(doc) cont = self.doc_to_target(doc)
...@@ -1280,6 +1325,37 @@ class ConfigurableTask(Task): ...@@ -1280,6 +1325,37 @@ class ConfigurableTask(Task):
# Otherwise they are placed in the continuation # Otherwise they are placed in the continuation
arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices] arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]
# TODO: we should raise a warning telling users this will at most ~2x runtime.
if "acc_mutual_info" in self._metric_fn_list.keys():
# if we are calculating multiple choice accuracy
# using mutual information instead of raw loglikelihood as metric, need unconditional lls.
# here mutual info refers to calculating
# log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
# in other words normalizing by subtracting the unconditional logprob of each choice.
aux_arguments = [("", f"{choice}") for choice in choices]
arguments.extend(aux_arguments)
elif self.OUTPUT_TYPE == "generate_until":
arguments = (ctx, deepcopy(self.config.generation_kwargs))
multimodal_arg = {}
if (
self.config.doc_to_image
): # TODO: ensure that non-multimodal tasks aren't getting visual args
multimodal_arg = {
**multimodal_arg,
**{"visual": self.doc_to_image(doc)},
}
if bool(multimodal_arg):
if isinstance(arguments, list):
arguments = [arg + (multimodal_arg,) for arg in arguments]
else:
arguments = arguments + (multimodal_arg,)
if self.OUTPUT_TYPE == "multiple_choice":
request_list = [ request_list = [
Instance( Instance(
request_type="loglikelihood", request_type="loglikelihood",
...@@ -1290,33 +1366,15 @@ class ConfigurableTask(Task): ...@@ -1290,33 +1366,15 @@ class ConfigurableTask(Task):
) )
for i, arg in enumerate(arguments) for i, arg in enumerate(arguments)
] ]
# TODO: we should raise a warning telling users this will at most ~2x runtime.
if "acc_mutual_info" in self._metric_fn_list.keys():
# if we are calculating multiple choice accuracy
# using mutual information instead of raw loglikelihood as metric, need unconditional lls.
# here mutual info refers to calculating
# log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
# in other words normalizing by subtracting the unconditional logprob of each choice.
request_list.extend(
[
Instance(
request_type="loglikelihood",
doc=doc,
arguments=("", "{}".format(choice)),
idx=i,
**kwargs,
)
for i, choice in enumerate(choices)
]
)
return request_list return request_list
elif self.OUTPUT_TYPE == "generate_until":
arguments = (ctx, deepcopy(self.config.generation_kwargs))
return Instance( return Instance(
request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs request_type=self.OUTPUT_TYPE,
doc=doc,
arguments=arguments,
idx=0,
**kwargs,
) )
def process_results(self, doc, results): def process_results(self, doc, results):
...@@ -1445,7 +1503,10 @@ class ConfigurableTask(Task): ...@@ -1445,7 +1503,10 @@ class ConfigurableTask(Task):
# we expect multiple_targets to be a list. # we expect multiple_targets to be a list.
elif self.multiple_target: elif self.multiple_target:
gold = list(gold) gold = list(gold)
elif type(gold) != type(result) and not isinstance(result, List): elif (
type(gold) is not type(result)
and "bypass" not in self._metric_fn_list.keys()
):
# cast gold to the same type as result # cast gold to the same type as result
gold = type(result)(gold) gold = type(result)(gold)
...@@ -1519,10 +1580,13 @@ class ConfigurableTask(Task): ...@@ -1519,10 +1580,13 @@ class ConfigurableTask(Task):
def get_config(self, key: str) -> Any: def get_config(self, key: str) -> Any:
return getattr(self._config, key, None) return getattr(self._config, key, None)
@property
def task_name(self) -> Any:
return getattr(self.config, "task", None)
def __repr__(self): def __repr__(self):
return ( return (
f"ConfigurableTask(task_name={getattr(self.config, 'task', None)}," f"ConfigurableTask(task_name={getattr(self.config, 'task', None)},"
f"group_name={getattr(self.config, 'group', None)},"
f"output_type={self.OUTPUT_TYPE}," f"output_type={self.OUTPUT_TYPE},"
f"num_fewshot={getattr(self.config, 'num_fewshot', None)}," f"num_fewshot={getattr(self.config, 'num_fewshot', None)},"
f"num_samples={len(self.eval_docs)})" f"num_samples={len(self.eval_docs)})"
......
...@@ -21,7 +21,9 @@ HASH_PREFIX = hashlib.sha256(HASH_INPUT.encode("utf-8")).hexdigest() ...@@ -21,7 +21,9 @@ HASH_PREFIX = hashlib.sha256(HASH_INPUT.encode("utf-8")).hexdigest()
FILE_SUFFIX = f".{HASH_PREFIX}.pickle" FILE_SUFFIX = f".{HASH_PREFIX}.pickle"
def load_from_cache(file_name): def load_from_cache(file_name: str, cache: bool = False):
if not cache:
return
try: try:
path = f"{PATH}/{file_name}{FILE_SUFFIX}" path = f"{PATH}/{file_name}{FILE_SUFFIX}"
......
...@@ -110,12 +110,15 @@ class TextReader: ...@@ -110,12 +110,15 @@ class TextReader:
def read_tqdm(self, update_frequency: int = 10000): def read_tqdm(self, update_frequency: int = 10000):
current_file_position = 0 current_file_position = 0
line_counter = 0 line_counter = 0
with open(self.file_path, "r", encoding="utf-8") as fh, tqdm.tqdm( with (
total=os.path.getsize(self.file_path), open(self.file_path, "r", encoding="utf-8") as fh,
dynamic_ncols=True, tqdm.tqdm(
unit="byte", total=os.path.getsize(self.file_path),
unit_scale=1, dynamic_ncols=True,
) as progress: unit="byte",
unit_scale=1,
) as progress,
):
with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj: with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
for line in iter(mmap_obj.readline, b""): for line in iter(mmap_obj.readline, b""):
line = line.decode("utf-8") line = line.decode("utf-8")
......
This diff is collapsed.
...@@ -2,9 +2,15 @@ import collections ...@@ -2,9 +2,15 @@ import collections
import math import math
import pathlib import pathlib
import sys import sys
from typing import Dict, List, Optional, Tuple, Union from typing import List, Optional, Tuple, Union
from lm_eval.api import metrics from lm_eval.api.group import ConfigurableGroup
from lm_eval.api.metrics import (
aggregate_subtask_metrics,
pooled_sample_stderr,
stderr_for_metric,
)
from lm_eval.api.task import Task
from lm_eval.utils import eval_logger, positional_deprecated from lm_eval.utils import eval_logger, positional_deprecated
...@@ -98,7 +104,7 @@ class TaskOutput: ...@@ -98,7 +104,7 @@ class TaskOutput:
self.agg_metrics[metric_key] = agg_fn(items) self.agg_metrics[metric_key] = agg_fn(items)
self.sample_len = len(items) # TODO: same sample size for each metric? self.sample_len = len(items) # TODO: same sample size for each metric?
if isinstance(bootstrap_iters, int): if isinstance(bootstrap_iters, int):
stderr_fn = metrics.stderr_for_metric( stderr_fn = stderr_for_metric(
metric=agg_fn, metric=agg_fn,
bootstrap_iters=min(bootstrap_iters, 100) bootstrap_iters=min(bootstrap_iters, 100)
if metric in ["bleu", "chrf", "ter"] if metric in ["bleu", "chrf", "ter"]
...@@ -116,23 +122,71 @@ class TaskOutput: ...@@ -116,23 +122,71 @@ class TaskOutput:
return ( return (
f"TaskOutput(task_name={self.task_name}, " f"TaskOutput(task_name={self.task_name}, "
f"group_name={self.group_name}, " f"group_name={self.group_name}, "
f"version={self.version}," f"version={self.version}, "
f"n_shot={self.n_shot}" f"n_shot={self.n_shot}, "
f"task_alias={self.task_alias}, group_alias={self.group_alias})" f"task_alias={self.task_alias}, "
f"group_alias={self.group_alias})"
) )
def get_task_list(task_dict: dict) -> List[TaskOutput]:
    """Flatten a (possibly nested) task dictionary into a list of TaskOutput objects.

    :param task_dict: dict
        Mapping whose values are either nested dictionaries (walked
        recursively) or leaf objects, which are handed to
        ``TaskOutput.from_taskdict`` together with their key.
    :return: list[TaskOutput]
        One entry per leaf, in depth-first iteration order of ``task_dict``.
    """
    outputs = []
    for task_name, task_obj in task_dict.items():
        if isinstance(task_obj, dict):
            # Nested level: splice in the leaves found below it.
            outputs.extend(get_task_list(task_obj))
        else:
            outputs.append(TaskOutput.from_taskdict(task_name, task_obj))

    return outputs
def get_subtask_list(task_dict, task_root=None, depth=0):
    """Map each group (and each top-level task) name to the names of its direct children.

    :param task_dict: dict
        Hierarchy to walk. Keys are ConfigurableGroup objects or plain names;
        values are nested dictionaries of the same shape, or leaf objects
        (Task / ConfigurableGroup).
    :param task_root: str, optional
        Name of the group that owns ``task_dict``. None for the outermost call.
    :param depth: int
        Nesting level of ``task_dict``. 0 for the outermost call.
    :return: dict
        For ``depth == 0``: ``{name: [direct child names]}``; leaves that have
        no owning group map to an empty list. For recursive calls the keys are
        ``(name, depth)`` tuples, which the outermost call collapses to names.
    """
    subtask_list = {}
    for group_obj, task_obj in task_dict.items():
        if isinstance(group_obj, ConfigurableGroup):
            group_name = group_obj.group_name
        else:
            group_name = group_obj

        if isinstance(task_obj, dict):
            _subtask_list = get_subtask_list(
                task_obj, task_root=group_name, depth=depth + 1
            )
            if task_root:
                # Keys recorded exactly one level deeper are the sub-groups
                # that sit directly under the current root.
                subtask_list.setdefault((task_root, depth), []).extend(
                    _task
                    for (_task, _depth) in _subtask_list
                    if (_depth - 1) == depth
                )

            subtask_list = {**subtask_list, **_subtask_list}
        else:
            if isinstance(task_obj, ConfigurableGroup):
                group_or_task_name = task_obj.group_name
            elif isinstance(task_obj, Task):
                group_or_task_name = task_obj.task_name
            else:
                # Unrecognised leaf type: fall back to the key's name rather
                # than leaving the variable unbound or reusing the value left
                # over from a previous iteration.
                group_or_task_name = group_name

            if task_root is None:
                subtask_list.setdefault((group_or_task_name, depth), [])
            else:
                subtask_list.setdefault((task_root, depth), []).append(
                    group_or_task_name
                )

    if depth == 0:
        # Outermost call: drop the depth component of every key.
        subtask_list = {
            name: task_list for (name, _depth), task_list in subtask_list.items()
        }

    return subtask_list
def print_writeout(task) -> None: def print_writeout(task) -> None:
...@@ -155,70 +209,95 @@ def get_sample_size(task, limit: Optional[int]) -> Union[int, None]: ...@@ -155,70 +209,95 @@ def get_sample_size(task, limit: Optional[int]) -> Union[int, None]:
def prepare_print_tasks(
    task_dict: dict,
    results: dict,
    task_depth=0,
    group_depth=0,
) -> Tuple[dict, dict]:
    """
    @param task_dict: Dictionary representing the group hierarchy of tasks. Each key is either a
    ConfigurableGroup or a task/group name; each value is either a Task or a nested dictionary
    of the same shape.
    @param results: Dictionary containing the results of each task. Each key is a
    task or group name and its value is a dictionary of results.
    @param task_depth: The indentation level for printing the task
    hierarchy. Default is 0.
    @param group_depth: The indentation level for printing the group
    hierarchy. Default is 0.
    @return: A tuple of two dictionaries: task_agg and group_agg. task_agg contains
    the printable results for every task and group, and group_agg contains the printable results
    for each ConfigurableGroup whose results do not carry the " " placeholder key.

    Prepares the task hierarchy and aggregates the results for each task and group recursively for printing.
    """

    def _sort_task_dict(task_dict):
        """
        Helper utility. Sorts the task dict at the current level of the hierarchy based on alphabetized task name.
        Required so that we end up sorting within each sub-header correctly.
        """
        return dict(
            sorted(
                task_dict.items(),
                key=lambda item: item[0].group_name
                if isinstance(item[0], ConfigurableGroup)
                else item[0],
            )
        )

    task_agg = collections.defaultdict(dict)
    group_agg = collections.defaultdict(dict)

    # Depths are fixed for the duration of one call, so the alias prefixes are too.
    tab_string = " " * task_depth + "- " if task_depth > 0 else ""
    group_tab_string = " " * group_depth + "- " if group_depth > 0 else ""

    for task_or_group_name, task_or_group_obj in _sort_task_dict(task_dict).items():
        # Always bind both `name` and the flag so nothing leaks in from a
        # previous iteration when the key is neither a group nor a string.
        from_configurable_group = isinstance(task_or_group_name, ConfigurableGroup)
        if from_configurable_group:
            name = task_or_group_name.group_name
            task_or_group_obj = _sort_task_dict(task_or_group_obj)
        else:
            name = task_or_group_name
            if isinstance(task_or_group_obj, Task):
                name = task_or_group_obj.task_name

        task_agg[name] = results[name].copy()
        if from_configurable_group:
            alias = task_or_group_name.group_alias
            if alias is None:
                alias = task_or_group_name.group
        else:
            alias = task_agg[name].get("alias", name)

        task_agg[name]["alias"] = tab_string + alias
        task_agg[name].pop("samples", None)

        # Groups flagged with the " " placeholder key are left out of the group table.
        if from_configurable_group and (" " not in results[name]):
            group_agg[name] = results[name].copy()
            group_agg[name]["alias"] = group_tab_string + alias
            group_agg[name].pop("samples", None)

        if isinstance(task_or_group_obj, dict):
            _task_agg, _group_agg = prepare_print_tasks(
                task_or_group_obj, results, task_depth + 1, group_depth + 1
            )
            task_agg = {**task_agg, **_task_agg}
            group_agg = {**group_agg, **_group_agg}

    return task_agg, group_agg
def consolidate_results( def consolidate_results(
...@@ -261,6 +340,8 @@ def consolidate_results( ...@@ -261,6 +340,8 @@ def consolidate_results(
for task_output in eval_tasks: for task_output in eval_tasks:
if "task_alias" in (task_config := task_output.task_config): if "task_alias" in (task_config := task_output.task_config):
results[task_output.task_name]["alias"] = task_config["task_alias"] results[task_output.task_name]["alias"] = task_config["task_alias"]
else:
results[task_output.task_name]["alias"] = task_output.task_name
if group_alias := task_output.group_alias: if group_alias := task_output.group_alias:
if group_alias not in results and (group_name := task_output.group_name): if group_alias not in results and (group_name := task_output.group_name):
results[group_name]["alias"] = group_alias results[group_name]["alias"] = group_alias
...@@ -275,12 +356,153 @@ def consolidate_results( ...@@ -275,12 +356,153 @@ def consolidate_results(
metric_key metric_key
] ]
results[task_output.task_name]["samples"] = task_output.sample_len results[task_output.task_name]["samples"] = task_output.sample_len
results[task_output.task_name][ results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = (
f"{metric}_stderr,{filter_key}" task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"] )
return results, samples, configs, versions, num_fewshot, higher_is_better return results, samples, configs, versions, num_fewshot, higher_is_better
def consolidate_group_results(
    results,
    versions,
    task_dict,
    task_root=None,
    show_group_table=False,
    task_aggregation_list=None,
) -> Tuple[dict, dict, bool, dict]:
    """
    (Recursively) calculates groups' aggregated metrics and updates the results and versions dictionaries with this info.

    @param results: mapping of task/group name to its dictionary of metric values; updated in place.
    @param versions: mapping of task/group name to version; updated in place.
    @param task_dict: hierarchy to walk. Keys are ConfigurableGroup objects or names; values are Task
    objects or nested dictionaries of the same shape.
    @param task_root: name of the group that owns `task_dict`, or None for the top-level invocation.
    @param show_group_table: running flag carried through the recursion.
    @param task_aggregation_list: running mapping of group name to the task names beneath it; a fresh
    dictionary is created when None is passed.

    @return: a tuple [results, versions, show_group_table, task_aggregation_list] with formats described below:

    - results: A defaultdict with task names (and, after this function is called, group names of
    groups that perform aggregation) as keys, and dictionaries with "alias" and metric,filter_name pairs as keys.
    - versions: A defaultdict with task names (and, after this function is called, group names of
    groups that perform aggregation) as keys, and float values representing the task or group's version if a version is specified. (defaulting to None).
    - show_group_table: a boolean which is true if there exists a group that requires printing of its aggregated scores in a group table.
    - task_aggregation_list: a defaultdict listing the subtasks to average over to produce a given group's end metric.

    The method then returns the updated results, versions, show_group_table, and task_aggregation_list as a tuple.
    In the top-level invocation of this function, task_aggregation_list is ignored.
    """
    if task_aggregation_list is None:
        task_aggregation_list = {}

    for group_or_task, group_or_task_info in task_dict.items():
        # Convert to string
        if isinstance(group_or_task, ConfigurableGroup):
            group_config = group_or_task.config
            group_or_task = group_or_task.group_name
        else:
            group_config = None

        if isinstance(group_or_task_info, Task):
            # Leaf task: just register it under the enclosing group, if any.
            if task_root:
                task_aggregation_list.setdefault(task_root, []).append(
                    group_or_task_info.task_name
                )
            continue

        # Nested level: consolidate everything below it first.
        (
            results,
            versions,
            show_group_table,
            _task_aggregation_list,
        ) = consolidate_group_results(
            results,
            versions,
            group_or_task_info,
            group_or_task,
            show_group_table,
            task_aggregation_list,
        )
        if task_root:
            # Tasks gathered for this sub-group also count towards its parent.
            task_aggregation_list.setdefault(task_root, []).extend(
                task_aggregation_list.get(group_or_task, [])
            )

        if (group_config is None) or (group_config["aggregate_metric_list"] is None):
            # No aggregation requested: leave a placeholder entry for this group.
            results[group_or_task][" "] = " "
            continue

        agg_metric_list = group_config["aggregate_metric_list"]
        show_group_table |= bool(agg_metric_list)

        # A group without any registered task simply has nothing to aggregate.
        task_list = _task_aggregation_list.get(group_or_task, [])

        # De-duplicate metric keys in first-seen order so that the order of the
        # group's result keys is reproducible from run to run.
        metric_list = list(
            dict.fromkeys(
                key
                for task in task_list
                for key in results[task].keys()
                if "_stderr" not in key and key not in ["task", "alias", "samples"]
            )
        )
        for metric in metric_list:
            stderr = "_stderr,".join(metric.split(","))

            # gather metrics, sizes, and stderrs from subtasks
            metrics = [
                results[task][metric] for task in task_list if metric in results[task]
            ]  # TODO: copy?
            stderrs = [
                results[task][stderr] for task in task_list if stderr in results[task]
            ]
            sizes = [
                results[task]["samples"]
                for task in task_list
                if metric in results[task]
            ]

            for metric_config in agg_metric_list:
                for filter_name in metric_config["filter_list"]:
                    if metric != ",".join([metric_config["metric"], filter_name]):
                        continue

                    # compute group's pooled metric and stderr
                    if metric_config["aggregation"] == "mean":
                        aggregate_fn = aggregate_subtask_metrics
                    elif callable(metric_config["aggregation"]):
                        aggregate_fn = metric_config["aggregation"]
                    else:
                        raise ValueError(
                            f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{metric_config['aggregation']}' for group '{group_or_task}'"
                        )

                    results[group_or_task][metric] = aggregate_fn(
                        metrics,
                        sizes,
                        metric_config["weight_by_size"],
                    )
                    # TODO: calculate groups' metrics using arbitrary agg fns
                    if "N/A" in stderrs:
                        results[group_or_task][stderr] = "N/A"
                    else:
                        # NOTE: this assumes we are using the mean to aggregate. There are warnings about this elsewhere
                        results[group_or_task][stderr] = pooled_sample_stderr(
                            stderrs, sizes
                        )

            # NOTE(review): `sizes` is per metric, so the last metric processed
            # decides the stored sample count - confirm this is intended.
            results[group_or_task]["samples"] = sum(sizes)

        # The version does not depend on any metric, so record it even when
        # the group ended up with no metrics to aggregate.
        group_metadata = group_config.get("metadata", None)
        if group_metadata is not None:
            versions[group_or_task] = group_metadata.get("version", None)

    return results, versions, show_group_table, task_aggregation_list
@positional_deprecated @positional_deprecated
def find_test_root(start_path: pathlib.Path) -> pathlib.Path: def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
""" """
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
from . import ( from . import (
anthropic_llms, anthropic_llms,
api_models,
dummy, dummy,
gguf, gguf,
hf_vlms,
huggingface, huggingface,
ibm_watsonx_ai,
mamba_lm, mamba_lm,
nemo_lm, nemo_lm,
neuralmagic, neuralmagic,
neuron_optimum, neuron_optimum,
openai_completions, openai_completions,
optimum_ipex,
optimum_lm, optimum_lm,
textsynth, textsynth,
vllm_causallms, vllm_causallms,
vllm_vlms,
) )
......
This diff is collapsed.
This diff is collapsed.
...@@ -26,9 +26,9 @@ class DummyLM(LM): ...@@ -26,9 +26,9 @@ class DummyLM(LM):
def generate_until(self, requests, disable_tqdm: bool = False):
    """Return the placeholder completion "lol" for every request.

    :param requests: iterable of request objects
        Each must expose ``arguments``, whose first element is a string
        (presumably the context - confirm against the Instance class).
    :param disable_tqdm: bool
        Forwarded to ``tqdm`` to switch the progress bar off.
    :return: list[str]
        One "lol" per request, in request order.
    :raises AssertionError: if a request's first argument is blank.
    """
    res = []

    for request in tqdm(requests, disable=disable_tqdm):
        res.append("lol")
        # Sanity check: a request should never arrive with an empty first argument.
        assert request.arguments[0].strip() != ""

    return res
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment