Commit 948f120f authored by Baber's avatar Baber
Browse files

Merge branch 'main' into autobatchtest

# Conflicts:
#	lm_eval/models/huggingface.py
parents a5b1c7a8 bd80a6c0
...@@ -56,8 +56,7 @@ class TaskConfig(dict): ...@@ -56,8 +56,7 @@ class TaskConfig(dict):
# task naming/registry # task naming/registry
task: Optional[str] = None task: Optional[str] = None
task_alias: Optional[str] = None task_alias: Optional[str] = None
group: Optional[Union[str, list]] = None tag: Optional[Union[str, list]] = None
group_alias: Optional[Union[str, list]] = None
# HF dataset options. # HF dataset options.
# which dataset to use, # which dataset to use,
# and what splits for what purpose # and what splits for what purpose
...@@ -68,13 +67,14 @@ class TaskConfig(dict): ...@@ -68,13 +67,14 @@ class TaskConfig(dict):
validation_split: Optional[str] = None validation_split: Optional[str] = None
test_split: Optional[str] = None test_split: Optional[str] = None
fewshot_split: Optional[str] = ( fewshot_split: Optional[str] = (
None # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?) None # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaluating (?)
) )
# formatting / prompting options. # formatting / prompting options.
# see docs/advanced_task_guide.md for more info # see docs/advanced_task_guide.md for more info
process_docs: Optional[Callable] = None process_docs: Optional[Callable] = None
doc_to_text: Optional[Union[Callable, str]] = None doc_to_text: Optional[Union[Callable, str]] = None
doc_to_target: Optional[Union[Callable, str]] = None doc_to_target: Optional[Union[Callable, str]] = None
doc_to_image: Union[Callable, str] = None
doc_to_choice: Optional[Union[Callable, str, dict, list]] = None doc_to_choice: Optional[Union[Callable, str, dict, list]] = None
process_results: Optional[Union[Callable, str]] = None process_results: Optional[Union[Callable, str]] = None
use_prompt: Optional[str] = None use_prompt: Optional[str] = None
...@@ -365,6 +365,10 @@ class Task(abc.ABC): ...@@ -365,6 +365,10 @@ class Task(abc.ABC):
def doc_to_target(self, doc): def doc_to_target(self, doc):
pass pass
# not an abstractmethod because not every language-only task has to implement this
def doc_to_image(self, doc):
raise NotImplementedError
def build_all_requests( def build_all_requests(
self, self,
*, *,
...@@ -723,6 +727,10 @@ class ConfigurableTask(Task): ...@@ -723,6 +727,10 @@ class ConfigurableTask(Task):
) )
self.OUTPUT_TYPE = self.config.output_type self.OUTPUT_TYPE = self.config.output_type
if self.config.doc_to_image is not None:
# mark the task as requiring multimodality.
self.MULTIMODAL = True
if self.config.dataset_path is not None: if self.config.dataset_path is not None:
self.DATASET_PATH = self.config.dataset_path self.DATASET_PATH = self.config.dataset_path
...@@ -980,7 +988,7 @@ class ConfigurableTask(Task): ...@@ -980,7 +988,7 @@ class ConfigurableTask(Task):
else: else:
if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0): if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0):
eval_logger.warning( eval_logger.warning(
f"Task '{self.config.task}': " f"[Task: {self.config.task}] "
"num_fewshot > 0 but fewshot_split is None. " "num_fewshot > 0 but fewshot_split is None. "
"using preconfigured rule." "using preconfigured rule."
) )
...@@ -1030,8 +1038,8 @@ class ConfigurableTask(Task): ...@@ -1030,8 +1038,8 @@ class ConfigurableTask(Task):
Whether to apply the chat template to the fewshot context. Whether to apply the chat template to the fewshot context.
:param fewshot_as_multiturn: bool :param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn. Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param chat_template: Callable :param chat_template:
Chat template to be applied to the fewshot context. callable (from lm.apply_chat_template) that takes in a list[Dict] chat transcript and renders it into a string.
:returns: str :returns: str
The fewshot context. The fewshot context.
""" """
...@@ -1159,9 +1167,11 @@ class ConfigurableTask(Task): ...@@ -1159,9 +1167,11 @@ class ConfigurableTask(Task):
""" """
return doc return doc
def doc_to_text(self, doc): def doc_to_text(self, doc, doc_to_text=None):
if self.prompt is not None: if self.prompt is not None:
doc_to_text = self.prompt doc_to_text = self.prompt
elif doc_to_text is not None:
doc_to_text = doc_to_text
else: else:
doc_to_text = self.config.doc_to_text doc_to_text = self.config.doc_to_text
...@@ -1193,9 +1203,11 @@ class ConfigurableTask(Task): ...@@ -1193,9 +1203,11 @@ class ConfigurableTask(Task):
print(type(doc_to_text)) print(type(doc_to_text))
raise TypeError raise TypeError
def doc_to_target(self, doc: Mapping) -> Union[int, str, list]: def doc_to_target(self, doc: Mapping, doc_to_target=None) -> Union[int, str, list]:
if self.prompt is not None: if self.prompt is not None:
doc_to_target = self.prompt doc_to_target = self.prompt
elif doc_to_target is not None:
doc_to_target = doc_to_target
else: else:
doc_to_target = self.config.doc_to_target doc_to_target = self.config.doc_to_target
...@@ -1237,9 +1249,11 @@ class ConfigurableTask(Task): ...@@ -1237,9 +1249,11 @@ class ConfigurableTask(Task):
else: else:
raise TypeError raise TypeError
def doc_to_choice(self, doc: Any) -> List[str]: def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]:
if self.prompt is not None: if self.prompt is not None:
doc_to_choice = self.prompt doc_to_choice = self.prompt
elif doc_to_choice is not None:
doc_to_choice = doc_to_choice
elif self.config.doc_to_choice is None: elif self.config.doc_to_choice is None:
eval_logger.error("doc_to_choice was called but not set in config") eval_logger.error("doc_to_choice was called but not set in config")
else: else:
...@@ -1261,9 +1275,34 @@ class ConfigurableTask(Task): ...@@ -1261,9 +1275,34 @@ class ConfigurableTask(Task):
else: else:
raise TypeError raise TypeError
def doc_to_image(self, doc: Any, doc_to_image=None) -> Union[int, str, list]:
if doc_to_image is not None:
doc_to_image = doc_to_image
elif self.config.doc_to_image is not None:
doc_to_image = self.config.doc_to_image
else:
return None
if isinstance(doc_to_image, list):
image_feature = [
self.doc_to_image(doc, feature) for feature in doc_to_image
]
return [feature for feature in image_feature if feature is not None]
elif isinstance(doc_to_image, str):
if doc_to_image in self.features:
return doc[doc_to_image]
else:
return ast.literal_eval(utils.apply_template(doc_to_image, doc))
elif callable(doc_to_image):
return doc_to_image(doc)
else:
return None
def construct_requests( def construct_requests(
self, doc: dict, ctx: str, **kwargs self, doc: dict, ctx: str, **kwargs
) -> Union[List[Instance], Instance]: ) -> Union[List[Instance], Instance]:
aux_arguments = None
if self.OUTPUT_TYPE == "loglikelihood": if self.OUTPUT_TYPE == "loglikelihood":
arguments = (ctx, self.doc_to_target(doc)) arguments = (ctx, self.doc_to_target(doc))
elif self.OUTPUT_TYPE == "loglikelihood_rolling": elif self.OUTPUT_TYPE == "loglikelihood_rolling":
...@@ -1281,6 +1320,37 @@ class ConfigurableTask(Task): ...@@ -1281,6 +1320,37 @@ class ConfigurableTask(Task):
# Otherwise they are placed in the continuation # Otherwise they are placed in the continuation
arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices] arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]
# TODO: we should raise a warning telling users this will at most ~2x runtime.
if "acc_mutual_info" in self._metric_fn_list.keys():
# if we are calculating multiple choice accuracy
# using mutual information instead of raw loglikelihood as metric, need unconditional lls.
# here mutual info refers to calculating
# log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
# in other words normalizing by subtracting the unconditional logprob of each choice.
aux_arguments = [("", f"{choice}") for choice in choices]
arguments.extend(aux_arguments)
elif self.OUTPUT_TYPE == "generate_until":
arguments = (ctx, deepcopy(self.config.generation_kwargs))
multimodal_arg = {}
if (
self.config.doc_to_image
): # TODO: ensure that non-multimodal tasks aren't getting visual args
multimodal_arg = {
**multimodal_arg,
**{"visual": self.doc_to_image(doc)},
}
if bool(multimodal_arg):
if isinstance(arguments, list):
arguments = [arg + (multimodal_arg,) for arg in arguments]
else:
arguments = arguments + (multimodal_arg,)
if self.OUTPUT_TYPE == "multiple_choice":
request_list = [ request_list = [
Instance( Instance(
request_type="loglikelihood", request_type="loglikelihood",
...@@ -1291,33 +1361,15 @@ class ConfigurableTask(Task): ...@@ -1291,33 +1361,15 @@ class ConfigurableTask(Task):
) )
for i, arg in enumerate(arguments) for i, arg in enumerate(arguments)
] ]
# TODO: we should raise a warning telling users this will at most ~2x runtime.
if "acc_mutual_info" in self._metric_fn_list.keys():
# if we are calculating multiple choice accuracy
# using mutual information instead of raw loglikelihood as metric, need unconditional lls.
# here mutual info refers to calculating
# log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
# in other words normalizing by subtracting the unconditional logprob of each choice.
request_list.extend(
[
Instance(
request_type="loglikelihood",
doc=doc,
arguments=("", "{}".format(choice)),
idx=i,
**kwargs,
)
for i, choice in enumerate(choices)
]
)
return request_list return request_list
elif self.OUTPUT_TYPE == "generate_until":
arguments = (ctx, deepcopy(self.config.generation_kwargs))
return Instance( return Instance(
request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs request_type=self.OUTPUT_TYPE,
doc=doc,
arguments=arguments,
idx=0,
**kwargs,
) )
def process_results(self, doc, results): def process_results(self, doc, results):
...@@ -1446,7 +1498,7 @@ class ConfigurableTask(Task): ...@@ -1446,7 +1498,7 @@ class ConfigurableTask(Task):
# we expect multiple_targets to be a list. # we expect multiple_targets to be a list.
elif self.multiple_target: elif self.multiple_target:
gold = list(gold) gold = list(gold)
elif type(gold) != type(result): elif type(gold) is not type(result):
# cast gold to the same type as result # cast gold to the same type as result
gold = type(result)(gold) gold = type(result)(gold)
...@@ -1520,10 +1572,13 @@ class ConfigurableTask(Task): ...@@ -1520,10 +1572,13 @@ class ConfigurableTask(Task):
def get_config(self, key: str) -> Any: def get_config(self, key: str) -> Any:
return getattr(self._config, key, None) return getattr(self._config, key, None)
@property
def task_name(self) -> Any:
return getattr(self.config, "task", None)
def __repr__(self): def __repr__(self):
return ( return (
f"ConfigurableTask(task_name={getattr(self.config, 'task', None)}," f"ConfigurableTask(task_name={getattr(self.config, 'task', None)},"
f"group_name={getattr(self.config, 'group', None)},"
f"output_type={self.OUTPUT_TYPE}," f"output_type={self.OUTPUT_TYPE},"
f"num_fewshot={getattr(self.config, 'num_fewshot', None)}," f"num_fewshot={getattr(self.config, 'num_fewshot', None)},"
f"num_samples={len(self.eval_docs)})" f"num_samples={len(self.eval_docs)})"
......
...@@ -11,11 +11,14 @@ import torch ...@@ -11,11 +11,14 @@ import torch
import lm_eval.api.metrics import lm_eval.api.metrics
import lm_eval.api.registry import lm_eval.api.registry
import lm_eval.api.task
import lm_eval.models import lm_eval.models
from lm_eval.caching.cache import delete_cache from lm_eval.caching.cache import delete_cache
from lm_eval.evaluator_utils import ( from lm_eval.evaluator_utils import (
consolidate_group_results,
consolidate_results, consolidate_results,
get_sample_size, get_sample_size,
get_subtask_list,
get_task_list, get_task_list,
prepare_print_tasks, prepare_print_tasks,
print_writeout, print_writeout,
...@@ -23,7 +26,10 @@ from lm_eval.evaluator_utils import ( ...@@ -23,7 +26,10 @@ from lm_eval.evaluator_utils import (
) )
from lm_eval.loggers import EvaluationTracker from lm_eval.loggers import EvaluationTracker
from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash
from lm_eval.tasks import TaskManager, get_task_dict from lm_eval.tasks import (
TaskManager,
get_task_dict,
)
from lm_eval.utils import ( from lm_eval.utils import (
eval_logger, eval_logger,
handle_non_serializable, handle_non_serializable,
...@@ -35,7 +41,7 @@ from lm_eval.utils import ( ...@@ -35,7 +41,7 @@ from lm_eval.utils import (
if TYPE_CHECKING: if TYPE_CHECKING:
from lm_eval.api.model import LM from lm_eval.api.model import LM
from lm_eval.tasks import Task from lm_eval.api.task import Task
@positional_deprecated @positional_deprecated
...@@ -44,7 +50,7 @@ def simple_evaluate( ...@@ -44,7 +50,7 @@ def simple_evaluate(
model_args: Optional[Union[str, dict]] = None, model_args: Optional[Union[str, dict]] = None,
tasks: Optional[List[Union[str, dict, object]]] = None, tasks: Optional[List[Union[str, dict, object]]] = None,
num_fewshot: Optional[int] = None, num_fewshot: Optional[int] = None,
batch_size: Optional[int] = None, batch_size: Optional[Union[int, str]] = None,
max_batch_size: Optional[int] = None, max_batch_size: Optional[int] = None,
device: Optional[str] = None, device: Optional[str] = None,
use_cache: Optional[str] = None, use_cache: Optional[str] = None,
...@@ -58,7 +64,7 @@ def simple_evaluate( ...@@ -58,7 +64,7 @@ def simple_evaluate(
log_samples: bool = True, log_samples: bool = True,
evaluation_tracker: Optional[EvaluationTracker] = None, evaluation_tracker: Optional[EvaluationTracker] = None,
system_instruction: Optional[str] = None, system_instruction: Optional[str] = None,
apply_chat_template: bool = False, apply_chat_template: Union[bool, str] = False,
fewshot_as_multiturn: bool = False, fewshot_as_multiturn: bool = False,
gen_kwargs: Optional[str] = None, gen_kwargs: Optional[str] = None,
task_manager: Optional[TaskManager] = None, task_manager: Optional[TaskManager] = None,
...@@ -106,8 +112,11 @@ def simple_evaluate( ...@@ -106,8 +112,11 @@ def simple_evaluate(
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
:param system_instruction: str :param system_instruction: str
System instruction to be applied to the prompt System instruction to be applied to the prompt
:param apply_chat_template: bool :param apply_chat_template: Union[bool, str]
If True, apply chat template to the prompt Specifies whether to apply a chat template to the prompt.
- If set to True, the default chat template is applied.
- If set to a string, applies the specified chat template by name.
Defaults to False (no chat template applied).
:param fewshot_as_multiturn: bool :param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn. Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param gen_kwargs: str :param gen_kwargs: str
...@@ -148,6 +157,9 @@ def simple_evaluate( ...@@ -148,6 +157,9 @@ def simple_evaluate(
seed_message.append(f"Setting torch manual seed to {torch_random_seed}") seed_message.append(f"Setting torch manual seed to {torch_random_seed}")
torch.manual_seed(torch_random_seed) torch.manual_seed(torch_random_seed)
if fewshot_random_seed is not None:
seed_message.append(f"Setting fewshot manual seed to {fewshot_random_seed}")
if seed_message: if seed_message:
eval_logger.info(" | ".join(seed_message)) eval_logger.info(" | ".join(seed_message))
...@@ -199,7 +211,9 @@ def simple_evaluate( ...@@ -199,7 +211,9 @@ def simple_evaluate(
) )
else: else:
if not isinstance(model, lm_eval.api.model.LM): if not isinstance(model, lm_eval.api.model.LM):
raise TypeError raise TypeError(
f"The value of `model` passed to simple_evaluate() was of type {type(model)}, but is required to be a subclass of lm_eval.api.model.LM . This may be because you are passing an initialized Hugging Face PreTrainedModel without having wrapped it in `lm_eval.models.huggingface.HFLM(pretrained=my_model)` first."
)
eval_logger.info("Using pre-initialized model") eval_logger.info("Using pre-initialized model")
lm = model lm = model
...@@ -219,48 +233,58 @@ def simple_evaluate( ...@@ -219,48 +233,58 @@ def simple_evaluate(
task_manager = TaskManager(verbosity) task_manager = TaskManager(verbosity)
task_dict = get_task_dict(tasks, task_manager) task_dict = get_task_dict(tasks, task_manager)
for task_name in task_dict.keys():
task_obj = task_dict[task_name]
if isinstance(task_obj, tuple):
_, task_obj = task_obj
if task_obj is None:
continue
if task_obj.get_config("output_type") == "generate_until":
if gen_kwargs is not None:
task_obj.set_config(
key="generation_kwargs", value=gen_kwargs, update=True
)
if predict_only: # helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups.
log_samples = True # (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed)
eval_logger.info( def _adjust_config(task_dict):
f"Processing {task_name} in output-only mode. Metrics will not be calculated!" adjusted_task_dict = {}
) for task_name, task_obj in task_dict.items():
# we have to change the class properties post-hoc. This is pretty hacky. if isinstance(task_obj, dict):
task_obj.override_metric(metric_name="bypass") adjusted_task_dict = {
**adjusted_task_dict,
# override tasks' fewshot values to the provided num_fewshot arg value **{task_name: _adjust_config(task_obj)},
# except if tasks have it set to 0 manually in their configs--then we should never overwrite that }
if num_fewshot is not None:
if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
eval_logger.info(
f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
)
else: else:
eval_logger.warning( if task_obj.get_config("output_type") == "generate_until":
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}" if gen_kwargs is not None:
) task_obj.set_config(
task_obj.set_config(key="num_fewshot", value=num_fewshot) key="generation_kwargs", value=gen_kwargs, update=True
else: )
# if num_fewshot not provided, and the task does not define a default one, default to 0
if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None: if predict_only:
task_obj.set_config(key="num_fewshot", value=0) eval_logger.info(
# fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file) f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
task_obj.set_fewshot_seed(seed=fewshot_random_seed) )
eval_logger.info( # we have to change the class properties post-hoc. This is pretty hacky.
f"Setting fewshot random generator seed to {fewshot_random_seed}" task_obj.override_metric(metric_name="bypass")
)
# override tasks' fewshot values to the provided num_fewshot arg value
# except if tasks have it set to 0 manually in their configs--then we should never overwrite that
if num_fewshot is not None:
if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
eval_logger.info(
f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
)
else:
eval_logger.warning(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
)
task_obj.set_config(key="num_fewshot", value=num_fewshot)
else:
# if num_fewshot not provided, and the task does not define a default one, default to 0
if (
default_num_fewshot := task_obj.get_config("num_fewshot")
) is None:
task_obj.set_config(key="num_fewshot", value=0)
# fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file)
task_obj.set_fewshot_seed(seed=fewshot_random_seed)
adjusted_task_dict[task_name] = task_obj
return adjusted_task_dict
task_dict = _adjust_config(task_dict)
if check_integrity: if check_integrity:
run_task_tests(task_list=tasks) run_task_tests(task_list=tasks)
...@@ -270,7 +294,7 @@ def simple_evaluate( ...@@ -270,7 +294,7 @@ def simple_evaluate(
model_source=model, model_source=model,
model_args=model_args, model_args=model_args,
system_instruction=system_instruction, system_instruction=system_instruction,
chat_template=lm.chat_template if apply_chat_template else None, chat_template=lm.chat_template(apply_chat_template),
fewshot_as_multiturn=fewshot_as_multiturn, fewshot_as_multiturn=fewshot_as_multiturn,
) )
...@@ -282,7 +306,7 @@ def simple_evaluate( ...@@ -282,7 +306,7 @@ def simple_evaluate(
rewrite_requests_cache=rewrite_requests_cache, rewrite_requests_cache=rewrite_requests_cache,
bootstrap_iters=bootstrap_iters, bootstrap_iters=bootstrap_iters,
write_out=write_out, write_out=write_out,
log_samples=log_samples, log_samples=True if predict_only else log_samples,
system_instruction=system_instruction, system_instruction=system_instruction,
apply_chat_template=apply_chat_template, apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn, fewshot_as_multiturn=fewshot_as_multiturn,
...@@ -343,7 +367,7 @@ def evaluate( ...@@ -343,7 +367,7 @@ def evaluate(
write_out: bool = False, write_out: bool = False,
log_samples: bool = True, log_samples: bool = True,
system_instruction: Optional[str] = None, system_instruction: Optional[str] = None,
apply_chat_template: bool = False, apply_chat_template: Union[bool, str] = False,
fewshot_as_multiturn: bool = False, fewshot_as_multiturn: bool = False,
verbosity: str = "INFO", verbosity: str = "INFO",
): ):
...@@ -363,8 +387,11 @@ def evaluate( ...@@ -363,8 +387,11 @@ def evaluate(
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
:param system_instruction: str :param system_instruction: str
System instruction to be applied to the prompt System instruction to be applied to the prompt
:param apply_chat_template: bool :param apply_chat_template: Union[bool, str]
If True, apply chat template to the prompt Specifies whether to apply a chat template to the prompt.
- If set to True, the default chat template is applied.
- If set to a string, applies the specified chat template by name.
Defaults to False (no chat template applied).
:param fewshot_as_multiturn: bool :param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn. Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:return :return
...@@ -380,16 +407,40 @@ def evaluate( ...@@ -380,16 +407,40 @@ def evaluate(
padding_requests = defaultdict(int) padding_requests = defaultdict(int)
# get lists of group hierarchy and each type of request # get lists of group hierarchy and each type of request
task_hierarchy, eval_tasks = get_task_list(task_dict) eval_tasks = get_task_list(task_dict)
if not log_samples: if not log_samples:
if not all( if not all(
"bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys() "bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys()
for task_output in eval_tasks for task_output in eval_tasks
): ):
raise ValueError("log_samples must be True for 'bypass' metric-only tasks") raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
# validation check: are we running multimodal task <-> non-multimodal model class, or vice-versa.
incompatible_tasks = []
for task_output in eval_tasks: for task_output in eval_tasks:
task: Task = task_output.task task: Task = task_output.task
limit = get_sample_size(task, limit)
if getattr(lm, "MULTIMODAL", False) != getattr(task, "MULTIMODAL", False):
incompatible_tasks.append(task_output.task_name)
if len(incompatible_tasks) > 0:
if not getattr(lm, "MULTIMODAL", False):
raise ValueError(
f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type."
)
else:
raise ValueError(
f"Attempted to run tasks: {incompatible_tasks} which are text-only, but used a model type which only currently supports multimodal tasks."
)
# end multimodality validation check
# Cache the limit arg.
limit_arg = limit
limits = []
for task_output in eval_tasks:
task: Task = task_output.task
limit = get_sample_size(task, limit_arg)
limits.append(limit)
task.build_all_requests( task.build_all_requests(
limit=limit, limit=limit,
rank=lm.rank, rank=lm.rank,
...@@ -397,7 +448,7 @@ def evaluate( ...@@ -397,7 +448,7 @@ def evaluate(
cache_requests=cache_requests, cache_requests=cache_requests,
rewrite_requests_cache=rewrite_requests_cache, rewrite_requests_cache=rewrite_requests_cache,
system_instruction=system_instruction, system_instruction=system_instruction,
apply_chat_template=apply_chat_template, apply_chat_template=bool(apply_chat_template),
fewshot_as_multiturn=fewshot_as_multiturn, fewshot_as_multiturn=fewshot_as_multiturn,
chat_template=getattr(lm, "apply_chat_template") chat_template=getattr(lm, "apply_chat_template")
if apply_chat_template if apply_chat_template
...@@ -459,7 +510,7 @@ def evaluate( ...@@ -459,7 +510,7 @@ def evaluate(
WORLD_SIZE = lm.world_size WORLD_SIZE = lm.world_size
### Postprocess outputs ### ### Postprocess outputs ###
# TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately) # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
for task_output in eval_tasks: for task_output, limit in zip(eval_tasks, limits):
task = task_output.task task = task_output.task
task.apply_filters() task.apply_filters()
...@@ -557,106 +608,45 @@ def evaluate( ...@@ -557,106 +608,45 @@ def evaluate(
### Calculate group metrics ### ### Calculate group metrics ###
if bool(results): if bool(results):
for group, task_list in reversed(task_hierarchy.items()): results, versions, show_group_table, *_ = consolidate_group_results(
if len(task_list) == 0: results, versions, task_dict
# task_hierarchy entries are either )
# `group_name: [subtask1, subtask2, ...]`
# or `task_name: []`. results_agg, group_agg = prepare_print_tasks(task_dict, results)
# we only want to operate on groups here. subtask_list = get_subtask_list(task_dict)
continue
# collect all higher_is_better values for metrics
# collect all higher_is_better values for metrics # in the group's subtasks.
# in the group's subtasks. # TODO: clean this up ; unify with the below metric_list loop?
# TODO: clean this up ; unify with the below metric_list loop? _higher_is_better = {}
_higher_is_better = {} for group, task_list in subtask_list.items():
if (
len(task_list) != 0
): # subtask list will list "task_name": [] for solo tasks
for task in task_list: for task in task_list:
for m, h in higher_is_better[task].items(): for m, h in higher_is_better[task].items():
if m not in _higher_is_better.keys(): if m not in _higher_is_better.keys():
_higher_is_better[m] = h _higher_is_better[m] = h
if (
m in _higher_is_better
and _higher_is_better[m] is not None
and _higher_is_better[m] != h
):
eval_logger.warning(
f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
)
_higher_is_better[m] = None
higher_is_better[group] = _higher_is_better
# collect all metric keys used by a subtask in the group.
metric_list = list(
{
key
for task in task_list
for key in results[task].keys()
if "_stderr" not in key and key not in ["alias", "samples"]
}
)
for metric in metric_list:
stderr = "_stderr,".join(metric.split(","))
# gather metrics, sizes, and stderrs from subtasks
metrics = [
results[task][metric]
for task in task_list
if metric in results[task]
] # TODO: copy?
stderrs = [
results[task][stderr]
for task in task_list
if stderr in results[task]
]
sizes = [
results[task]["samples"]
for task in task_list
if metric in results[task]
]
# compute group's pooled metric and stderr
results[group][metric] = (
lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
)
# TODO: calculate grouped metric using aggregation fn
if "N/A" in stderrs:
results[group][stderr] = "N/A"
else:
results[group][stderr] = (
lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
)
# TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
# To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
# results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
results[group]["samples"] = sum(sizes)
results_agg = defaultdict(dict)
groups_agg = defaultdict(dict)
all_tasks_list = list(task_hierarchy.keys())
while True:
add_tasks_list = list(k for k in results_agg.keys())
left_tasks_list = sorted(list(set(all_tasks_list) - set(add_tasks_list)))
if len(left_tasks_list) == 0:
break
_task_hierarchy = {
k: v for k, v in task_hierarchy.items() if k in left_tasks_list
}
_results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results)
results_agg = {**results_agg, **_results_agg} if (
groups_agg = {**groups_agg, **_groups_agg} m in _higher_is_better
and _higher_is_better[m] is not None
for group_name, task_list in task_hierarchy.items(): and _higher_is_better[m] != h
if task_list: ):
num_fewshot[group_name] = num_fewshot[ eval_logger.warning(
task_list[0] f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
] # TODO: validate this )
_higher_is_better[m] = None
higher_is_better[group] = _higher_is_better
results_dict = { results_dict = {
"results": dict(results_agg.items()), "results": dict(results_agg.items()),
**({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}), **(
"group_subtasks": dict(reversed(task_hierarchy.items())), {"groups": dict(group_agg.items())}
if (bool(group_agg) & show_group_table)
else {}
),
"group_subtasks": dict(reversed(subtask_list.items())),
"configs": dict(sorted(configs.items())), "configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())), "versions": dict(sorted(versions.items())),
"n-shot": dict(sorted(num_fewshot.items())), "n-shot": dict(sorted(num_fewshot.items())),
...@@ -669,7 +659,7 @@ def evaluate( ...@@ -669,7 +659,7 @@ def evaluate(
len(task_output.task.eval_docs), len(task_output.task.eval_docs),
), ),
} }
for task_output in eval_tasks for task_output, limit in zip(eval_tasks, limits)
}, },
} }
if log_samples: if log_samples:
......
...@@ -2,9 +2,15 @@ import collections ...@@ -2,9 +2,15 @@ import collections
import math import math
import pathlib import pathlib
import sys import sys
from typing import Dict, List, Optional, Tuple, Union from typing import List, Optional, Tuple, Union
from lm_eval.api import metrics from lm_eval.api.group import ConfigurableGroup
from lm_eval.api.metrics import (
aggregate_subtask_metrics,
pooled_sample_stderr,
stderr_for_metric,
)
from lm_eval.api.task import Task
from lm_eval.utils import eval_logger, positional_deprecated from lm_eval.utils import eval_logger, positional_deprecated
...@@ -98,7 +104,7 @@ class TaskOutput: ...@@ -98,7 +104,7 @@ class TaskOutput:
self.agg_metrics[metric_key] = agg_fn(items) self.agg_metrics[metric_key] = agg_fn(items)
self.sample_len = len(items) # TODO: same sample size for each metric? self.sample_len = len(items) # TODO: same sample size for each metric?
if isinstance(bootstrap_iters, int): if isinstance(bootstrap_iters, int):
stderr_fn = metrics.stderr_for_metric( stderr_fn = stderr_for_metric(
metric=agg_fn, metric=agg_fn,
bootstrap_iters=min(bootstrap_iters, 100) bootstrap_iters=min(bootstrap_iters, 100)
if metric in ["bleu", "chrf", "ter"] if metric in ["bleu", "chrf", "ter"]
...@@ -116,23 +122,71 @@ class TaskOutput: ...@@ -116,23 +122,71 @@ class TaskOutput:
return ( return (
f"TaskOutput(task_name={self.task_name}, " f"TaskOutput(task_name={self.task_name}, "
f"group_name={self.group_name}, " f"group_name={self.group_name}, "
f"version={self.version}," f"version={self.version}, "
f"n_shot={self.n_shot}" f"n_shot={self.n_shot}, "
f"task_alias={self.task_alias}, group_alias={self.group_alias})" f"task_alias={self.task_alias}, "
f"group_alias={self.group_alias})"
) )
def get_task_list(task_dict: dict) -> Tuple[Dict[str, list], List[TaskOutput]]: def get_task_list(task_dict: dict) -> List[TaskOutput]:
task_hierarchy = collections.defaultdict(list) outputs = []
outputs = list(TaskOutput.from_taskdict(x, y) for x, y in task_dict.items()) for task_name, task_obj in task_dict.items():
for task_output in outputs: if isinstance(task_obj, dict):
if group_name := task_output.group_name: _outputs = get_task_list(task_obj)
task_hierarchy[group_name].append(task_output.task_name) outputs.extend(_outputs)
else: else:
task_hierarchy[task_output.task_name] = [] task_output = TaskOutput.from_taskdict(task_name, task_obj)
# returns task_hierarchy tracking which groups contain which subtasks, outputs.append(task_output)
# and a list of TaskOutput classes for each non-group subtask
return task_hierarchy, [x for x in outputs if x.task] return outputs
def get_subtask_list(task_dict, task_root=None, depth=0):
subtask_list = {}
for group_obj, task_obj in task_dict.items():
if isinstance(group_obj, ConfigurableGroup):
# group_name = group_obj.group_name
group_name = group_obj.group_name
else:
group_name = group_obj
if isinstance(task_obj, dict):
_subtask_list = get_subtask_list(
task_obj, task_root=group_name, depth=depth + 1
)
if task_root:
subtask_list.setdefault((task_root, depth), []).extend(
[
_task
for (_task, _depth) in _subtask_list.keys()
if (_depth - 1) == depth
]
)
subtask_list = {**subtask_list, **_subtask_list}
else:
if isinstance(task_obj, ConfigurableGroup):
# group_or_task_name = task_obj.group_name
group_or_task_name = task_obj.group_name
elif isinstance(task_obj, Task):
# group_or_task_name = task_obj.task_name
group_or_task_name = task_obj.task_name
if task_root is None:
subtask_list.setdefault((group_or_task_name, depth), [])
else:
subtask_list.setdefault((task_root, depth), []).append(
group_or_task_name
)
if depth == 0:
_subtask_list = {}
for group_key, task_list in subtask_list.items():
group_name, depth = group_key
_subtask_list[group_name] = task_list
subtask_list = _subtask_list
return subtask_list
def print_writeout(task) -> None: def print_writeout(task) -> None:
...@@ -155,70 +209,95 @@ def get_sample_size(task, limit: Optional[int]) -> Union[int, None]: ...@@ -155,70 +209,95 @@ def get_sample_size(task, limit: Optional[int]) -> Union[int, None]:
def prepare_print_tasks( def prepare_print_tasks(
task_hierarchy: dict, results: dict, tab=0 task_dict: dict,
results: dict,
task_depth=0,
group_depth=0,
) -> Tuple[dict, dict]: ) -> Tuple[dict, dict]:
""" """
@param task_hierarchy: Dictionary representing the group hierarchy of tasks. Each key is a group name and its @param task_dict: Dictionary representing the group hierarchy of tasks. Each key is a group name and its
value is a list of task names. value is a list of task names.
@param results: Dictionary containing the results of each task. Each key is a @param results: Dictionary containing the results of each task. Each key is a
group name and its value is a dictionary of task results. group name and its value is a dictionary of task results.
@param tab: The indentation level for printing the task @param task_depth: The indentation level for printing the task
hierarchy. Default is 0.
@param group_depth: The indentation level for printing the group
hierarchy. Default is 0. hierarchy. Default is 0.
@return: A tuple of two dictionaries: results_agg and groups_agg. results_agg contains @return: A tuple of two dictionaries: results_agg and groups_agg. results_agg contains
aggregated results for each task, and groups_agg contains aggregated results for each group. aggregated results for each task, and groups_agg contains aggregated results for each group.
Prepares the task hierarchy and aggregates the results for each task and group recursively for printing. Prepares the task hierarchy and aggregates the results for each task and group recursively for printing.
""" """
results_agg = collections.defaultdict(dict)
groups_agg = collections.defaultdict(dict)
(group_name, task_list), *_ = task_hierarchy.items()
task_list = sorted(task_list)
results_agg[group_name] = results[group_name].copy()
# results_agg[group_name]["tab"] = tab
if "samples" in results_agg[group_name]:
results_agg[group_name].pop("samples")
tab_string = " " * tab + "- " if tab > 0 else ""
if "alias" in results_agg[group_name]: def _sort_task_dict(task_dict):
results_agg[group_name]["alias"] = tab_string + results_agg[group_name]["alias"] """
else: Helper utility. Sorts the task dict at the current level of the hierarchy based on alphabetized task name.
results_agg[group_name]["alias"] = tab_string + group_name Required so that we end up sorting within each sub-header correctly.
"""
if len(task_list) > 0:
groups_agg[group_name] = results[group_name].copy() return dict(
# groups_agg[group_name]["tab"] = tab sorted(
if "samples" in groups_agg[group_name]: task_dict.items(),
groups_agg[group_name].pop("samples") key=lambda item: item[0].group_name
if isinstance(item[0], ConfigurableGroup)
if "alias" in groups_agg[group_name]: else item[0],
groups_agg[group_name]["alias"] = (
tab_string + groups_agg[group_name]["alias"]
) )
else: )
groups_agg[group_name]["alias"] = tab_string + group_name
for task_name in task_list: task_agg = collections.defaultdict(dict)
if task_name in task_hierarchy: group_agg = collections.defaultdict(dict)
_task_hierarchy = { task_dict = _sort_task_dict(task_dict)
**{task_name: task_hierarchy[task_name]}, for task_or_group_name, task_or_group_obj in task_dict.items():
**task_hierarchy, tab_string = " " * task_depth + "- " if task_depth > 0 else ""
} if isinstance(task_or_group_name, ConfigurableGroup):
# string_name = task_or_group_name.group_name
name = task_or_group_name.group_name
from_configurable_group = True
task_or_group_obj = _sort_task_dict(task_or_group_obj)
elif isinstance(task_or_group_name, str):
name = task_or_group_name
if isinstance(task_or_group_obj, Task):
# string_name = task_or_group_obj.task_name
name = task_or_group_obj.task_name
from_configurable_group = False
task_agg[name] = results[name].copy()
if from_configurable_group:
if task_or_group_name.group_alias is not None:
alias = task_or_group_name.group_alias
else: else:
_task_hierarchy = { alias = task_or_group_name.group
**{task_name: []}, else:
**task_hierarchy, if "alias" in task_agg[name]:
} alias = task_agg[name]["alias"]
else:
_results_agg, _groups_agg = prepare_print_tasks( alias = name
_task_hierarchy, results, tab + 1
task_agg[name]["alias"] = tab_string + alias
if "samples" in task_agg[name]:
task_agg[name].pop("samples")
if from_configurable_group and (" " not in results[name]):
group_tab_string = " " * group_depth + "- " if group_depth > 0 else ""
group_agg[name] = results[name].copy()
group_agg[name]["alias"] = group_tab_string + alias
if "samples" in group_agg[name]:
group_agg[name].pop("samples")
if isinstance(task_or_group_obj, dict):
task_depth += 1
group_depth += 1
_task_agg, _group_agg = prepare_print_tasks(
task_or_group_obj, results, task_depth, group_depth
) )
results_agg = {**results_agg, **_results_agg} task_agg = {
groups_agg = {**groups_agg, **_groups_agg} **task_agg,
**_task_agg,
return results_agg, groups_agg }
group_agg = {**group_agg, **_group_agg}
task_depth -= 1
group_depth -= 1
return task_agg, group_agg
def consolidate_results( def consolidate_results(
...@@ -261,6 +340,8 @@ def consolidate_results( ...@@ -261,6 +340,8 @@ def consolidate_results(
for task_output in eval_tasks: for task_output in eval_tasks:
if "task_alias" in (task_config := task_output.task_config): if "task_alias" in (task_config := task_output.task_config):
results[task_output.task_name]["alias"] = task_config["task_alias"] results[task_output.task_name]["alias"] = task_config["task_alias"]
else:
results[task_output.task_name]["alias"] = task_output.task_name
if group_alias := task_output.group_alias: if group_alias := task_output.group_alias:
if group_alias not in results and (group_name := task_output.group_name): if group_alias not in results and (group_name := task_output.group_name):
results[group_name]["alias"] = group_alias results[group_name]["alias"] = group_alias
...@@ -281,6 +362,147 @@ def consolidate_results( ...@@ -281,6 +362,147 @@ def consolidate_results(
return results, samples, configs, versions, num_fewshot, higher_is_better return results, samples, configs, versions, num_fewshot, higher_is_better
def consolidate_group_results(
results,
versions,
task_dict,
task_root=None,
show_group_table=False,
task_aggregation_list=None,
) -> Tuple[dict, dict, bool, Union[None,]]:
"""
(Recursively) calculates groups' aggregated metrics and updates the results and versions dictionaries with this info.
@return: a tuple [results, versions, show_group_table, task_aggregation_list] with formats described below:
- results: A defaultdict with task names (and, after this function is called, group names of
groups that perform aggregation) as keys, and dictionaries with "alias" and metric,filter_name pairs as keys.
- versions: A defaultdict with task names (and, after this function is called, group names of
groups that perform aggregation) as keys, and float values representing the task or group's version if a version is specified. (defaulting to None).
- show_group_table: a boolean which is true if there exists a group that requires printing of its aggregated scores in a group table.
- task_aggregation_list: a defaultdict listing the subtasks to average over to produce a given group's end metric.
The method then returns the updated results, versions, show_group_table, and task_aggregation_list as a tuple.
In the top-level invocation of this function, task_aggregation_list is ignored.
"""
if task_root is None:
task_root = {}
if task_aggregation_list is None:
task_aggregation_list = {}
for group_or_task, group_or_task_info in task_dict.items():
# Convert to string
if isinstance(group_or_task, ConfigurableGroup):
group_config = group_or_task.config
group_or_task = group_or_task.group_name
else:
group_config = None
if isinstance(group_or_task_info, Task):
if task_root:
task_aggregation_list.setdefault(task_root, []).append(
group_or_task_info.task_name
)
else:
(
results,
versions,
show_group_table,
_task_aggregation_list,
) = consolidate_group_results(
results,
versions,
group_or_task_info,
group_or_task,
show_group_table,
task_aggregation_list,
)
if task_root:
task_aggregation_list.setdefault(task_root, []).extend(
task_aggregation_list.get(group_or_task, [])
)
if (group_config is None) or (
group_config["aggregate_metric_list"] is None
):
results[group_or_task][" "] = " "
continue
if "aggregate_metric_list" in group_config:
agg_metric_list = group_config["aggregate_metric_list"]
show_group_table = show_group_table | bool(
group_config["aggregate_metric_list"]
)
task_list = _task_aggregation_list[group_or_task]
metric_list = list(
{
key
for task in task_list
for key in results[task].keys()
if "_stderr" not in key and key not in ["task", "alias", "samples"]
}
)
for metric in metric_list:
stderr = "_stderr,".join(metric.split(","))
# gather metrics, sizes, and stderrs from subtasks
metrics = [
results[task][metric]
for task in task_list
if metric in results[task]
] # TODO: copy?
stderrs = [
results[task][stderr]
for task in task_list
if stderr in results[task]
]
sizes = [
results[task]["samples"]
for task in task_list
if metric in results[task]
]
for metric_config in agg_metric_list:
for filter_name in metric_config["filter_list"]:
if metric != ",".join([metric_config["metric"], filter_name]):
continue
# compute group's pooled metric and stderr
if metric_config["aggregation"] == "mean":
aggregate_fn = aggregate_subtask_metrics
elif callable(metric_config["aggregation"]):
aggregate_fn = metric_config["aggregation"]
else:
raise ValueError(
f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{metric_config['aggregation']}' for group '{group_or_task}'"
)
results[group_or_task][metric] = aggregate_fn(
metrics,
sizes,
metric_config["weight_by_size"],
)
# TODO: calculate groups' metrics using arbitrary agg fns
if "N/A" in stderrs:
results[group_or_task][stderr] = "N/A"
else:
# NOTE: this assumes we are using the mean to aggregate. There are warnings about this elsewhere
results[group_or_task][stderr] = pooled_sample_stderr(
stderrs, sizes
)
results[group_or_task]["samples"] = sum(sizes)
group_metadata = group_config.get("metadata", None)
if group_metadata is not None:
versions[group_or_task] = group_metadata.get("version", None)
# print(results)
return results, versions, show_group_table, task_aggregation_list
@positional_deprecated @positional_deprecated
def find_test_root(start_path: pathlib.Path) -> pathlib.Path: def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
""" """
......
...@@ -62,11 +62,8 @@ class WhitespaceFilter(Filter): ...@@ -62,11 +62,8 @@ class WhitespaceFilter(Filter):
def filter_set(inst): def filter_set(inst):
filtered_resp = [] filtered_resp = []
for resp in inst: for resp in inst:
if resp.startswith(" "): resp = resp.lstrip()
resp = resp[1:]
filtered_resp.append(resp) filtered_resp.append(resp)
return filtered_resp return filtered_resp
filtered_resps = [filter_set(resp) for resp in resps] filtered_resps = [filter_set(resp) for resp in resps]
......
import json import json
import os
import re import re
import time import time
from collections import defaultdict from collections import defaultdict
...@@ -14,6 +15,7 @@ from huggingface_hub import ( ...@@ -14,6 +15,7 @@ from huggingface_hub import (
HfApi, HfApi,
hf_hub_url, hf_hub_url,
) )
from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
from lm_eval.utils import ( from lm_eval.utils import (
eval_logger, eval_logger,
...@@ -112,12 +114,15 @@ class EvaluationTracker: ...@@ -112,12 +114,15 @@ class EvaluationTracker:
output_path: str = None, output_path: str = None,
hub_results_org: str = "", hub_results_org: str = "",
hub_repo_name: str = "", hub_repo_name: str = "",
details_repo_name: str = "",
results_repo_name: str = "",
push_results_to_hub: bool = False, push_results_to_hub: bool = False,
push_samples_to_hub: bool = False, push_samples_to_hub: bool = False,
public_repo: bool = False, public_repo: bool = False,
token: str = "", token: str = "",
leaderboard_url: str = "", leaderboard_url: str = "",
point_of_contact: str = "", point_of_contact: str = "",
gated: bool = False,
) -> None: ) -> None:
""" """
Creates all the necessary loggers for evaluation tracking. Creates all the necessary loggers for evaluation tracking.
...@@ -126,12 +131,15 @@ class EvaluationTracker: ...@@ -126,12 +131,15 @@ class EvaluationTracker:
output_path (str): Path to save the results. If not provided, the results won't be saved. output_path (str): Path to save the results. If not provided, the results won't be saved.
hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token. hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token.
hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`. hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
details_repo_name (str): The name of the Hugging Face repository to push the details to. If not provided, the results will be pushed to `lm-eval-results`.
result_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will not be pushed and will be found in the details_hub_repo.
push_results_to_hub (bool): Whether to push the results to the Hugging Face hub. push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub. push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
public_repo (bool): Whether to push the results to a public or private repository. public_repo (bool): Whether to push the results to a public or private repository.
token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`. token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card. leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card.
point_of_contact (str): Contact information on the Hugging Face hub dataset card. point_of_contact (str): Contact information on the Hugging Face hub dataset card.
gated (bool): Whether to gate the repository.
""" """
self.general_config_tracker = GeneralConfigTracker() self.general_config_tracker = GeneralConfigTracker()
...@@ -142,6 +150,7 @@ class EvaluationTracker: ...@@ -142,6 +150,7 @@ class EvaluationTracker:
self.leaderboard_url = leaderboard_url self.leaderboard_url = leaderboard_url
self.point_of_contact = point_of_contact self.point_of_contact = point_of_contact
self.api = HfApi(token=token) if token else None self.api = HfApi(token=token) if token else None
self.gated_repo = gated
if not self.api and (push_results_to_hub or push_samples_to_hub): if not self.api and (push_results_to_hub or push_samples_to_hub):
raise ValueError( raise ValueError(
...@@ -159,9 +168,24 @@ class EvaluationTracker: ...@@ -159,9 +168,24 @@ class EvaluationTracker:
f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'." f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'."
) )
hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results" if hub_repo_name == "":
self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}" details_repo_name = (
self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private" details_repo_name if details_repo_name != "" else "lm-eval-results"
)
results_repo_name = (
results_repo_name if results_repo_name != "" else details_repo_name
)
else:
details_repo_name = hub_repo_name
results_repo_name = hub_repo_name
eval_logger.warning(
"hub_repo_name was specified. Both details and results will be pushed to the same repository. Using hub_repo_name is no longer recommended, details_repo_name and results_repo_name should be used instead."
)
self.details_repo = f"{hub_results_org}/{details_repo_name}"
self.details_repo_private = f"{hub_results_org}/{details_repo_name}-private"
self.results_repo = f"{hub_results_org}/{results_repo_name}"
self.results_repo_private = f"{hub_results_org}/{results_repo_name}-private"
def save_results_aggregated( def save_results_aggregated(
self, self,
...@@ -211,9 +235,9 @@ class EvaluationTracker: ...@@ -211,9 +235,9 @@ class EvaluationTracker:
if self.api and self.push_results_to_hub: if self.api and self.push_results_to_hub:
repo_id = ( repo_id = (
self.hub_results_repo self.results_repo
if self.public_repo if self.public_repo
else self.hub_results_repo_private else self.results_repo_private
) )
self.api.create_repo( self.api.create_repo(
repo_id=repo_id, repo_id=repo_id,
...@@ -221,10 +245,15 @@ class EvaluationTracker: ...@@ -221,10 +245,15 @@ class EvaluationTracker:
private=not self.public_repo, private=not self.public_repo,
exist_ok=True, exist_ok=True,
) )
self.api.upload_folder( self.api.upload_file(
repo_id=repo_id, repo_id=repo_id,
folder_path=str(path), path_or_fileobj=str(
path_in_repo=self.general_config_tracker.model_name_sanitized, path.joinpath(f"results_{self.date_id}.json")
),
path_in_repo=os.path.join(
self.general_config_tracker.model_name,
f"results_{self.date_id}.json",
),
repo_type="dataset", repo_type="dataset",
commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}", commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
) )
...@@ -278,6 +307,7 @@ class EvaluationTracker: ...@@ -278,6 +307,7 @@ class EvaluationTracker:
sample["resps"] = sanitize_list(sample["resps"]) sample["resps"] = sanitize_list(sample["resps"])
sample["filtered_resps"] = sanitize_list(sample["filtered_resps"]) sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
sample["arguments"] = arguments sample["arguments"] = arguments
sample["target"] = str(sample["target"])
sample_dump = ( sample_dump = (
json.dumps( json.dumps(
...@@ -288,14 +318,14 @@ class EvaluationTracker: ...@@ -288,14 +318,14 @@ class EvaluationTracker:
+ "\n" + "\n"
) )
with open(file_results_samples, "a") as f: with open(file_results_samples, "a", encoding="utf-8") as f:
f.write(sample_dump) f.write(sample_dump)
if self.api and self.push_samples_to_hub: if self.api and self.push_samples_to_hub:
repo_id = ( repo_id = (
self.hub_results_repo self.details_repo
if self.public_repo if self.public_repo
else self.hub_results_repo_private else self.details_repo_private
) )
self.api.create_repo( self.api.create_repo(
repo_id=repo_id, repo_id=repo_id,
...@@ -303,6 +333,18 @@ class EvaluationTracker: ...@@ -303,6 +333,18 @@ class EvaluationTracker:
private=not self.public_repo, private=not self.public_repo,
exist_ok=True, exist_ok=True,
) )
try:
if self.gated_repo:
headers = build_hf_headers()
r = get_session().put(
url=f"https://huggingface.co/api/datasets/{repo_id}/settings",
headers=headers,
json={"gated": "auto"},
)
hf_raise_for_status(r)
except Exception as e:
eval_logger.warning("Could not gate the repository")
eval_logger.info(repr(e))
self.api.upload_folder( self.api.upload_folder(
repo_id=repo_id, repo_id=repo_id,
folder_path=str(path), folder_path=str(path),
...@@ -327,9 +369,7 @@ class EvaluationTracker: ...@@ -327,9 +369,7 @@ class EvaluationTracker:
""" """
eval_logger.info("Recreating metadata card") eval_logger.info("Recreating metadata card")
repo_id = ( repo_id = self.details_repo if self.public_repo else self.details_repo_private
self.hub_results_repo if self.public_repo else self.hub_results_repo_private
)
files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset") files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
results_files = get_results_filenames(files_in_repo) results_files = get_results_filenames(files_in_repo)
...@@ -360,7 +400,10 @@ class EvaluationTracker: ...@@ -360,7 +400,10 @@ class EvaluationTracker:
results_datetime, results_datetime,
) )
latest_task_results_datetime[samples_key] = latest_datetime latest_task_results_datetime[samples_key] = latest_datetime
latest_task_results_datetime[results_key] = latest_datetime latest_task_results_datetime[results_key] = max(
latest_task_results_datetime[results_key],
latest_datetime,
)
# Create metadata card # Create metadata card
card_metadata = MetadataConfigs() card_metadata = MetadataConfigs()
...@@ -377,14 +420,15 @@ class EvaluationTracker: ...@@ -377,14 +420,15 @@ class EvaluationTracker:
sanitized_last_eval_date_results = re.sub( sanitized_last_eval_date_results = re.sub(
r"[^\w\.]", "_", latest_task_results_datetime[config_name] r"[^\w\.]", "_", latest_task_results_datetime[config_name]
) )
# Ensure that all results files are listed in the metadata card
current_results = card_metadata.get(config_name, {"data_files": []})
current_results["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_results
# If the results file is the newest, update the "latest" field in the metadata card
if eval_date_sanitized == sanitized_last_eval_date_results: if eval_date_sanitized == sanitized_last_eval_date_results:
# Ensure that all results files are listed in the metadata card
current_results = card_metadata.get(config_name, {"data_files": []})
current_results["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_results
# If the results file is the newest, update the "latest" field in the metadata card
card_metadata[config_name]["data_files"].append( card_metadata[config_name]["data_files"].append(
{"split": "latest", "path": [str(results_filename)]} {"split": "latest", "path": [str(results_filename)]}
) )
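Illustrative only: after the loop above, one entry in `card_metadata` has roughly this shape (the config name, date, and filename below are made up).

example_entry = {
    "data_files": [
        # one split per sanitized evaluation date...
        {"split": "2024_09_01T10_30_00.123456", "path": ["my-model/results_2024-09-01T10-30-00.123456.json"]},
        # ...plus a "latest" split pointing at the newest results file
        {"split": "latest", "path": ["my-model/results_2024-09-01T10-30-00.123456.json"]},
    ]
}
# card_metadata["my-model"] = example_entry  # hypothetical config name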
...@@ -403,65 +447,20 @@ class EvaluationTracker: ...@@ -403,65 +447,20 @@ class EvaluationTracker:
sanitized_last_eval_date_results = re.sub( sanitized_last_eval_date_results = re.sub(
r"[^\w\.]", "_", latest_task_results_datetime[config_name] r"[^\w\.]", "_", latest_task_results_datetime[config_name]
) )
# Ensure that all sample results files are listed in the metadata card
current_details_for_task = card_metadata.get(
config_name, {"data_files": []}
)
current_details_for_task["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_details_for_task
# If the samples results file is the newest, update the "latest" field in the metadata card
if eval_date_sanitized == sanitized_last_eval_date_results: if eval_date_sanitized == sanitized_last_eval_date_results:
# Ensure that all sample results files are listed in the metadata card
current_details_for_task = card_metadata.get(
config_name, {"data_files": []}
)
current_details_for_task["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_details_for_task
# If the samples results file is the newest, update the "latest" field in the metadata card
card_metadata[config_name]["data_files"].append( card_metadata[config_name]["data_files"].append(
{"split": "latest", "path": [str(results_filename)]} {"split": "latest", "path": [str(results_filename)]}
) )
# Special case for grouped benchmarks (MMLU, GPQA, Minerva-MATH) whose results are spread across many subtask files
# We add another config gathering all of the group's results together for easy inspection
SPECIAL_TASKS = ["mmlu", "gpqa", "minerva_math"]
for special_task in SPECIAL_TASKS:
if special_task in config_name:
special_task = f"{model_name}__{special_task}"
former_entry = card_metadata.get(special_task, {"data_files": []})
former_split = [
(i, entry)
for i, entry in enumerate(former_entry["data_files"])
if entry.get("split", None) == eval_date_sanitized
]
if len(former_split) == 0:
former_entry["data_files"].append(
{
"split": eval_date_sanitized,
"path": [str(results_filename)],
}
)
else:
split_index, _ = former_split[0]
former_entry["data_files"][split_index]["path"].append(
str(results_filename)
)
if eval_date_sanitized == sanitized_last_eval_date_results:
latest_split = [
(i, entry)
for i, entry in enumerate(former_entry["data_files"])
if entry.get("split", None) == "latest"
]
if len(latest_split) == 0:
former_entry["data_files"].append(
{"split": "latest", "path": [str(results_filename)]}
)
else:
latest_index, _ = latest_split[0]
former_entry["data_files"][latest_index]["path"].append(
str(results_filename)
)
card_metadata[special_task] = former_entry
# Get latest results and extract info to update metadata card examples # Get latest results and extract info to update metadata card examples
latest_datetime = max(latest_task_results_datetime.values()) latest_datetime = max(latest_task_results_datetime.values())
latest_model_name = max( latest_model_name = max(
......
...@@ -114,15 +114,29 @@ def add_env_info(storage: Dict[str, Any]): ...@@ -114,15 +114,29 @@ def add_env_info(storage: Dict[str, Any]):
def add_tokenizer_info(storage: Dict[str, Any], lm): def add_tokenizer_info(storage: Dict[str, Any], lm):
if getattr(lm, "tokenizer", False): if getattr(lm, "tokenizer", False):
tokenizer_info = { try:
"tokenizer_pad_token": [lm.tokenizer.pad_token, lm.tokenizer.pad_token_id], tokenizer_info = {
"tokenizer_eos_token": [lm.tokenizer.eos_token, lm.tokenizer.eos_token_id], "tokenizer_pad_token": [
"tokenizer_bos_token": [lm.tokenizer.bos_token, lm.tokenizer.bos_token_id], lm.tokenizer.pad_token,
"eot_token_id": getattr(lm, "eot_token_id", None), str(lm.tokenizer.pad_token_id),
"max_length": getattr(lm, "max_length", None), ],
} "tokenizer_eos_token": [
storage.update(tokenizer_info) lm.tokenizer.eos_token,
# seems gguf and textsynth do not have tokenizer str(lm.tokenizer.eos_token_id),
],
"tokenizer_bos_token": [
lm.tokenizer.bos_token,
str(lm.tokenizer.bos_token_id),
],
"eot_token_id": getattr(lm, "eot_token_id", None),
"max_length": getattr(lm, "max_length", None),
}
storage.update(tokenizer_info)
except Exception as err:
logger.debug(
f"Logging detailed tokenizer info failed with {err}, skipping..."
)
# seems gguf and textsynth do not have tokenizer
else: else:
logger.debug( logger.debug(
"LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results." "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."
......
from . import ( from . import (
anthropic_llms, anthropic_llms,
api_models,
dummy, dummy,
gguf, gguf,
hf_vlms,
huggingface, huggingface,
ibm_watsonx_ai,
mamba_lm, mamba_lm,
nemo_lm, nemo_lm,
neuralmagic, neuralmagic,
...@@ -11,6 +14,7 @@ from . import ( ...@@ -11,6 +14,7 @@ from . import (
optimum_lm, optimum_lm,
textsynth, textsynth,
vllm_causallms, vllm_causallms,
vllm_vlms,
) )
......
from typing import Any, List, Tuple import os
from functools import cached_property
from typing import Any, Dict, List, Tuple, Union
from tqdm import tqdm from tqdm import tqdm
from lm_eval import utils from lm_eval import utils
from lm_eval.api.model import LM from lm_eval.api.model import LM
from lm_eval.api.registry import register_model from lm_eval.api.registry import register_model
from lm_eval.models.openai_completions import LocalCompletionsAPI
from lm_eval.models.utils import retry_on_specific_exceptions from lm_eval.models.utils import retry_on_specific_exceptions
...@@ -42,8 +45,8 @@ def anthropic_completion( ...@@ -42,8 +45,8 @@ def anthropic_completion(
try: try:
import anthropic import anthropic
except ModuleNotFoundError: except ModuleNotFoundError as exception:
raise Exception( raise type(exception)(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
) )
...@@ -105,8 +108,8 @@ def anthropic_chat( ...@@ -105,8 +108,8 @@ def anthropic_chat(
try: try:
import anthropic import anthropic
except ModuleNotFoundError: except ModuleNotFoundError as exception:
raise Exception( raise type(exception)(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
) )
...@@ -138,7 +141,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install ...@@ -138,7 +141,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
return messages() return messages()
@register_model("anthropic") @register_model("anthropic-completions")
class AnthropicLM(LM): class AnthropicLM(LM):
REQ_CHUNK_SIZE = 20 # TODO: not used REQ_CHUNK_SIZE = 20 # TODO: not used
...@@ -165,8 +168,8 @@ class AnthropicLM(LM): ...@@ -165,8 +168,8 @@ class AnthropicLM(LM):
try: try:
import anthropic import anthropic
except ModuleNotFoundError: except ModuleNotFoundError as exception:
raise Exception( raise type(exception)(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
) )
...@@ -214,8 +217,8 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install ...@@ -214,8 +217,8 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
try: try:
import anthropic import anthropic
except ModuleNotFoundError: except ModuleNotFoundError as exception:
raise Exception( raise type(exception)(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
) )
...@@ -271,90 +274,89 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install ...@@ -271,90 +274,89 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
@register_model("anthropic-chat", "anthropic-chat-completions") @register_model("anthropic-chat", "anthropic-chat-completions")
class AnthropicChatLM(AnthropicLM): class AnthropicChat(LocalCompletionsAPI):
REQ_CHUNK_SIZE = 20 # TODO: not used
def __init__( def __init__(
self, self,
model: str, base_url="https://api.anthropic.com/v1/messages",
batch_size: int = 1, tokenizer_backend=None,
max_tokens: int = 256, **kwargs,
temperature: float = 0, # defaults to 1 ):
**kwargs, # top_p, top_k, etc. super().__init__(
) -> None: base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
"""Anthropic API wrapper. )
eval_logger.warning(
:param model: str "Chat completions does not support batching. Defaulting to batch size 1."
Anthropic model e.g. 'claude-3-opus-20240229', 'claude-3-sonnet-20240229' )
:param max_tokens: int self._batch_size = 1
Maximum number of tokens to sample from the model self.anthropic_version = "2023-06-01"
:param temperature: float eval_logger.warning(
Sampling temperature f"Using Anthropic Version: {self.anthropic_version}. Confirm the current version here: https://docs.anthropic.com/en/api/versioning"
:param kwargs: Any )
Additional model_args to pass to the API client
"""
super().__init__()
try: @cached_property
import anthropic def api_key(self):
except ModuleNotFoundError: """Override this property to return the API key for the API request."""
raise Exception( key = os.environ.get("ANTHROPIC_API_KEY", None)
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ if key is None:
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", raise ValueError(
"API key not found. Please set the ANTHROPIC_API_KEY environment variable."
) )
return key
self.model = model
# defaults to os.environ.get("ANTHROPIC_API_KEY") @cached_property
self.client = anthropic.Anthropic() def header(self):
self.temperature = temperature return {
self.max_tokens = max_tokens "x-api-key": f"{self.api_key}",
self.tokenizer = self.client.get_tokenizer() "anthropic-version": self.anthropic_version,
self.kwargs = kwargs }
@property def _create_payload(
def max_gen_toks(self) -> int: self, messages: List[Dict], generate=True, gen_kwargs: dict = None, **kwargs
return self.max_tokens ) -> dict:
system = (
def generate_until(self, requests) -> List[str]: messages[0].get("content") if messages[0].get("role") == "system" else None
try: )
import anthropic if system:
except ModuleNotFoundError: messages = messages[1:]
raise Exception( gen_kwargs.pop("do_sample", False)
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", temperature = gen_kwargs.pop("temperature", 0)
) stop = gen_kwargs.pop("until", ["\n\nHuman:"])
if not isinstance(stop, list):
if not requests: stop = [stop]
return [] out = {
"messages": messages,
_requests: List[Tuple[str, dict]] = [req.args for req in requests] "model": self.model,
"max_tokens": max_tokens,
"temperature": temperature,
"stop_sequences": stop,
**gen_kwargs,
}
if system:
out["system"] = system
return out
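Illustrative only: for a single user turn with default generation settings, `_create_payload` above produces a Messages-API body along these lines (model id and prompt are made up).

example_payload = {
    "messages": [{"role": "user", "content": "What is the capital of France?"}],
    "model": "claude-3-sonnet-20240229",  # hypothetical model id
    "max_tokens": 256,
    "temperature": 0,
    "stop_sequences": ["\n\nHuman:"],
}
# A leading {"role": "system", ...} message would instead be lifted into example_payload["system"].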
def parse_generations(
self, outputs: Union[Dict, List[Dict]], **kwargs
) -> List[str]:
res = [] res = []
for request in tqdm(_requests): if not isinstance(outputs, list):
try: outputs = [outputs]
inp = request[0] for out in outputs:
request_args = request[1] for choices in out["content"]:
# generation_kwargs res.append(choices["text"])
until = request_args.get("until")
max_tokens = request_args.get("max_gen_toks", self.max_length)
temperature = request_args.get("temperature", self.temperature)
response = anthropic_chat(
client=self.client,
model=self.model,
prompt=inp,
max_tokens=max_tokens,
temperature=temperature, # TODO: implement non-greedy sampling for Anthropic
stop=until, # type: ignore
**self.kwargs,
)
res.append(response)
self.cache_hook.add_partial("generate_until", request, response)
except anthropic.APIConnectionError as e: # type: ignore # noqa: F821
eval_logger.critical(f"Server unreachable: {e.__cause__}")
break
except anthropic.APIStatusError as e: # type: ignore # noqa: F821
eval_logger.critical(f"API error {e.status_code}: {e.message}")
break
return res return res
def tok_encode(
self,
string: str,
left_truncate_len=None,
add_special_tokens=None,
**kwargs,
) -> List[str]:
return [string]
def loglikelihood(self, requests, **kwargs):
raise NotImplementedError(
"Anthropic Chat Completions API does not support the return of loglikelihood"
)
import abc
import asyncio
import copy
import itertools
import json
from functools import cached_property
from typing import (
Any,
Awaitable,
Callable,
Dict,
Iterable,
List,
Literal,
NamedTuple,
Optional,
Tuple,
Union,
)
try:
import requests
from aiohttp import ClientSession, TCPConnector
from tenacity import RetryError, retry, stop_after_attempt, wait_exponential
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio
except ModuleNotFoundError:
pass
from importlib.util import find_spec
from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM
from lm_eval.models.utils import Collator, chunks, configure_pad_token
LogLikelihoodInputs = Tuple[Tuple[str, str], List[int], List[int]]
# utility class to keep track of json encoded chats
class JsonChatStr(NamedTuple):
prompt: str
def encode(self, encoding):
return self.prompt.encode(encoding)
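A quick sketch of the round trip this wrapper enables: the chat history is frozen as a JSON string (handy for caching and sorting), then decoded back to a list of message dicts in create_message. It uses the JsonChatStr defined above and the module's existing json import; the chat content is made up.

chat = [{"role": "user", "content": "Hello"}]   # hypothetical chat history
wrapped = JsonChatStr(json.dumps(chat))          # what apply_chat_template returns
assert json.loads(wrapped.prompt) == chat        # what create_message hands to the API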
eval_logger = utils.eval_logger
class TemplateAPI(TemplateLM):
def __init__(
self,
model: str = None,
pretrained: str = None, # `model` takes precedence over `pretrained` when passed.
base_url: str = None,
tokenizer: Optional[str] = None,
# Loglikelihood tasks require a tokenizer to calculate context lengths,
# however the requests can be sent as a string if the API doesn't support token inputs.
# use tokenized_requests=False
tokenizer_backend: Optional[
Literal["tiktoken", "huggingface", None]
] = "huggingface",
truncate: bool = False,
# number of concurrent requests. More useful if not batching
num_concurrent: int = 1,
max_retries: int = 3,
max_gen_toks: int = 256,
batch_size: Union[str, int] = 1,
seed: int = 1234,
max_length: Optional[int] = 2048,
add_bos_token: bool = False,
custom_prefix_token_id: int = None,
# send the requests as tokens or strings
tokenized_requests: bool = True,
trust_remote_code: bool = False,
revision: Optional[str] = "main",
use_fast_tokenizer: bool = True,
verify_certificate: bool = True,
**kwargs,
) -> None:
super().__init__()
missing_packages = [
pkg
for pkg in ["aiohttp", "tqdm", "tenacity", "requests"]
if find_spec(pkg) is None
]
if missing_packages:
raise ModuleNotFoundError(
f"Attempted to use an API model, but the required packages {missing_packages} are not installed. "
'Please install these via `pip install lm-eval[api]` or `pip install -e ."[api]"`'
)
self.model = model or pretrained
self.base_url = base_url
self.tokenizer = tokenizer
if not isinstance(batch_size, int) and "auto" in batch_size:
eval_logger.warning(
"Automatic batch size is not supported for API models. Defaulting to batch size 1."
)
elif int(batch_size) > 1:
eval_logger.warning(
"Batch size > 1 detected. Ensure your API supports batched requests with varying total sequence lengths."
)
self._batch_size = int(batch_size) if batch_size != "auto" else 1
self._truncate = truncate
self._max_gen_toks = int(max_gen_toks)
self._seed = int(seed)
# max_length - 1 as we always have 1 token for generation
eval_logger.info(f"Using max length {max_length} - 1")
self.max_length = max_length - 1
if int(num_concurrent) <= 1:
eval_logger.info(
"Concurrent requests are disabled. To enable concurrent requests, set `num_concurrent` > 1."
)
self._concurrent = int(num_concurrent)
self.tokenizer_backend = tokenizer_backend
self.add_bos_token = add_bos_token
self.custom_prefix_token_id = custom_prefix_token_id
self.tokenized_requests = tokenized_requests
self.max_retries = int(max_retries)
self.verify_certificate = verify_certificate
eval_logger.info(f"Using tokenizer {self.tokenizer_backend}")
if self.tokenizer_backend is None:
self.tokenizer = None
self.tokenized_requests = False
else:
if self.tokenizer is None:
if self.tokenizer_backend == "huggingface":
import transformers
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
self.tokenizer if self.tokenizer else self.model,
trust_remote_code=trust_remote_code,
revision=revision,
use_fast=use_fast_tokenizer,
)
# Not used (the API handles padding); set only to mirror the behavior of HFLM
self.tokenizer = configure_pad_token(self.tokenizer)
elif self.tokenizer_backend == "tiktoken":
try:
import tiktoken
self.tokenizer = tiktoken.encoding_for_model(self.model)
except ModuleNotFoundError as e:
raise ModuleNotFoundError(
"Attempted to use 'openai' LM type, but the package `tiktoken` is not installed. "
"Please install it via `pip install lm-eval[api]` or `pip install -e .[api]`."
) from e
if "openai" not in self.base_url:
eval_logger.warning(
f"Passed `base_url={self.base_url}` but using (OpenAI) Tiktoken tokenizer backend. "
"Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken."
)
else:
import transformers
assert isinstance(tokenizer, str), "tokenizer must be a string"
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
tokenizer,
trust_remote_code=trust_remote_code,
revision=revision,
use_fast=use_fast_tokenizer,
)
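As a usage sketch, a TemplateAPI-based model can be pointed at an OpenAI-compatible server roughly as below. The subclass comes from the LocalCompletionsAPI import seen earlier in this diff; the model name and endpoint URL are assumptions, not part of this file.

from lm_eval.models.openai_completions import LocalCompletionsAPI

lm = LocalCompletionsAPI(
    model="facebook/opt-125m",                        # name the server exposes (assumed)
    base_url="http://localhost:8000/v1/completions",  # assumed endpoint
    tokenizer_backend="huggingface",                  # needed to compute context lengths for loglikelihood
    num_concurrent=4,
    max_retries=3,
)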
@abc.abstractmethod
def _create_payload(
self,
messages: Union[List[List[int]], List[dict], List[str], str],
*,
generate: bool = True,
gen_kwargs: Optional[dict] = None,
seed: int = 1234,
**kwargs,
) -> dict:
"""This method is responsible for creating the json payload that will be sent to the API."""
raise NotImplementedError
def create_message(
self,
messages: Union[List[List[int]], List[str], List[JsonChatStr]],
generate=False,
) -> Union[List[List[int]], List[dict], List[str], str]:
"""Helper method to transform the prompt into the expected API input format. messages consist of batched requests"""
if isinstance(messages[0], JsonChatStr):
# for chat completions we need to decode the json string to list[dict,...]
assert (
self._batch_size == 1
), "non-tokenized chat requests are only supported with batch_size=1"
# list[dict["role":..., "content":...],...]
return json.loads(messages[0].prompt)
if not self.tokenized_requests:
# if the incoming messages are already tokenized (lists of ints), decode them back to strings:
if isinstance(messages[0][0], int):
# assuming decoding is lossless. However, this is only for loglikelihood requests
# as we need to compute the context length. For generations, we don't need to tokenize.
messages = self.decode_batch(messages)
if self._batch_size <= 1:
# if batch is 1 return str
return messages[0]
else:
# list[str,...]
return messages
# list[list[int], ...]
return messages
@staticmethod
@abc.abstractmethod
def parse_logprobs(
outputs: Union[Any, List[Any]],
tokens: List[List[int]] = None,
ctxlen: List[int] = None,
**kwargs,
) -> List[Tuple[float, bool]]:
"""Method used to parse the logprobs from the (batched) API response. This method should return a list of tuples"""
raise NotImplementedError
@staticmethod
@abc.abstractmethod
def parse_generations(outputs: Union[Any, List[Any]], **kwargs) -> List[str]:
"""Method used to parse the generations from the (batched) API response. This method should return a list of str"""
raise NotImplementedError
@cached_property
def api_key(self) -> str:
"""Override this property to return the API key for the API request."""
return ""
@cached_property
def header(self) -> dict:
"""Override this property to return the headers for the API request."""
return {"Authorization": f"Bearer {self.api_key}"}
@property
def tokenizer_name(self) -> str:
"""Must be defined for LM subclasses which implement Chat Templating.
Should return the name of the tokenizer or chat template used.
Used only to properly fingerprint caches when requests are being cached with `--cache_requests`, otherwise not used.
"""
return ""
def apply_chat_template(
self, chat_history: List[Dict[str, str]]
) -> Union[str, JsonChatStr]:
"""Applies a chat template to a list of chat history between user and model."""
if self.tokenizer_backend == "huggingface" and self.tokenized_requests:
return self.tokenizer.apply_chat_template(
chat_history, tokenize=False, add_generation_prompt=True
)
else:
# bit of a hack: we'll load it back from JSON just before sending to the API
return JsonChatStr(json.dumps(chat_history))
@cached_property
def eot_token_id(self) -> Optional[int]:
if self.tokenizer is None:
return None
else:
if self.tokenizer_backend == "huggingface":
return self.tokenizer.eos_token_id
elif self.tokenizer_backend == "tiktoken":
return self.tokenizer.eot_token
@cached_property
def prefix_token_id(self) -> Optional[int]:
if self.tokenizer is None:
return None
else:
if self.custom_prefix_token_id is not None:
return self.custom_prefix_token_id
if self.tokenizer_backend == "huggingface":
if self.tokenizer.bos_token_id is not None:
return self.tokenizer.bos_token_id
return self.tokenizer.eos_token_id
else:
return self.tokenizer.eot_token
def tok_encode(
self,
string: str,
left_truncate_len: int = None,
add_special_tokens: bool = False,
truncation: bool = False,
**kwargs,
) -> Union[List[List[int]], List[int], List[str]]:
if self.tokenizer_backend is None:
return [string]
elif self.tokenizer_backend == "huggingface":
# by default for CausalLM: False, unless self.add_bos_token is set
if not add_special_tokens:
add_special_tokens = False or self.add_bos_token
encoding: Union[List[List[int]], List[int]] = self.tokenizer(
string,
add_special_tokens=add_special_tokens,
truncation=truncation,
return_attention_mask=False,
).input_ids
# left-truncate the encoded context to be at most `left_truncate_len` tokens long
if left_truncate_len:
if not isinstance(string, str):
encoding = [enc[-left_truncate_len:] for enc in encoding]
else:
encoding = encoding[-left_truncate_len:]
return encoding
else:
try:
encoding = self.tokenizer.encode(string)
except Exception:
encoding = self.tokenizer.encode_batch(string)
return encoding
def decode_batch(self, tokens: List[List[int]]) -> List[str]:
if self.tokenizer_backend == "huggingface":
return self.tokenizer.batch_decode(tokens)
elif self.tokenizer_backend == "tiktoken":
return self.tokenizer.decode_batch(tokens)
def model_call(
self,
messages: Union[List[List[int]], List[str], List[JsonChatStr]],
*,
generate: bool = True,
gen_kwargs: Optional[Dict] = None,
**kwargs,
) -> Optional[dict]:
# !!! Copy: shared dict for each request, need new object !!!
gen_kwargs = copy.deepcopy(gen_kwargs)
try:
response = requests.post(
self.base_url,
json=self._create_payload(
self.create_message(messages),
generate=generate,
gen_kwargs=gen_kwargs,
seed=self._seed,
**kwargs,
),
headers=self.header,
verify=self.verify_certificate,
)
if not response.ok:
eval_logger.warning(
f"API request failed with error message: {response.text}. Retrying..."
)
response.raise_for_status()
return response.json()
except RetryError:
eval_logger.error(
"API request failed after multiple retries. Please check the API status."
)
return None
async def amodel_call(
self,
session: ClientSession,
messages: Union[List[List[int]], List[str], List[JsonChatStr]],
*,
generate: bool = True,
cache_keys: list = None,
ctxlens: Optional[List[int]] = None,
gen_kwargs: Optional[Dict] = None,
**kwargs,
) -> Union[List[str], List[Tuple[float, bool]], None]:
# !!! Copy: shared dict for each request, need new object !!!
gen_kwargs = copy.deepcopy(gen_kwargs)
payload = self._create_payload(
self.create_message(messages),
generate=generate,
gen_kwargs=gen_kwargs,
seed=self._seed,
**kwargs,
)
cache_method = "generate_until" if generate else "loglikelihood"
try:
async with session.post(
self.base_url,
json=payload,
headers=self.header,
) as response:
if not response.ok:
error_text = await response.text()
eval_logger.warning(
f"API request failed with error message: {error_text}. Retrying..."
)
# raising exception will retry the request
response.raise_for_status()
outputs = await response.json()
answers = (
self.parse_generations(
outputs=outputs,
)
if generate
else self.parse_logprobs(
outputs=outputs,
tokens=messages,
ctxlens=ctxlens,
)
)
if cache_keys:
for res, cache in zip(answers, cache_keys):
self.cache_hook.add_partial(cache_method, cache, res)
return answers
# If the retries also fail
except RetryError:
eval_logger.error(
"API request failed after multiple retries. Please check the API status."
)
return None
def batch_loglikelihood_requests(
self, chunks: Iterable[List[LogLikelihoodInputs]]
) -> Tuple[List[List[int]], List[int], List[Tuple[str, str]]]:
inputs = []
ctxlens = []
cache_keys = []
for chunk in chunks:
for cache_key, context_enc, continuation_enc in chunk:
# max_length - 1 as we always have 1 token for generation
inp = (context_enc + continuation_enc)[-(self.max_length) :]
ctxlen = len(context_enc) - max(
0, len(context_enc) + len(continuation_enc) - (self.max_length)
)
inputs.append(inp)
ctxlens.append(ctxlen)
cache_keys.append(cache_key)
return inputs, ctxlens, cache_keys
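A tiny worked example of the truncation arithmetic above, with made-up token ids and a deliberately small max_length:

context_enc = [1, 2, 3, 4, 5, 6]   # 6 context tokens (hypothetical)
continuation_enc = [7, 8, 9, 10]   # 4 continuation tokens
max_length = 8

inp = (context_enc + continuation_enc)[-max_length:]
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - max_length)

assert inp == [3, 4, 5, 6, 7, 8, 9, 10]  # the two oldest context tokens were truncated away
assert ctxlen == 4                       # within `inp`, the first 4 tokens are (truncated) context; the rest is the scored continuation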
async def get_batched_requests(
self,
requests: list,
cache_keys: list,
*,
generate: bool = True,
ctxlens: List[int] = None,
**kwargs,
) -> Union[List[List[str]], List[List[Tuple[float, bool]]]]:
ctxlens = ctxlens if ctxlens else [None] * len(requests)
conn = TCPConnector(limit=self._concurrent)
async with ClientSession(connector=conn) as session:
retry_: Callable[..., Awaitable[Any]] = retry(
stop=stop_after_attempt(self.max_retries),
wait=wait_exponential(multiplier=0.5, min=1, max=10),
reraise=True,
)(self.amodel_call)
# Create a task for each batch of requests
tasks = [
asyncio.create_task(
retry_(
session=session,
messages=message,
cache_keys=cache_key,
generate=generate,
ctxlens=ctxlen,
**kwargs,
)
)
for message, cache_key, ctxlen in zip(
chunks(requests, n=self._batch_size),
chunks(cache_keys, n=self._batch_size),
chunks(ctxlens, n=self._batch_size),
)
]
return await tqdm_asyncio.gather(*tasks, desc="Requesting API")
def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
assert (
self.tokenizer is not None
), "Tokenizer is required for loglikelihood tasks to compute context lengths."
res = []
def _collate(req: LogLikelihoodInputs):
"""Defines the key for the sorted method"""
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = req[1] + req[2]
return -len(toks), tuple(toks)
re_ord = Collator(
requests,
sort_fn=_collate,
group_by=None,
)
# if concurrent then we'll batch in the async context
chunked = re_ord.get_batched(n=self._batch_size if self._concurrent <= 1 else 0)
if self._concurrent <= 1:
pbar = tqdm(desc="Requesting API", total=len(requests))
for chunk in chunked:
inputs, ctxlens, cache_keys = self.batch_loglikelihood_requests([chunk])
outputs = retry(
stop=stop_after_attempt(self.max_retries),
wait=wait_exponential(multiplier=0.5, min=1, max=10),
reraise=True,
)(self.model_call)(messages=inputs, generate=False)
if isinstance(outputs, dict):
outputs = [outputs]
for answer_, cache_key in zip(
self.parse_logprobs(
outputs=outputs, tokens=inputs, ctxlens=ctxlens
),
cache_keys,
):
if answer_ is not None:
res.append(answer_)
# cache requests that aren't from a loglikelihood_rolling request
if cache_key is not None:
self.cache_hook.add_partial(
"loglikelihood", cache_key, answer_
)
pbar.update(1)
else:
inputs, ctxlens, cache_keys = self.batch_loglikelihood_requests(chunked)
res = itertools.chain.from_iterable(
asyncio.run(
self.get_batched_requests(
inputs, cache_keys, generate=False, ctxlens=ctxlens
)
)
)
return re_ord.get_original(res)
def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
res = []
def _collate_gen(_requests):
# sort by the length of the non-tokenized contexts
return -len(_requests[0])
# Let the API deal with tokenization
requests, all_gen_kwargs = zip(*(req.args for req in requests))
if self.tokenized_requests:
encodings_list = self.tok_encode(
requests, add_special_tokens=self.add_bos_token
)
else:
encodings_list = [None] * len(requests)
requests = [
(a, b, c) for a, b, c in zip(requests, all_gen_kwargs, encodings_list)
]
re_ord = Collator(
requests,
sort_fn=_collate_gen,
group_by="gen_kwargs",
)
chunked = re_ord.get_batched(
n=self._batch_size if self._concurrent <= 1 else 0, batch_fn=None
)
if self._concurrent <= 1:
pbar = tqdm(desc="Requesting API", total=len(requests))
for chunk in chunked:
contexts, all_gen_kwargs, encodings_list = zip(*chunk)
req = encodings_list if self.tokenized_requests else contexts
outputs = retry(
stop=stop_after_attempt(self.max_retries),
wait=wait_exponential(multiplier=0.5, min=1, max=10),
reraise=True,
)(self.model_call)(
messages=req,
generate=True,
gen_kwargs=copy.deepcopy(all_gen_kwargs[0]),
)
for generated_text, context in zip(
self.parse_generations(
outputs=outputs,
contexts=contexts,
),
contexts,
):
if generated_text is not None:
res.append(generated_text)
# partial caching
if context is not None:
self.cache_hook.add_partial(
"generate_until",
(context, all_gen_kwargs[0]),
generated_text,
)
pbar.update(1)
else:
for chunk in chunked:
contexts, all_gen_kwargs, encodings_list = zip(*chunk)
req = encodings_list if self.tokenized_requests else contexts
results = itertools.chain.from_iterable(
asyncio.run(
self.get_batched_requests(
req,
cache_keys=[(ctx, all_gen_kwargs[0]) for ctx in contexts],
generate=True,
gen_kwargs=copy.deepcopy(all_gen_kwargs[0]),
)
)
)
res.extend(results)
return re_ord.get_original(res)
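Illustrative only: the (context, gen_kwargs) pairs unpacked from `req.args` above, with made-up prompts. Because the Collator groups by gen_kwargs, requests sharing identical generation settings end up in the same batch.

example_request_args = [
    ("Q: 2 + 2 = ?\nA:", {"until": ["\n"], "max_gen_toks": 32, "temperature": 0.0, "do_sample": False}),
    ("Q: 3 + 5 = ?\nA:", {"until": ["\n"], "max_gen_toks": 32, "temperature": 0.0, "do_sample": False}),
]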
def loglikelihood_rolling(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[float]:
loglikelihoods = []
for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.prefix_token_id,
# max_seq_len - (1 for context)
max_seq_len=self.max_length - 1,
context_len=1,
),
)
)
# TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
string_nll = self._loglikelihood_tokens(
rolling_token_windows,
disable_tqdm=True,
)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
# cache this loglikelihood_rolling request
self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
return loglikelihoods
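A small, self-contained sketch of the rolling-window splitting used above; the token ids, window size, and prefix token are all made up.

from lm_eval import utils

token_list = [11, 12, 13, 14, 15, 16]   # hypothetical tokenized document
windows = list(
    map(
        utils.make_disjoint_window,
        utils.get_rolling_token_windows(
            token_list=token_list,
            prefix_token=0,              # stand-in BOS/EOS id
            max_seq_len=3,               # tiny window so the split is visible
            context_len=1,
        ),
    )
)
# Each element is a (context, continuation) pair; the continuations are disjoint and together
# cover every token exactly once, so summing their loglikelihoods scores the whole document.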
...@@ -26,9 +26,9 @@ class DummyLM(LM): ...@@ -26,9 +26,9 @@ class DummyLM(LM):
def generate_until(self, requests, disable_tqdm: bool = False): def generate_until(self, requests, disable_tqdm: bool = False):
res = [] res = []
for ctx, _ in tqdm(requests, disable=disable_tqdm): for request in tqdm(requests, disable=disable_tqdm):
res.append("lol") res.append("lol")
assert ctx.strip() != "" assert request.arguments[0].strip() != ""
return res return res
......
...@@ -68,7 +68,9 @@ class GGUFLM(LM): ...@@ -68,7 +68,9 @@ class GGUFLM(LM):
logger.error(f"RequestException: {e}") logger.error(f"RequestException: {e}")
time.sleep(delay) # wait before retrying time.sleep(delay) # wait before retrying
else: else:
raise Exception(f"Failed to get a valid response after {retries} retries.") raise RuntimeError(
f"Failed to get a valid response after {retries} retries."
)
def loglikelihood(self, requests, disable_tqdm: bool = False): def loglikelihood(self, requests, disable_tqdm: bool = False):
if not requests: if not requests:
......
import copy
from typing import Dict, List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import transformers
from tqdm import tqdm
from transformers import BatchEncoding
from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
from lm_eval.models.utils import (
Collator,
flatten_image_list,
pad_and_concat,
replace_placeholders,
stop_sequences_criteria,
)
DEFAULT_IMAGE_PLACEHOLDER = "<image>"
eval_logger = utils.eval_logger
@register_model("hf-multimodal")
class HFMultimodalLM(HFLM):
"""
An abstracted Hugging Face model class for multimodal LMs like Llava and Idefics.
"""
AUTO_MODEL_CLASS = transformers.AutoModelForVision2Seq
MULTIMODAL = True # flag to indicate, for now, that this model type can run multimodal requests
def __init__(
self,
pretrained: Union[str, transformers.PreTrainedModel],
image_token_id: Optional[int] = None,
image_string: Optional[str] = None,
interleave: bool = True,
# TODO: handle whitespace in image placeholder (replacement)
max_images: Optional[int] = 999,
convert_img_format=False,
**kwargs,
):
# We initialize using HFLM's init. Sub-methods like _create_model and _create_tokenizer
# modify init behavior.
super().__init__(pretrained, **kwargs)
assert (
self.batch_size != "auto"
), "Batch size 'auto' is not yet supported for hf-multimodal models."
self.chat_applied: bool = False
# TODO: phi-3.5 "image placeholders" are <image_1>, <image_2>, ... in order. how to handle this case
# HF AutoModelForVision2Seq models have an `image_token_id` value in their configs
# denoting the token which indicates a location where an image will be substituted in.
# This can take different string values across models, e.g. <image> for Idefics2 and <|image_pad|> for Qwen2-VL
self.interleave = interleave
self.max_images = max_images
self.rgb = convert_img_format
# WARNING: improperly set image_token_id can lead to ignored image input or other (potentially silent) errors!
if not image_string:
self.image_token_id = (
int(image_token_id)
if image_token_id
else (
getattr(self.config, "image_token_id", None)
or getattr(self.config, "image_token_index", None)
)
)
assert (
self.image_token_id is not None
), "Must have a non-None image_token_id to evaluate a Hugging Face AutoModelForVision2Seq model. Please pass `image_token_id` in `--model_args` if model's config does not already specify one."
# get the string this token ID corresponds to
self.image_token = self.tok_decode(
[self.image_token_id], skip_special_tokens=False
)
if image_token_id is not None:
eval_logger.info(
f"A non-default image_token_id with image_token_id={self.image_token_id} and string value '{self.image_token}' was specified manually. Note that using an improper image_token placeholder may lead to ignored image input or errors!"
)
else:
eval_logger.info(
f"A non-default image_token string with string value image_string='{image_string}' was specified manually. Note that using an improper image_token placeholder may lead to ignored image input or errors!"
)
self.image_token = image_string
def _create_tokenizer(
self,
pretrained: Union[str, transformers.PreTrainedModel],
tokenizer: Optional[
Union[
str,
transformers.ProcessorMixin,
]
],
revision: Optional[str] = "main",
trust_remote_code: Optional[bool] = False,
**kwargs,
) -> None:
"""
Helper method during initialization.
For the multimodal variant, we initialize not just
`self.tokenizer` but also `self.processor`.
"""
if tokenizer:
if isinstance(tokenizer, str):
return transformers.AutoProcessor.from_pretrained(
tokenizer,
revision=revision,
trust_remote_code=trust_remote_code,
# use_fast=use_fast_tokenizer,
)
else:
assert isinstance(
tokenizer, transformers.ProcessorMixin
) # TODO: check this condition
return tokenizer
# Get tokenizer based on 'pretrained'
if isinstance(pretrained, str):
model_name = pretrained
else:
# get the HF hub name via accessor on model
model_name = self.model.name_or_path
self.processor = transformers.AutoProcessor.from_pretrained(
model_name,
revision=revision,
trust_remote_code=trust_remote_code,
# use_fast=use_fast_tokenizer,
)
self.tokenizer = self.processor.tokenizer
def tok_multimodal_encode(
self, string, images, left_truncate_len=None, add_special_tokens=None
):
"""Helper function which encodes an image + string combo using AutoProcessor"""
# We inherit special token kwarg setup from HFLM.tok_encode
# special_tokens_kwargs = {}
# by default for CausalLM - false or self.add_bos_token is set
# if add_special_tokens is None:
# special_tokens_kwargs = {"add_special_tokens": False or self.add_bos_token}
# otherwise the method explicitly defines the value
# else:
# special_tokens_kwargs = {"add_special_tokens": add_special_tokens}
# encode text+images
# TODO: why does (Qwen2-VL) processor error when attempting to add special tokens to text?
encoding = self.processor(
text=string, images=images, return_tensors=None
) # , **special_tokens_kwargs)
# remove (and store) our tokenized text
text_encoding = encoding.pop("input_ids")
encoding.pop("attention_mask")
# left-truncate the encoded context to be at most `left_truncate_len` tokens long
if left_truncate_len:
text_encoding = text_encoding[-left_truncate_len:]
return text_encoding, encoding # image_encoding is a dict
def _encode_multimodal_pair(self, context, continuation, images):
"""Helper function to perform the role of TemplateLM._encode_pair
Except allowing for image input to also be processed alongside `context`.
This method is a bit messy due to the need to defer conversion of image and text token input
into PyTorch tensors until the main inference loop.
"""
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
# TODO: replace default <image> placeholder with self.image_token, for contexts
whole_enc, image_enc = self.tok_multimodal_encode(
context + continuation, images
)
context_enc, _ = self.tok_multimodal_encode(context, images)
# tok_multimodal_encode returns List[List[int]] for tokenized text. Get rid of the batch dim
# since we only are encoding a single string.
# TODO: this is a bit hacky, it'd be nice to make this generally cleaner
whole_enc, context_enc = whole_enc[0], context_enc[0]
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc, image_enc
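For clarity, a toy example of the trailing-whitespace shuffle at the top of _encode_multimodal_pair (strings are made up); it moves the trailing space onto the continuation so the context/continuation boundary tokenizes consistently:

context, continuation = "Question: what is shown? Answer: ", "a cat"
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
    continuation = context[-n_spaces:] + continuation  # " a cat"
    context = context[:-n_spaces]                      # "Question: what is shown? Answer:"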
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
self.chat_applied = True
if not self.interleave:
for content in chat_history:
c = []
text = content["content"]
# Count and remove image placeholders
image_count = min(
self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER)
)
text = text.replace(DEFAULT_IMAGE_PLACEHOLDER, "")
# Add image entries
for _ in range(image_count):
c.append({"type": "image", "image": None})
# Add single text entry at the end
c.append({"type": "text", "text": text})
content["content"] = c
else:
for content in chat_history:
c = []
text = content["content"]
expected_image_count = min(
self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER)
)
actual_image_count = 0
text_parts = text.split(DEFAULT_IMAGE_PLACEHOLDER)
for i, part in enumerate(text_parts):
# TODO: concatenate text parts (esp. if skipping images)?
if part: # Add non-empty text parts
c.append({"type": "text", "text": part})
if (
(i < len(text_parts) - 1) and i < self.max_images
): # Add image placeholder after each split except the last
c.append({"type": "image"})
actual_image_count += 1
content["content"] = c
if actual_image_count != expected_image_count:
raise ValueError(
f"Mismatch in image placeholder count. Expected: {expected_image_count}, Actual: {actual_image_count}"
)
return self.processor.apply_chat_template(
chat_history, add_generation_prompt=True
)
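Illustrative only (with interleave=True and max_images at its default): a single user turn containing two <image> placeholders is rewritten into structured content before the processor's chat template runs; the text below is made up.

before = [{"role": "user", "content": "<image> How does this differ from <image>?"}]
after = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": " How does this differ from "},
            {"type": "image"},
            {"type": "text", "text": "?"},
        ],
    }
]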
def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
if hasattr(self.processor, "apply_chat_template"):
_tokenizer = self.tokenizer
self.tokenizer = self.processor
selected_template = super().chat_template(chat_template)
self.tokenizer = _tokenizer
return selected_template
else:
return super().chat_template(chat_template)
def tok_batch_multimodal_encode(
self,
strings: List[str], # note that input signature of this fn is different
images: List[List], # TODO: images are pil.Image at the moment, update typehint
padding_side: str = "left",
left_truncate_len: int = None,
truncation: bool = False,
) -> Union[
BatchEncoding, Dict[str, torch.Tensor]
]: # note that this return signature differs from HFLM tok_batch_encode.
# NOTE: here, we replace <image> tags with our model's corresponding image_token string value.
if not self.chat_applied:
# TODO<baber>: This still keeps the whitespace in the image placeholder, which is not ideal.
strings = [
replace_placeholders(
string, DEFAULT_IMAGE_PLACEHOLDER, self.image_token, self.max_images
)
for string in strings
]
# encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
old_padding_side = self.tokenizer.padding_side
self.tokenizer.padding_side = padding_side
# add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
images = [img[: self.max_images] for img in images]
if self.rgb:
images = [[img.convert("RGB") for img in sublist] for sublist in images]
# certain models like llava expect a single-level image list even for bs>1, multi-image. TODO: port this over to loglikelihoods
if getattr(self.config, "model_type", "") == "llava":
images = flatten_image_list(images)
encoding = self.processor(
images=images,
text=strings,
truncation=truncation,
padding="longest",
return_tensors="pt",
# **add_special_tokens, # TODO: at least some Processors error out when passing this. How do we control whether text gets BOS added?
)
encoding.to( # TODO: our other tokenization methods in HFLM don't typically move to device. this breaks convention
self.device, self.model.dtype
) # TODO: This only casts the pixel values. Should they always be float16?
if left_truncate_len:
encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
encoding["attention_mask"] = encoding["attention_mask"][
:, -left_truncate_len:
]
self.tokenizer.padding_side = old_padding_side
return encoding
def _model_multimodal_call(self, inps, imgs, attn_mask=None, labels=None):
"""
Run a forward pass over the batched text tokens plus image inputs (`imgs` is the processor's image feature dict) and return the model logits.
"""
# note: imgs is a dict.
with torch.no_grad():
return self.model(inps, **imgs).logits
def _model_multimodal_generate(self, inputs, max_length, stop, **generation_kwargs):
generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
do_sample = generation_kwargs.get("do_sample", None)
# The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
generation_kwargs["do_sample"] = do_sample = False
if do_sample is False and generation_kwargs.get("temperature") == 0.0:
generation_kwargs.pop("temperature")
stopping_criteria = stop_sequences_criteria(
self.tokenizer,
stop,
inputs["input_ids"].shape[1],
inputs["input_ids"].shape[0],
)
return self.model.generate(
**inputs,
max_length=max_length,
stopping_criteria=stopping_criteria,
pad_token_id=self.tokenizer.pad_token_id,
use_cache=True,
**generation_kwargs,
)
def _batch_images(self, image_encs):
"""
Helper function: batch together image encodings across examples in a batch.
# TODO: for variable-sized images, this may break down.
"""
batched_imgs = {}
for key in image_encs[0].keys():
batched_imgs[key] = torch.cat(
[
torch.tensor(
image_enc[key], device=self.device, dtype=self.model.dtype
)
for image_enc in image_encs
],
dim=0,
)
return batched_imgs
def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
raise NotImplementedError(
"model type `hf-multimodal` does not support loglikelihood_rolling. Use 'hf' model type for text-only loglikelihood_rolling tasks ",
"this is because we do not support measuring the loglikelihood a model assigns to an image.",
)
def loglikelihood(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[Tuple[float, bool]]:
raise NotImplementedError(
"'loglikelihood' requests for model type `hf-multimodal` are not yet tested. This feature will be enabled when a loglikelihood-based multiple-choice VQA dataset is added!"
)
new_reqs = []
for context, continuation, aux_arguments in [req.args for req in requests]:
if context == "":
raise ValueError(
"Must get non-empty context for multimodal requests! You might be trying to run 'loglikelihood_rolling', which is not supported in the multimodal case."
)
else:
visuals = aux_arguments["visual"]
context_enc, continuation_enc, image_enc = self._encode_multimodal_pair(
context, continuation, visuals
)
# TODO: key to pick for caching images
new_reqs.append(
(
(context, continuation, visuals),
context_enc,
continuation_enc,
image_enc,
)
)
return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm)
def _loglikelihood_tokens(
self,
requests: List[
Tuple[Tuple[None, str, str], List[int], List[int], List[int]]
], # TODO: update typehint to be correct
disable_tqdm: bool = False,
override_bs: int = None,
) -> List[Tuple[float, bool]]:
res = []
# TODO: **improve multimodal collation.** We currently ignore image size when ordering docs. ideally we'd take them into account
def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]):
"""Defines the key for the sorted method"""
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = req[1] + req[2]
return -len(toks), tuple(toks)
def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]):
"""Defines the key to group and lookup one-token continuations"""
# Use with group_by="contexts" (optional)"
# allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
# speeds up some multiple-choice tasks proportionally to the number of choices.
# groups requests by context+continuation[:-1] and infer on one request/group.
return req[-1] + req[-3] + req[-2][:-1]
re_ord = Collator(
requests,
sort_fn=_collate,
group_by="contexts" # TODO: can't group-by just "contexts" any more, need to incorporate imgs
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
and self.logits_cache
else None,
group_fn=_lookup_one_token_cont,
)
# automatic (variable) batch size detection for vectorization
# pull longest context sample from request
n_reordered_requests = len(re_ord)
batch_size = (
self.batch_size
if self.batch_size != "auto"
else override_bs
if override_bs is not None
else 0
)
batch_fn = (
self._batch_scheduler
if self.batch_size == "auto"
and n_reordered_requests > 0
and not override_bs
else None
)
chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn)
pbar = tqdm(
total=len(requests),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running loglikelihood requests with text+image input",
)
for chunk in chunks:
imgs = []
inps = []
cont_toks_list = []
inplens = []
padding_len_inp = None
# because vectorizing is annoying, we first convert each (context, continuation) pair to padded
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
for _, context_enc, continuation_enc, image_enc in chunk:
# sanity check
assert len(image_enc) > 0
assert len(context_enc) > 0
assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length
# how this all works (illustrated on a causal decoder-only setup):
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# model \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the
# cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
# when too long to fit in context, truncate from the left
# TODO: assuming that we won't handle enc-dec Vision2Seq models. Is that a safe assumption?
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
dtype=torch.long,
device=self.device,
)
(inplen,) = inp.shape
padding_len_inp = (
max(padding_len_inp, inplen)
if padding_len_inp is not None
else inplen
)
inps.append(inp) # [1, inp_length]
cont_toks_list.append(continuation_enc)
inplens.append(inplen)
imgs.append(image_enc)
# create encoder attn mask and batched conts, if seq2seq
call_kwargs = {}
batched_inps = pad_and_concat(
padding_len_inp, inps, padding_side="right"
) # [batch, padding_len_inp]
# batch our examples' image inputs together
batched_imgs = self._batch_images(
imgs
) # TODO: fix/test for bs>1 case with differently-sized imgs!
multi_logits = F.log_softmax(
self._model_multimodal_call(batched_inps, batched_imgs, **call_kwargs),
dim=-1,
) # [batch, padding_length (inp or cont), vocab]
for (
request_str,
ctx_tokens,
_,
image_encs,
), logits, inplen, cont_toks in zip(
chunk, multi_logits, inplens, cont_toks_list
):
# Slice to original seq length
contlen = len(cont_toks)
# take only logits in the continuation
# (discard context toks if decoder-only ; discard right-padding)
# also discards + checks for "virtual tokens" in the causal LM's input window
# from prompt/prefix tuning tokens, if applicable
ctx_len = (
inplen + (logits.shape[0] - padding_len_inp)
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
else None
)
logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
logits = logits.unsqueeze(0) # [1, seq, vocab]
# Check if per-token argmax is exactly equal to continuation
greedy_tokens = logits.argmax(dim=-1)
# check for one-token continuation cache hits.
# noop in case group_by != "contexts" or no cache hit and returns the
# original args. Otherwise, expands the logits batch dimension and yields each
# batch along with matching continuation tokens and prompt strings.
# logits -> [1, seq, vocab]
for request_str, cont_toks, logits in re_ord.get_cache(
req_str=request_str,
cxt_toks=ctx_tokens,
cont_toks=cont_toks,
logits=logits,
):
cont_toks = torch.tensor(
cont_toks, dtype=torch.long, device=self.device
).unsqueeze(0) # [1, seq]
max_equal = (greedy_tokens == cont_toks).all()
# Obtain log-probs at the corresponding continuation token indices
# last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
-1
) # [1, seq]
# Answer: (log prob, is-exact-match)
answer = (float(logits.sum()), bool(max_equal))
res.append(answer)
self.cache_hook.add_partial(
"loglikelihood", request_str, answer
) # TODO: choose convention for adding images into the cache key
pbar.update(1)
pbar.close()
return re_ord.get_original(res)
def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
# TODO: back out to HFLM.generate_until() for all requests without aux_arguments (text-only reqs)
res = []
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = self.tok_encode(x[0])
return -len(toks), x[0]
pbar = tqdm(
total=len(requests),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running generate_until requests with text+image input",
)
# TODO: port auto-batch sizing into this.
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
re_ords = Collator(
[reg.args for reg in requests],
_collate,
group_by="gen_kwargs",
group_fn=lambda x: x[1],
)
chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
### Up to here: was identical to non-multimodal HFLM generate_until ###
for chunk in chunks:
contexts, all_gen_kwargs, aux_arguments = zip(*chunk)
visuals = [arg["visual"] for arg in aux_arguments]
if not isinstance(contexts, list):
contexts = list(
contexts
) # for Qwen2-VL, processor is unhappy accepting a tuple of strings instead of a list.
# TODO: could we upstream this workaround to HF?
### this part onward: same as HFLM ###
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
until = None
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
# add EOS token to stop sequences
eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
if not until:
until = [eos]
else:
until.append(eos)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
### end stuff that's entirely copied verbatim from HFLM ###
max_ctx_len = self.max_length - max_gen_toks
inputs = self.tok_batch_multimodal_encode(
contexts,
visuals,
left_truncate_len=max_ctx_len,
truncation=self.truncation,
)
context_enc = inputs["input_ids"]
if "max_length" not in kwargs:
kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
cont = self._model_multimodal_generate(inputs, stop=until, **kwargs)
del inputs
torch.cuda.empty_cache()
import gc
gc.collect()
### essentially same as HFLM beyond this line!
cont_toks_list = cont.tolist()
for cont_toks, context in zip(cont_toks_list, contexts):
# discard context + left-padding toks if using causal decoder-only VLM
cont_toks = cont_toks[context_enc.shape[1] :]
s = self.tok_decode(cont_toks)
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc
for term in until:
if len(term) > 0:
# ignore '' separator,
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
s = s.split(term)[0]
res.append(s)
self.cache_hook.add_partial(
"generate_until", (context, gen_kwargs), s
) # TODO: cache key for multimodal input should be what?
pbar.update(1)
# reorder this group of results back to original unsorted form
res = re_ords.get_original(res)
pbar.close()
return res
...@@ -9,10 +9,10 @@ import torch.nn.functional as F ...@@ -9,10 +9,10 @@ import torch.nn.functional as F
import transformers import transformers
from accelerate import ( from accelerate import (
Accelerator, Accelerator,
DistributedType,
InitProcessGroupKwargs, InitProcessGroupKwargs,
find_executable_batch_size, find_executable_batch_size,
) )
from accelerate.utils import get_max_memory
from huggingface_hub import HfApi from huggingface_hub import HfApi
from packaging import version from packaging import version
from peft import PeftModel from peft import PeftModel
...@@ -30,6 +30,7 @@ from lm_eval.api.registry import register_model ...@@ -30,6 +30,7 @@ from lm_eval.api.registry import register_model
from lm_eval.models.utils import ( from lm_eval.models.utils import (
Collator, Collator,
clear_torch_cache, clear_torch_cache,
configure_pad_token,
get_dtype, get_dtype,
pad_and_concat, pad_and_concat,
stop_sequences_criteria, stop_sequences_criteria,
...@@ -39,31 +40,6 @@ from lm_eval.models.utils import ( ...@@ -39,31 +40,6 @@ from lm_eval.models.utils import (
eval_logger = utils.eval_logger eval_logger = utils.eval_logger
def _get_accelerate_args(
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
gpus: Optional[int] = None,
) -> dict:
"""Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
max_memory = {}
if max_memory_per_gpu is not None:
max_memory_per_gpu_map = {
device_idx: max_memory_per_gpu for device_idx in range(gpus)
}
max_memory.update(max_memory_per_gpu_map)
if max_cpu_memory is not None:
max_memory["cpu"] = max_cpu_memory
args = {}
if max_memory:
args["max_memory"] = max_memory
args["device_map"] = device_map_option
args["offload_folder"] = offload_folder
return args
@register_model("hf-auto", "hf", "huggingface") @register_model("hf-auto", "hf", "huggingface")
class HFLM(TemplateLM): class HFLM(TemplateLM):
""" """
...@@ -79,7 +55,7 @@ class HFLM(TemplateLM): ...@@ -79,7 +55,7 @@ class HFLM(TemplateLM):
def __init__( def __init__(
self, self,
pretrained: Union[str, transformers.PreTrainedModel], pretrained: Union[str, transformers.PreTrainedModel],
backend: Optional[Literal["default", "causal", "seq2seq"]] = "default", backend: Literal["default", "causal", "seq2seq"] = "default",
# override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq) # override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
revision: Optional[str] = "main", revision: Optional[str] = "main",
subfolder: Optional[str] = None, subfolder: Optional[str] = None,
...@@ -104,7 +80,6 @@ class HFLM(TemplateLM): ...@@ -104,7 +80,6 @@ class HFLM(TemplateLM):
# arguments used for splitting a model across GPUs naively. # arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`. # only used if `parallelize=True`.
parallelize: Optional[bool] = False, parallelize: Optional[bool] = False,
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None, max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None, max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[Union[str, os.PathLike]] = "./offload", offload_folder: Optional[Union[str, os.PathLike]] = "./offload",
...@@ -112,10 +87,10 @@ class HFLM(TemplateLM): ...@@ -112,10 +87,10 @@ class HFLM(TemplateLM):
peft: Optional[str] = None, peft: Optional[str] = None,
delta: Optional[str] = None, delta: Optional[str] = None,
autogptq: Optional[Union[bool, str]] = False, autogptq: Optional[Union[bool, str]] = False,
gptqmodel: Optional[bool] = False,
**kwargs, **kwargs,
) -> None: ) -> None:
super().__init__() super().__init__()
# optionally: take in an already-initialized transformers.PreTrainedModel # optionally: take in an already-initialized transformers.PreTrainedModel
if not isinstance(pretrained, str): if not isinstance(pretrained, str):
eval_logger.warning( eval_logger.warning(
...@@ -127,21 +102,6 @@ class HFLM(TemplateLM): ...@@ -127,21 +102,6 @@ class HFLM(TemplateLM):
self._config = self._model.config self._config = self._model.config
gpus = 0 gpus = 0
if tokenizer:
assert isinstance(
tokenizer, transformers.PreTrainedTokenizer
) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
self.tokenizer = tokenizer
else:
# Get tokenizer
model_name = self._model.name_or_path
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
model_name,
revision=revision,
trust_remote_code=trust_remote_code,
use_fast=use_fast_tokenizer,
)
else: else:
assert isinstance(device, str) assert isinstance(device, str)
assert isinstance(pretrained, str) assert isinstance(pretrained, str)
...@@ -156,6 +116,7 @@ class HFLM(TemplateLM): ...@@ -156,6 +116,7 @@ class HFLM(TemplateLM):
if "npu" in accelerator.device.type: if "npu" in accelerator.device.type:
gpus = torch.npu.device_count() gpus = torch.npu.device_count()
# using one process with no model parallelism
if not (parallelize or accelerator.num_processes > 1): if not (parallelize or accelerator.num_processes > 1):
# use user-passed device # use user-passed device
device_list = set( device_list = set(
...@@ -181,14 +142,19 @@ class HFLM(TemplateLM): ...@@ -181,14 +142,19 @@ class HFLM(TemplateLM):
if torch.cuda.is_available() if torch.cuda.is_available()
else torch.device("cpu") else torch.device("cpu")
) )
else: else: # Parallelism managed by accelerate
if device != "cuda": if device != "cuda":
eval_logger.info( eval_logger.info(
f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model." f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
) )
# TODO: include in warning that `load_in_8bit` etc. affect this too # TODO: include in warning that `load_in_8bit` etc. affect this too
self._device = torch.device(device) self._device = (
self.accelerator.device
if hasattr(self, "accelerator")
else torch.device(device)
)
revision = str(revision) # cast to string if not already one
# TODO: update this to be less of a hack once subfolder is fixed in HF # TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "") revision = revision + ("/" + subfolder if subfolder is not None else "")
...@@ -198,7 +164,7 @@ class HFLM(TemplateLM): ...@@ -198,7 +164,7 @@ class HFLM(TemplateLM):
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
) )
# determine which of 'causal' and 'seq2seq' backends to use # determine which of 'causal' and 'seq2seq' backends to use for HF models
self._get_backend( self._get_backend(
config=self.config, backend=backend, trust_remote_code=trust_remote_code config=self.config, backend=backend, trust_remote_code=trust_remote_code
) )
...@@ -221,13 +187,13 @@ class HFLM(TemplateLM): ...@@ -221,13 +187,13 @@ class HFLM(TemplateLM):
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
parallelize=parallelize, parallelize=parallelize,
gpus=gpus, gpus=gpus,
device_map_option=device_map_option,
max_memory_per_gpu=max_memory_per_gpu, max_memory_per_gpu=max_memory_per_gpu,
max_cpu_memory=max_cpu_memory, max_cpu_memory=max_cpu_memory,
offload_folder=offload_folder, offload_folder=offload_folder,
peft=peft, peft=peft,
delta=delta, delta=delta,
autogptq=autogptq, autogptq=autogptq,
gptqmodel=gptqmodel,
**kwargs, **kwargs,
) )
...@@ -236,52 +202,17 @@ class HFLM(TemplateLM): ...@@ -236,52 +202,17 @@ class HFLM(TemplateLM):
self.model.eval() self.model.eval()
self.model.tie_weights() self.model.tie_weights()
if isinstance(pretrained, str) and (gpus >= 1 or str(self.device) == "mps"):
# TODO: can remove this whole snippet except in the mps case, perhaps?
if not (parallelize or autogptq or hasattr(self, "accelerator")):
# place model onto device requested manually,
# if not using HF Accelerate or device_map
# or any other option that preloads model onto device
try:
self.model.to(self.device)
except ValueError:
eval_logger.debug(
"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
)
self.truncation = truncation self.truncation = truncation
self.logits_cache = logits_cache self.logits_cache = logits_cache
self.vocab_size = self.tokenizer.vocab_size self.vocab_size = self.tokenizer.vocab_size
# select (or create) a pad token to use # select (or create) a pad token to use
if self.tokenizer.pad_token: self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config)
pass
elif self.tokenizer.unk_token:
self.tokenizer.pad_token_id = self.tokenizer.unk_token_id
elif self.tokenizer.eos_token:
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
else:
if getattr(self.config, "model_type", None) == "qwen":
# Qwen's trust_remote_code tokenizer does not allow for adding special tokens
self.tokenizer.pad_token = "<|endoftext|>"
elif (
self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
):
# The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0)
# The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer
# ---
# Note that the world tokenizer class name, might change in the future for the final huggingface merge
# https://github.com/huggingface/transformers/pull/26963
assert self.tokenizer.pad_token_id == 0
else:
self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
# TODO: override this for Gemma
self.add_bos_token = add_bos_token self.add_bos_token = add_bos_token
if getattr(self.config, "model_type", None) == "gemma": if "gemma" in getattr(self.config, "model_type", ""):
self.add_bos_token = True self.add_bos_token = True
eval_logger.info( eval_logger.info(
f"Model type is '{self.config.model_type}', a BOS token will be used as Gemma underperforms without it." f"Model type is '{self.config.model_type}', part of the Gemma family--a BOS token will be used as Gemma underperforms without it."
) )
self._max_length = max_length self._max_length = max_length
...@@ -302,49 +233,46 @@ class HFLM(TemplateLM): ...@@ -302,49 +233,46 @@ class HFLM(TemplateLM):
self.batch_size_per_gpu = int(batch_size) self.batch_size_per_gpu = int(batch_size)
if isinstance(pretrained, str): if isinstance(pretrained, str):
if gpus >= 1 or str(self.device) == "mps":
# TODO: can remove this whole snippet except in the mps case, perhaps?
if not (parallelize or autogptq or hasattr(self, "accelerator")):
# place model onto device requested manually,
# if not using HF Accelerate or device_map
# or any other option that preloads model onto device
try:
self.model.to(self.device)
except ValueError:
eval_logger.debug(
"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
)
# multigpu data-parallel support when launched with accelerate # multigpu data-parallel support when launched with accelerate
if gpus > 1: if gpus > 1:
if parallelize: if accelerator.num_processes > 1:
if accelerator.num_processes > 1: if parallelize:
raise RuntimeError( eval_logger.warning(
"Attempted to use both a HF Accelerate `device_map` and to launch via `accelerate launch`. If this is the case, please either remove `parallelize=True` from --model_args or launch outside of the Accelerate launcher." "You are both using a HF Accelerate `device_map` (`--model_args parallelize=True`) and launching via `accelerate launch`. This will attempt to do model and data parallelism depending on the resources available."
) )
else: elif gpus > accelerator.num_processes:
pass
elif accelerator.num_processes == 1:
# if we aren't launching via accelerate, ditch
self._rank = 0
self._world_size = 1
else:
if gpus > accelerator.num_processes:
eval_logger.warning( eval_logger.warning(
"WARNING: The number of total system GPUs does not match the number of spawned processes. " "WARNING: The number of total system GPUs does not match the number of spawned processes. "
"If you would like to use data parallelism, please launch the script " "If you would like to use data parallelism, please launch the script "
"with 'accelerate launch *script*'. " "with 'accelerate launch *script*'. "
f"Current run will proceed with {accelerator.num_processes} devices." f"Current run will proceed with {accelerator.num_processes} devices."
) )
assert ( if self.accelerator.is_local_main_process:
accelerator.distributed_type eval_logger.info(
in [ f"Using {gpus} devices with data parallelism"
DistributedType.FSDP, )
DistributedType.MULTI_GPU,
DistributedType.MULTI_NPU,
]
), "Unsupported distributed type provided. Only DDP and FSDP are supported."
if accelerator.distributed_type == DistributedType.FSDP:
self._model = accelerator.prepare(self.model)
else:
self._model = accelerator.prepare_model(
self.model, evaluation_mode=True
)
self._device = torch.device(f"{accelerator.device}") self._device = torch.device(f"{accelerator.device}")
self.accelerator = accelerator self.accelerator = accelerator
if self.accelerator.is_local_main_process:
eval_logger.info(f"Using {gpus} devices with data parallelism")
self._rank = self.accelerator.local_process_index self._rank = self.accelerator.local_process_index
self._world_size = self.accelerator.num_processes self._world_size = self.accelerator.num_processes
else:
# if we aren't launching via accelerate, ditch
self._rank = 0
self._world_size = 1
else: else:
# if a PreTrainedModel was passed into HFLM, we forgo distributed setup. # if a PreTrainedModel was passed into HFLM, we forgo distributed setup.
eval_logger.warning( eval_logger.warning(
...@@ -359,6 +287,94 @@ class HFLM(TemplateLM): ...@@ -359,6 +287,94 @@ class HFLM(TemplateLM):
f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}" f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
) )
def _get_accelerate_args(
self,
parallelize: Optional[bool] = None,
device_map: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
gpus: Optional[int] = None,
) -> dict:
"""Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
num_machines = int(os.environ.get("WORLD_SIZE", 0)) // num_local_processes
if (
num_machines == 0
and hasattr(self, "accelerator")
and self.accelerator is not None
):
eval_logger.info(
"We are not in a distributed setting for accelerate. Setting model_parallel to False."
)
parallelize = False
if parallelize is None:
# If parallelism is unset by the user, we automatically assign model parallelism
# if enough extra GPUs are available
max_memory_all_gpus = get_max_memory()
# We just want gpu, not cpu, max memory
if "cpu" in max_memory_all_gpus:
del max_memory_all_gpus["cpu"]
parallelize = bool(num_local_processes < len(max_memory_all_gpus))
eval_logger.info(
f"Setting model parallel to {parallelize} since "
f"the number of local processes is {num_local_processes} "
f"and the number of GPUs is {len(max_memory_all_gpus)}"
)
args = {}
if parallelize: # Model parallelism will be used
max_memory = {}
if max_memory_per_gpu is not None: # Using the provided memory requirements
max_memory_per_gpu_map = {
device_idx: max_memory_per_gpu for device_idx in range(gpus)
}
else: # Estimating the possible memory requirements
max_memory_all_gpus = get_max_memory()
if "cpu" in max_memory_all_gpus:
del max_memory_all_gpus["cpu"]
if not hasattr(self, "accelerator"):
max_memory_per_gpu_map = {
k: v for k, v in max_memory_all_gpus.items()
}
else:
# use only 1 / num_processes of the GPUs if we are running under accelerate launch
max_memory_per_gpu_map = {
k: v
for k, v in max_memory_all_gpus.items()
if k % num_local_processes
== (self.accelerator.process_index % num_local_processes)
}
args["max_memory"] = max_memory_per_gpu_map
args["device_map"] = "auto" if device_map is None else device_map
eval_logger.info(
f"Model parallel was set to True, setting max memory per GPU to {max_memory_per_gpu_map} and device map to {args.get('device_map')}"
)
if max_cpu_memory is not None:
max_memory["cpu"] = max_cpu_memory
args["offload_folder"] = offload_folder
elif (
device_map is None
): # No model parallelism, we use the default provided device for our model
if hasattr(self, "accelerator"):
device_map = {"": f"{self.accelerator.device}"}
else:
device_map = {"": str(self.device)}
args["max_memory"] = None
args["device_map"] = device_map
eval_logger.info(
f"Model parallel was set to False, max memory was not set, and device map was set to {device_map}"
)
else:
args["max_memory"] = None
args["device_map"] = None
eval_logger.info("Model parallel was set to False.")
return args
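# Illustrative sketch (not from this commit): the dict returned above is merged into
# the kwargs of `from_pretrained`. With `parallelize=True` and two visible GPUs it
# might look roughly like this (model name and memory figures are placeholders):
#
#     accelerate_kwargs = {
#         "max_memory": {0: "20GiB", 1: "20GiB"},
#         "device_map": "auto",
#         "offload_folder": "./offload",
#     }
#     model = transformers.AutoModelForCausalLM.from_pretrained(
#         "EleutherAI/pythia-160m", **accelerate_kwargs
#     )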
@property @property
def config(self): def config(self):
# return the associated transformers.AutoConfig for the given pretrained model. # return the associated transformers.AutoConfig for the given pretrained model.
...@@ -424,31 +440,29 @@ class HFLM(TemplateLM): ...@@ -424,31 +440,29 @@ class HFLM(TemplateLM):
def tokenizer_name(self) -> str: def tokenizer_name(self) -> str:
return self.tokenizer.name_or_path.replace("/", "__") return self.tokenizer.name_or_path.replace("/", "__")
@property
def chat_template(self) -> str:
if self.tokenizer.chat_template is not None:
return self.tokenizer.chat_template
return self.tokenizer.default_chat_template
def _get_backend( def _get_backend(
self, self,
config: Union[transformers.PretrainedConfig, transformers.AutoConfig], config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
backend: Optional[Literal["default", "causal", "seq2seq"]] = "default", backend: Literal["default", "causal", "seq2seq"] = "default",
trust_remote_code: Optional[bool] = False, trust_remote_code: Optional[bool] = False,
) -> None: ) -> None:
""" """
Helper method during initialization. Helper method during initialization.
Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) model type to be used.
model type to be used. sets `self.AUTO_MODEL_CLASS` appropriately if not already set.
**If not calling HFLM.__init__() or HFLM._get_backend() within a subclass of HFLM,
user must set `self.backend` to be either "causal" or "seq2seq" manually!**
""" """
assert backend in ["default", "causal", "seq2seq"] assert backend in ["default", "causal", "seq2seq"]
if backend != "default": if backend != "default":
# if we've settled on non-default backend, use that manually # if we've settled on non-default backend, use that manually
if backend == "causal": if backend == "causal":
self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM self.backend = backend
elif backend == "seq2seq": elif backend == "seq2seq":
self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM self.backend = backend
eval_logger.info( eval_logger.info(
f"Overrode HF model backend type, and using type '{backend}'" f"Overrode HF model backend type, and using type '{backend}'"
) )
...@@ -461,26 +475,32 @@ class HFLM(TemplateLM): ...@@ -461,26 +475,32 @@ class HFLM(TemplateLM):
# first check if model type is listed under seq2seq models, since some # first check if model type is listed under seq2seq models, since some
# models like MBart are listed in both seq2seq and causal mistakenly in HF transformers. # models like MBart are listed in both seq2seq and causal mistakenly in HF transformers.
# these special cases should be treated as seq2seq models. # these special cases should be treated as seq2seq models.
self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM self.backend = "seq2seq"
eval_logger.info(f"Using model type '{backend}'")
elif ( elif (
getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
): ):
self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM self.backend = "causal"
eval_logger.info(f"Using model type '{backend}'")
else: else:
if not trust_remote_code: if not trust_remote_code:
eval_logger.warning( eval_logger.warning(
"HF model type is neither marked as CausalLM or Seq2SeqLM. \ "HF model type is neither marked as CausalLM or Seq2SeqLM. \
This is expected if your model requires `trust_remote_code=True` but may be an error otherwise." This is expected if your model requires `trust_remote_code=True` but may be an error otherwise."
"Setting backend to causal"
) )
# if model type is neither in HF transformers causal or seq2seq model registries # if model type is neither in HF transformers causal or seq2seq model registries
# then we default to AutoModelForCausalLM # then we default to assuming AutoModelForCausalLM
self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM self.backend = "causal"
eval_logger.info(
f"Model type cannot be determined. Using default model type '{backend}'"
)
assert self.AUTO_MODEL_CLASS in [ if self.AUTO_MODEL_CLASS is None:
transformers.AutoModelForCausalLM, if self.backend == "causal":
transformers.AutoModelForSeq2SeqLM, self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
] elif self.backend == "seq2seq":
return None self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
def _get_config( def _get_config(
self, self,
...@@ -488,6 +508,7 @@ class HFLM(TemplateLM): ...@@ -488,6 +508,7 @@ class HFLM(TemplateLM):
revision: str = "main", revision: str = "main",
trust_remote_code: bool = False, trust_remote_code: bool = False,
) -> None: ) -> None:
"""Return the model config for HuggingFace models"""
self._config = transformers.AutoConfig.from_pretrained( self._config = transformers.AutoConfig.from_pretrained(
pretrained, pretrained,
revision=revision, revision=revision,
...@@ -505,7 +526,6 @@ class HFLM(TemplateLM): ...@@ -505,7 +526,6 @@ class HFLM(TemplateLM):
# (accelerate naive PP (device_map) options) # (accelerate naive PP (device_map) options)
parallelize: Optional[bool] = False, parallelize: Optional[bool] = False,
gpus: Optional[int] = None, gpus: Optional[int] = None,
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None, max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None, max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload", offload_folder: Optional[str] = "./offload",
...@@ -513,6 +533,7 @@ class HFLM(TemplateLM): ...@@ -513,6 +533,7 @@ class HFLM(TemplateLM):
peft: Optional[str] = None, peft: Optional[str] = None,
delta: Optional[str] = None, delta: Optional[str] = None,
autogptq: Optional[Union[bool, str]] = False, autogptq: Optional[Union[bool, str]] = False,
gptqmodel: Optional[bool] = False,
**kwargs, **kwargs,
) -> None: ) -> None:
""" """
...@@ -529,27 +550,18 @@ class HFLM(TemplateLM): ...@@ -529,27 +550,18 @@ class HFLM(TemplateLM):
model_kwargs = kwargs if kwargs else {} model_kwargs = kwargs if kwargs else {}
if parallelize: model_kwargs.update(
model_kwargs.update( self._get_accelerate_args(
_get_accelerate_args( parallelize=parallelize,
device_map_option, # TODO: phase out device_map_option? device_map=kwargs.get("device_map", None),
max_memory_per_gpu, max_memory_per_gpu=max_memory_per_gpu,
max_cpu_memory, max_cpu_memory=max_cpu_memory,
offload_folder, offload_folder=offload_folder,
gpus, gpus=gpus,
)
) )
elif "device_map" not in model_kwargs: )
# set a device_map to initialize model on the right GPU.
# this is needed because it seems that the default behavior
# for quantized models now seems to be device_map="auto"
# which breaks data-parallel mode.
if hasattr(self, "accelerator"):
model_kwargs.update({"device_map": {"": f"{self.accelerator.device}"}})
else:
model_kwargs.update({"device_map": {"": str(self.device)}})
if not autogptq: if not autogptq and not gptqmodel:
if model_kwargs.get("load_in_4bit", None): if model_kwargs.get("load_in_4bit", None):
assert ( assert (
transformers.__version__ >= "4.30.0" transformers.__version__ >= "4.30.0"
...@@ -560,6 +572,7 @@ class HFLM(TemplateLM): ...@@ -560,6 +572,7 @@ class HFLM(TemplateLM):
model_kwargs["bnb_4bit_compute_dtype"] = get_dtype( model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(
model_kwargs["bnb_4bit_compute_dtype"] model_kwargs["bnb_4bit_compute_dtype"]
) )
self._model = self.AUTO_MODEL_CLASS.from_pretrained( self._model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained, pretrained,
revision=revision, revision=revision,
...@@ -568,23 +581,42 @@ class HFLM(TemplateLM): ...@@ -568,23 +581,42 @@ class HFLM(TemplateLM):
**model_kwargs, **model_kwargs,
) )
else: else:
try: if autogptq and gptqmodel:
from auto_gptq import AutoGPTQForCausalLM raise ValueError(
except ModuleNotFoundError: "Cannot use both 'autogptq' and 'gptqmodel' options at the same time."
raise Exception(
"Tried to load auto_gptq, but auto-gptq is not installed ",
"please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
) )
self._model = AutoGPTQForCausalLM.from_quantized( if autogptq:
pretrained, try:
trust_remote_code=trust_remote_code, from auto_gptq import AutoGPTQForCausalLM
model_basename=None if autogptq is True else Path(autogptq).stem, except ModuleNotFoundError as exception:
use_safetensors=True raise type(exception)(
if autogptq is True "Tried to load auto_gptq, but auto-gptq is not installed ",
else autogptq.endswith(".safetensors"), "please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
**model_kwargs, )
)
self._model = AutoGPTQForCausalLM.from_quantized(
pretrained,
trust_remote_code=trust_remote_code,
model_basename=None if autogptq is True else Path(autogptq).stem,
use_safetensors=True
if autogptq is True
else autogptq.endswith(".safetensors"),
**model_kwargs,
)
if gptqmodel:
try:
from gptqmodel import GPTQModel
except ModuleNotFoundError as exception:
raise type(exception)(
"Tried to load gptqmodel, but gptqmodel is not installed ",
"please install gptqmodel via `pip install gptqmodel --no-build-isolation` or `pip install lm-eval[gptqmodel] --no-build-isolation`",
)
self._model = GPTQModel.from_quantized(
pretrained, trust_remote_code=trust_remote_code, **model_kwargs
)
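# Illustrative sketch (not from this commit): selecting the new GPTQModel loading
# path from user code. The checkpoint name is a placeholder.
#
#     lm = HFLM(pretrained="<gptq-quantized-checkpoint>", gptqmodel=True)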
if peft and delta: if peft and delta:
raise ValueError( raise ValueError(
...@@ -597,10 +629,10 @@ class HFLM(TemplateLM): ...@@ -597,10 +629,10 @@ class HFLM(TemplateLM):
raise AssertionError("load_in_4bit requires peft >= 0.4.0") raise AssertionError("load_in_4bit requires peft >= 0.4.0")
if self._model.config.vocab_size != len(self.tokenizer): if self._model.config.vocab_size != len(self.tokenizer):
# resize model for LoRAs with added tokens # resize model for LoRAs with added tokens
self._model.resize_token_embeddings(len(self.tokenizer))
eval_logger.info( eval_logger.info(
f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..." f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
) )
self._model.resize_token_embeddings(len(self.tokenizer))
self._model = PeftModel.from_pretrained( self._model = PeftModel.from_pretrained(
self._model, peft, revision=revision self._model, peft, revision=revision
) )
...@@ -718,7 +750,7 @@ class HFLM(TemplateLM): ...@@ -718,7 +750,7 @@ class HFLM(TemplateLM):
@find_executable_batch_size(starting_batch_size=self.max_batch_size) @find_executable_batch_size(starting_batch_size=self.max_batch_size)
def forward_batch(batch_size): def forward_batch(batch_size):
security_margin = int(0.05 * security_margin_factor * batch_size) security_margin = int(0.05 * security_margin_factor * batch_size)
if self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: if self.backend == "seq2seq":
length = max(max_context_enc, max_cont_enc) length = max(max_context_enc, max_cont_enc)
batched_conts = torch.ones( batched_conts = torch.ones(
(batch_size + security_margin, length), device=self.device (batch_size + security_margin, length), device=self.device
...@@ -773,7 +805,7 @@ class HFLM(TemplateLM): ...@@ -773,7 +805,7 @@ class HFLM(TemplateLM):
# by default for CausalLM - false or self.add_bos_token is set # by default for CausalLM - false or self.add_bos_token is set
if add_special_tokens is None: if add_special_tokens is None:
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: if self.backend == "causal":
special_tokens_kwargs = { special_tokens_kwargs = {
"add_special_tokens": False or self.add_bos_token "add_special_tokens": False or self.add_bos_token
} }
...@@ -801,7 +833,7 @@ class HFLM(TemplateLM): ...@@ -801,7 +833,7 @@ class HFLM(TemplateLM):
self.tokenizer.padding_side = padding_side self.tokenizer.padding_side = padding_side
add_special_tokens = {} add_special_tokens = {}
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: if self.backend == "causal":
add_special_tokens = {"add_special_tokens": False or self.add_bos_token} add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
encoding = self.tokenizer( encoding = self.tokenizer(
...@@ -879,14 +911,14 @@ class HFLM(TemplateLM): ...@@ -879,14 +911,14 @@ class HFLM(TemplateLM):
def _select_cont_toks( def _select_cont_toks(
self, logits: torch.Tensor, contlen: int = None, inplen: int = None self, logits: torch.Tensor, contlen: int = None, inplen: int = None
) -> torch.Tensor: ) -> torch.Tensor:
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: if self.backend == "causal":
assert ( assert (
contlen and inplen contlen and inplen
), "Must pass input len and cont. len to select scored logits for causal LM" ), "Must pass input len and cont. len to select scored logits for causal LM"
# discard right-padding. # discard right-padding.
# also discard the input/context tokens. we'll only score continuations. # also discard the input/context tokens. we'll only score continuations.
logits = logits[inplen - contlen : inplen] logits = logits[inplen - contlen : inplen]
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: elif self.backend == "seq2seq":
assert ( assert (
contlen and not inplen contlen and not inplen
), "Selecting scored logits for Seq2SeqLM requires only cont. len" ), "Selecting scored logits for Seq2SeqLM requires only cont. len"
...@@ -944,6 +976,9 @@ class HFLM(TemplateLM): ...@@ -944,6 +976,9 @@ class HFLM(TemplateLM):
string_nll = sum(string_nll) string_nll = sum(string_nll)
loglikelihoods.append(string_nll) loglikelihoods.append(string_nll)
# cache this loglikelihood_rolling request
self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
return loglikelihoods return loglikelihoods
def _batch_scheduler(self, pos, n_reordered_requests): def _batch_scheduler(self, pos, n_reordered_requests):
...@@ -1000,8 +1035,7 @@ class HFLM(TemplateLM): ...@@ -1000,8 +1035,7 @@ class HFLM(TemplateLM):
requests, requests,
sort_fn=_collate, sort_fn=_collate,
group_by="contexts" group_by="contexts"
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM if self.backend == "causal" and self.logits_cache
and self.logits_cache
else None, else None,
group_fn=_lookup_one_token_cont, group_fn=_lookup_one_token_cont,
) )
...@@ -1058,14 +1092,14 @@ class HFLM(TemplateLM): ...@@ -1058,14 +1092,14 @@ class HFLM(TemplateLM):
# cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
# when too long to fit in context, truncate from the left # when too long to fit in context, truncate from the left
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: if self.backend == "causal":
inp = torch.tensor( inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length + 1) :][:-1], (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
dtype=torch.long, dtype=torch.long,
device=self.device, device=self.device,
) )
(inplen,) = inp.shape (inplen,) = inp.shape
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: elif self.backend == "seq2seq":
inp = torch.tensor( inp = torch.tensor(
(context_enc)[-self.max_length :], (context_enc)[-self.max_length :],
dtype=torch.long, dtype=torch.long,
...@@ -1105,11 +1139,11 @@ class HFLM(TemplateLM): ...@@ -1105,11 +1139,11 @@ class HFLM(TemplateLM):
# create encoder attn mask and batched conts, if seq2seq # create encoder attn mask and batched conts, if seq2seq
call_kwargs = {} call_kwargs = {}
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: if self.backend == "causal":
batched_inps = pad_and_concat( batched_inps = pad_and_concat(
padding_len_inp, inps, padding_side="right" padding_len_inp, inps, padding_side="right"
) # [batch, padding_len_inp] ) # [batch, padding_len_inp]
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: elif self.backend == "seq2seq":
# TODO: left-pad encoder inps and mask? # TODO: left-pad encoder inps and mask?
batched_inps = pad_and_concat( batched_inps = pad_and_concat(
padding_len_inp, inps padding_len_inp, inps
...@@ -1142,7 +1176,7 @@ class HFLM(TemplateLM): ...@@ -1142,7 +1176,7 @@ class HFLM(TemplateLM):
# from prompt/prefix tuning tokens, if applicable # from prompt/prefix tuning tokens, if applicable
ctx_len = ( ctx_len = (
inplen + (logits.shape[0] - padding_len_inp) inplen + (logits.shape[0] - padding_len_inp)
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM if self.backend == "causal"
else None else None
) )
logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len) logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
...@@ -1178,7 +1212,13 @@ class HFLM(TemplateLM): ...@@ -1178,7 +1212,13 @@ class HFLM(TemplateLM):
res.append(answer) res.append(answer)
self.cache_hook.add_partial("loglikelihood", request_str, answer) if request_str is not None:
# special case: loglikelihood_rolling produces a number of loglikelihood requests
# all with cache key None. instead do add_partial on the per-example level
# in the loglikelihood_rolling() function for those.
self.cache_hook.add_partial(
"loglikelihood", request_str, answer
)
pbar.update(1) pbar.update(1)
pbar.close() pbar.close()
...@@ -1260,7 +1300,7 @@ class HFLM(TemplateLM): ...@@ -1260,7 +1300,7 @@ class HFLM(TemplateLM):
max_gen_toks = self.max_gen_toks max_gen_toks = self.max_gen_toks
# set the max length in tokens of inputs ("context_enc") # set the max length in tokens of inputs ("context_enc")
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: if self.backend == "causal":
# max len for inputs = max length, minus room to generate the max new tokens # max len for inputs = max length, minus room to generate the max new tokens
# if the max new tokens is too large, halve it until it fits as we cannot change # if the max new tokens is too large, halve it until it fits as we cannot change
# the max model length # the max model length
...@@ -1268,7 +1308,7 @@ class HFLM(TemplateLM): ...@@ -1268,7 +1308,7 @@ class HFLM(TemplateLM):
while max_ctx_len <= 0: while max_ctx_len <= 0:
max_gen_toks = max_gen_toks // 2 max_gen_toks = max_gen_toks // 2
max_ctx_len = self.max_length - max_gen_toks max_ctx_len = self.max_length - max_gen_toks
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: elif self.backend == "seq2seq":
# max len for inputs = encoder's whole max_length # max len for inputs = encoder's whole max_length
max_ctx_len = self.max_length max_ctx_len = self.max_length
...@@ -1295,7 +1335,7 @@ class HFLM(TemplateLM): ...@@ -1295,7 +1335,7 @@ class HFLM(TemplateLM):
cont_toks_list = cont.tolist() cont_toks_list = cont.tolist()
for cont_toks, context in zip(cont_toks_list, contexts): for cont_toks, context in zip(cont_toks_list, contexts):
# discard context + left-padding toks if using causal decoder-only LM # discard context + left-padding toks if using causal decoder-only LM
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: if self.backend == "causal":
cont_toks = cont_toks[context_enc.shape[1] :] cont_toks = cont_toks[context_enc.shape[1] :]
s = self.tok_decode(cont_toks) s = self.tok_decode(cont_toks)
......
import json
import os
from configparser import ConfigParser
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast
from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.utils import eval_logger, simple_parse_args_string
class LogLikelihoodResult(NamedTuple):
log_likelihood: float
is_greedy: bool
@lru_cache(maxsize=None)
def get_watsonx_credentials(
env_name: str = "YP_QA",
config_path: str = "config.ini",
) -> Dict[str, str]:
"""
Retrieves Watsonx API credentials from environment variables or from a configuration file.
Args:
env_name (str, optional): The name of the environment from which to retrieve credentials. Defaults to "YP_QA".
config_path (str, optional): The file path to the `config.ini` configuration file. Defaults to "config.ini".
Returns:
dict[str, str]: A dictionary containing the credentials necessary for authentication, including
keys such as `apikey`, `url`, and `project_id`.
Raises:
FileNotFoundError: If the specified configuration file does not exist.
AssertionError: If the credentials format is invalid.
"""
def _verify_credentials(creds: Any) -> None:
assert isinstance(creds, dict) and all(
key in creds.keys() for key in ["apikey", "url", "project_id"]
), "Wrong configuration for credentials."
credentials = {
"apikey": os.getenv("WATSONX_API_KEY", None),
"url": os.getenv("WATSONX_URL", None),
"project_id": os.getenv("WATSONX_PROJECT_ID", None),
}
if any(credentials.get(key) is None for key in ["apikey", "url", "project_id"]):
eval_logger.warning(
"One or more required environment variables are missing, trying to load config.ini file."
)
config_path = "config.ini" if not config_path else config_path
if not Path(config_path).is_absolute():
config_path = os.path.join(
Path(__file__).parent.parent.absolute(), config_path
)
if not os.path.exists(config_path):
raise FileNotFoundError(
f"Provided config file path {config_path} does not exist. "
"You need to specify credentials in config.ini file under specified location."
)
config = ConfigParser()
config.read(config_path)
credentials = json.loads(config.get(env_name))
_verify_credentials(credentials)
return credentials
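# Illustrative sketch (not from this commit): the environment-variable path through
# `get_watsonx_credentials`. All values below are placeholders.
#
#     os.environ["WATSONX_API_KEY"] = "<api-key>"
#     os.environ["WATSONX_URL"] = "<watsonx-endpoint-url>"
#     os.environ["WATSONX_PROJECT_ID"] = "<project-id>"
#     creds = get_watsonx_credentials()
#     assert set(creds) == {"apikey", "url", "project_id"}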
@register_model("watsonx_llm")
class WatsonxLLM(LM):
"""
Implementation of LM model interface for evaluating Watsonx model with the lm_eval framework.
See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md for reference.
"""
@classmethod
def create_from_arg_string(
cls: Type["WatsonxLLM"],
arg_string: str,
config_path: Optional[str] = None,
) -> "WatsonxLLM":
"""
Allow the user to specify model parameters (TextGenerationParameters) in CLI arguments.
"""
try:
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
except ImportError:
raise ImportError(
"Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
)
args = simple_parse_args_string(arg_string)
model_id = args.pop("model_id", None)
if model_id is None:
raise ValueError("'model_id' is required, please pass it in 'model_args'")
if not args.get("do_sample", None):
args["temperature"] = None
args["top_p"] = None
args["top_k"] = None
args["seed"] = None
cls.generate_params = {
GenParams.DECODING_METHOD: (
"greedy" if not args.get("do_sample", None) else "sample"
),
GenParams.LENGTH_PENALTY: args.get("length_penalty", None),
GenParams.TEMPERATURE: args.get("temperature", None),
GenParams.TOP_P: args.get("top_p", None),
GenParams.TOP_K: args.get("top_k", None),
GenParams.RANDOM_SEED: args.get("seed", None),
GenParams.REPETITION_PENALTY: args.get("repetition_penalty", None),
GenParams.MIN_NEW_TOKENS: args.get("min_new_tokens", None),
GenParams.MAX_NEW_TOKENS: args.get("max_new_tokens", 256),
GenParams.STOP_SEQUENCES: args.get("stop_sequences", None),
GenParams.TIME_LIMIT: args.get("time_limit", None),
GenParams.TRUNCATE_INPUT_TOKENS: args.get("truncate_input_tokens", None),
GenParams.RETURN_OPTIONS: {
"generated_tokens": True,
"input_tokens": True,
"token_logprobs": True,
"token_ranks": True,
},
}
generate_params = {
k: v for k, v in cls.generate_params.items() if v is not None
}
return cls(
watsonx_credentials=get_watsonx_credentials(config_path),
model_id=model_id,
generate_params=generate_params,
)
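# Illustrative sketch (not from this commit): constructing the wrapper directly from
# an arg string, as the harness does internally. The model_id is a placeholder and
# the WATSONX_* environment variables are assumed to be set.
#
#     lm = WatsonxLLM.create_from_arg_string(
#         "model_id=<your-watsonx-model-id>,max_new_tokens=256,do_sample=False"
#     )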
def __init__(
self,
watsonx_credentials: Dict,
model_id,
generate_params: Optional[Dict[Any, Any]] = None,
) -> None:
try:
from ibm_watsonx_ai import APIClient
from ibm_watsonx_ai.foundation_models import ModelInference
except ImportError:
raise ImportError(
"Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
)
super().__init__()
client = APIClient(watsonx_credentials)
project_id = watsonx_credentials.get("project_id", None)
deployment_id = watsonx_credentials.get("deployment_id", None)
client.set.default_project(project_id)
self.generate_params = generate_params or {}
self.model = ModelInference(
model_id=model_id,
deployment_id=deployment_id,
api_client=client,
project_id=project_id,
)
self._model_id = model_id
@staticmethod
def _has_stop_token(response_tokens: List[str], context_tokens: List[str]) -> bool:
"""
Determines whether a stop token has been generated in the `response_tokens` compared to the `context_tokens`.
If the tokens do not match as expected, the function raises a RuntimeError, indicating a possible
misalignment between the tokens generated by the tokenizer and the model.
Args:
response_tokens (List[str]): The List of tokens generated as a response by the model.
context_tokens (List[str]): The List of tokens representing the input context.
Returns:
bool: True if the final token differs between `response_tokens` and `context_tokens`
(most likely a trailing stop sequence such as `</s>`); False if the tokens align exactly.
Raises:
RuntimeError: If there is an unexpected mismatch between the `response_tokens` and the `context_tokens`.
"""
context_length = len(context_tokens)
if response_tokens[: context_length - 1] == context_tokens[:-1]:
return (
response_tokens[-1] != context_tokens[-1]
) # only last token differs, probably stop sequence (</s>)
raise RuntimeError(
f"There is an unexpected difference between tokenizer and model tokens:\n"
f"context_tokens={context_tokens}\n"
f"response_tokens={response_tokens[:context_length]}"
)
def _check_model_logprobs_support(self):
"""
Verifies if the model supports returning log probabilities for input tokens.
This function sends a prompt to the model and checks whether the model's response
includes log probabilities for the input tokens. If log probabilities are not present,
it raises a `RuntimeError`, indicating that the model is not supported.
Raises:
RuntimeError: If the model does not return log probabilities for input tokens.
"""
tokens = self.model.generate_text(
prompt=["The best ice cream flavor is:"],
params=self.generate_params,
raw_response=True,
)[0]["results"][0]
if all(token.get("logprob", None) is None for token in tokens["input_tokens"]):
raise RuntimeError(
f"Model {self._model_id} is not supported: does not return logprobs for input tokens"
)
def _get_log_likelihood(
self,
input_tokens: List[Dict[str, float]],
context_tokens: List[Dict[str, float]],
) -> LogLikelihoodResult:
"""
Calculates the log likelihood of the continuation tokens, i.e. the input tokens that follow the context tokens.
Args:
input_tokens (List[dict[str, float]]): A List of token dictionaries, each containing
token information like `text` and `logprob`.
context_tokens (List[dict[str, float]]): A List of token dictionaries representing
the input context.
Returns:
LogLikelihoodResult: An object containing the calculated log likelihood and a boolean
flag indicating if the tokens were generated greedily.
"""
response_tokens = [token["text"] for token in input_tokens]
context_length = len(context_tokens)
if self._has_stop_token(response_tokens, context_tokens):
context_length -= 1
return LogLikelihoodResult(
log_likelihood=sum(
token.get("logprob", 0) for token in input_tokens[context_length:]
),
is_greedy=all(
token["rank"] == 1 for token in input_tokens[context_length:]
),
)
def generate_until(self, requests: List[Instance]) -> List[str]:
"""
Generates text responses for a List of requests, with progress tracking and caching.
Args:
requests (List[Instance]): A List of instances, each containing a text input to be processed.
Returns:
List[str]: A List of generated responses.
"""
requests = [request.args[0] for request in requests]
results = []
batch_size = 5
for i in tqdm(
range(0, len(requests), batch_size),
desc=f"Running generate_until function with batch size {batch_size}",
):
batch = requests[i : i + batch_size]
try:
responses = self.model.generate_text(batch, self.generate_params)
except Exception as exp:
eval_logger.error(f"Error while generating text {exp}")
continue
for response, context in zip(responses, batch):
results.append(response)
self.cache_hook.add_partial("generated_text", context, response)
eval_logger.info("Cached responses")
return results
def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
"""
Args:
requests: Each request contains Instance.args : Tuple[str, str] containing:
1. an input string to the LM and
2. a target string on which the loglikelihood of the LM producing this target,
conditioned on the input, will be returned.
Returns:
tuple (loglikelihood, is_greedy) for each request according to the input order:
loglikelihood: log probability of generating the target string conditioned on the input
is_greedy: True if and only if the target string would be generated by greedy sampling from the LM
"""
try:
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
except ImportError:
raise ImportError(
"Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
)
self._check_model_logprobs_support()
self.generate_params[GenParams.MAX_NEW_TOKENS] = 1
requests = [request.args for request in requests]
results: List[LogLikelihoodResult] = []
batch_size = 5
for i in tqdm(
range(0, len(requests), batch_size),
desc=f"Running loglikelihood function with batch size {batch_size}",
):
batch = requests[i : i + batch_size]
try:
tokenized_contexts = [
self.model.tokenize(prompt=context, return_tokens=True)["result"][
"tokens"
]
for context, _ in batch
]
except Exception as exp:
eval_logger.error(f"Error while model tokenize:\n {exp}")
continue
input_prompts = [context + continuation for context, continuation in batch]
try:
responses = self.model.generate_text(
prompt=input_prompts, params=self.generate_params, raw_response=True
)
except Exception as exp:
eval_logger.error(f"Error while model generate text:\n {exp}")
continue
for response, tokenized_context, (context, continuation) in zip(
responses, tokenized_contexts, batch
):
log_likelihood_response = self._get_log_likelihood(
response["results"][0]["input_tokens"], tokenized_context
)
results.append(log_likelihood_response)
self.cache_hook.add_partial(
"loglikelihood",
(context, continuation),
(
log_likelihood_response.log_likelihood,
log_likelihood_response.is_greedy,
),
)
eval_logger.info("Cached batch")
return cast(List[Tuple[float, bool]], results)
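# Illustrative sketch (not from this commit): the shape of a single loglikelihood
# request/result pair. Values are hypothetical.
#
#     request.args == ("Question: 2 + 2 = ?\nAnswer:", " 4")
#     # -> one (log_likelihood, is_greedy) entry per request, e.g. (-1.7, True)
#     #    when " 4" is also the model's greedy continuation.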
def loglikelihood_rolling(self, requests) -> List[Tuple[float, bool]]:
"""
Used to evaluate perplexity on a data distribution.
Args:
requests: Each request contains Instance.args : tuple[str] containing an input string to the model whose
entire loglikelihood, conditioned on purely the EOT token, will be calculated.
Returns:
tuple (loglikelihood,) for each request according to the input order:
loglikelihood: the log probability of producing each piece of text given no starting input.
"""
try:
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
except ImportError:
raise ImportError(
"Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
)
self._check_model_logprobs_support()
self.generate_params[GenParams.MAX_NEW_TOKENS] = 1
requests = [request.args[0] for request in requests]
results: List[LogLikelihoodResult] = []
batch_size = 5
for i in tqdm(
range(0, len(requests), batch_size),
desc=f"Running loglikelihood_rolling function with batch size {batch_size}",
):
batch = requests[i : i + batch_size]
try:
responses = self.model.generate_text(
prompt=batch, params=self.generate_params, raw_response=True
)
except Exception as exp:
eval_logger.error(f"Error while model generate text:\n {exp}")
continue
for response, context in zip(responses, batch):
try:
log_likelihood_response = self._get_log_likelihood(
response["results"][0]["input_tokens"], []
)
results.append(log_likelihood_response)
self.cache_hook.add_partial(
"loglikelihood_rolling",
context,
(
log_likelihood_response.log_likelihood,
log_likelihood_response.is_greedy,
),
)
except Exception as exp:
eval_logger.error(
f"Error during log likelihood calculation:\n {exp}"
)
continue
eval_logger.info("Cached batch")
return cast(List[Tuple[float, bool]], results)
...@@ -69,8 +69,8 @@ class MambaLMWrapper(HFLM): ...@@ -69,8 +69,8 @@ class MambaLMWrapper(HFLM):
) -> None: ) -> None:
try: try:
from mamba_ssm.utils.hf import load_config_hf # noqa: F811 from mamba_ssm.utils.hf import load_config_hf # noqa: F811
except ModuleNotFoundError: except ModuleNotFoundError as exception:
raise Exception( raise type(exception)(
"attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \ "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`", please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
) )
...@@ -88,8 +88,8 @@ please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba ...@@ -88,8 +88,8 @@ please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba
) -> None: ) -> None:
try: try:
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel # noqa: F811 from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel # noqa: F811
except ModuleNotFoundError: except ModuleNotFoundError as exception:
raise Exception( raise type(exception)(
"attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \ "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`", please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
) )
......
...@@ -39,8 +39,8 @@ def _patch_pretrained_cfg( ...@@ -39,8 +39,8 @@ def _patch_pretrained_cfg(
): ):
try: try:
import omegaconf import omegaconf
except ModuleNotFoundError: except ModuleNotFoundError as exception:
raise Exception( raise type(exception)(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed" "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, " "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.", "or installing nemo following https://github.com/NVIDIA/NeMo.",
...@@ -79,8 +79,8 @@ def load_model( ...@@ -79,8 +79,8 @@ def load_model(
MegatronGPTModel, MegatronGPTModel,
) )
from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector
except ModuleNotFoundError: except ModuleNotFoundError as exception:
raise Exception( raise type(exception)(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed" "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, " "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.", "or installing nemo following https://github.com/NVIDIA/NeMo.",
...@@ -140,8 +140,8 @@ def load_model( ...@@ -140,8 +140,8 @@ def load_model(
def setup_distributed_environment(trainer): def setup_distributed_environment(trainer):
try: try:
from nemo.utils.app_state import AppState from nemo.utils.app_state import AppState
except ModuleNotFoundError: except ModuleNotFoundError as exception:
raise Exception( raise type(exception)(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed" "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, " "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.", "or installing nemo following https://github.com/NVIDIA/NeMo.",
...@@ -194,8 +194,8 @@ class NeMoLM(LM): ...@@ -194,8 +194,8 @@ class NeMoLM(LM):
from pytorch_lightning.trainer.trainer import Trainer from pytorch_lightning.trainer.trainer import Trainer
self.generate = generate self.generate = generate
except ModuleNotFoundError: except ModuleNotFoundError as exception:
raise Exception( raise type(exception)(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed" "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, " "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.", "or installing nemo following https://github.com/NVIDIA/NeMo.",
...@@ -386,6 +386,9 @@ class NeMoLM(LM): ...@@ -386,6 +386,9 @@ class NeMoLM(LM):
string_nll = sum(string_nll) string_nll = sum(string_nll)
loglikelihoods.append(string_nll) loglikelihoods.append(string_nll)
# cache this loglikelihood_rolling request
self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
return loglikelihoods return loglikelihoods
def _loglikelihood_tokens(self, requests, disable_tqdm=False): def _loglikelihood_tokens(self, requests, disable_tqdm=False):
...@@ -468,6 +471,9 @@ class NeMoLM(LM): ...@@ -468,6 +471,9 @@ class NeMoLM(LM):
answer = (logprob, is_greedy) answer = (logprob, is_greedy)
if cache_key is not None: if cache_key is not None:
# special case: loglikelihood_rolling produces a number of loglikelihood requests
# all with cache key None. instead do add_partial on the per-example level
# in the loglikelihood_rolling() function for those.
self.cache_hook.add_partial("loglikelihood", cache_key, answer) self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer) res.append(answer)
......
...@@ -38,8 +38,8 @@ class SparseMLLM(HFLM): ...@@ -38,8 +38,8 @@ class SparseMLLM(HFLM):
) -> None: ) -> None:
try: try:
from sparseml.transformers import SparseAutoModelForCausalLM from sparseml.transformers import SparseAutoModelForCausalLM
except ModuleNotFoundError: except ModuleNotFoundError as exception:
raise Exception( raise type(exception)(
"Package `sparseml` is not installed. " "Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`" "Please install it via `pip install sparseml[transformers]`"
) )
...@@ -88,8 +88,8 @@ class SparseMLLM(HFLM): ...@@ -88,8 +88,8 @@ class SparseMLLM(HFLM):
def _get_config(self, pretrained: str, **kwargs) -> None: def _get_config(self, pretrained: str, **kwargs) -> None:
try: try:
from sparseml.transformers import SparseAutoConfig from sparseml.transformers import SparseAutoConfig
except ModuleNotFoundError: except ModuleNotFoundError as exception:
raise Exception( raise type(exception)(
"Package `sparseml` is not installed. " "Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`" "Please install it via `pip install sparseml[transformers]`"
) )
...@@ -112,8 +112,8 @@ class SparseMLLM(HFLM): ...@@ -112,8 +112,8 @@ class SparseMLLM(HFLM):
) -> None: ) -> None:
try: try:
from sparseml.transformers import SparseAutoTokenizer from sparseml.transformers import SparseAutoTokenizer
except ModuleNotFoundError: except ModuleNotFoundError as exception:
raise Exception( raise type(exception)(
"Package `sparseml` is not installed. " "Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`" "Please install it via `pip install sparseml[transformers]`"
) )
...@@ -171,8 +171,8 @@ class DeepSparseLM(LM): ...@@ -171,8 +171,8 @@ class DeepSparseLM(LM):
try: try:
import deepsparse import deepsparse
except ModuleNotFoundError: except ModuleNotFoundError as exception:
raise Exception( raise type(exception)(
"Package `deepsparse` is not installed. " "Package `deepsparse` is not installed. "
"Please install it via `pip install deepsparse[transformers]`" "Please install it via `pip install deepsparse[transformers]`"
) )
...@@ -321,6 +321,9 @@ class DeepSparseLM(LM): ...@@ -321,6 +321,9 @@ class DeepSparseLM(LM):
res.append(answer) res.append(answer)
if cache_key is not None: if cache_key is not None:
# special case: loglikelihood_rolling produces a number of loglikelihood requests
# all with cache key None. instead do add_partial on the per-example level
# in the loglikelihood_rolling() function for those.
self.cache_hook.add_partial("loglikelihood", cache_key, answer) self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res) return re_ord.get_original(res)
......
import copy import copy
import json
import logging import logging
import subprocess
from collections import defaultdict from collections import defaultdict
from typing import List, Optional, Union from typing import List, Optional, Union
...@@ -33,54 +31,6 @@ except ImportError: ...@@ -33,54 +31,6 @@ except ImportError:
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def get_nc_count() -> Union[int, None]:
"""Returns the number of neuron cores on the current instance."""
try:
cmd = "neuron-ls --json-output"
result = subprocess.run(cmd, shell=True, capture_output=True)
print(f"inferring nc_count from `neuron-ls` {result.stdout}")
json_output = json.loads(result.stdout)
count = sum([x["nc_count"] for x in json_output])
print(f"nc_count={count}")
return count
except Exception:
return None
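For reference, a sketch of the JSON shape `get_nc_count` expects from `neuron-ls --json-output`; the sample payload below is made up, and the real output depends on the instance type:

```python
import json

# hypothetical stdout from `neuron-ls --json-output`: two devices, two cores each
sample_stdout = '[{"neuron_device": 0, "nc_count": 2}, {"neuron_device": 1, "nc_count": 2}]'
nc_count = sum(entry["nc_count"] for entry in json.loads(sample_stdout))
assert nc_count == 4
```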
def wrap_constant_batch_size(func):
def _decorator(self, input_ids):
"""input_ids a 2D array with batch_size on dim=0
makes sure the func runs with self.batch_size
"""
# access a from TestSample
batch_size = input_ids.shape[0]
if batch_size < self.batch_size:
# handle the event of input_ids.shape[0] != batch_size
# Neuron cores expect constant batch_size
input_ids = torch.concat(
(
input_ids,
# add missing_batch_size dummy
torch.zeros(
[self.batch_size - batch_size, *input_ids.size()[1:]],
dtype=input_ids.dtype,
device=input_ids.device,
),
),
dim=0,
)
elif batch_size > self.batch_size:
raise ValueError(
f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
)
# return the forward pass that requires constant batch size
return func(self, input_ids)[:batch_size]
return _decorator
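A usage sketch for the decorator above; `TinyModel` is a hypothetical stand-in for the Neuron LM wrapper, whose wrapped method always sees `self.batch_size` rows while the caller gets back only the rows for its real inputs:

```python
import torch


class TinyModel:
    batch_size = 4  # static batch size the Neuron graph was compiled for

    @wrap_constant_batch_size
    def _forward(self, input_ids):
        # pretend forward pass: one "logit" per input token
        return input_ids.float().unsqueeze(-1)


out = TinyModel()._forward(torch.ones((2, 5), dtype=torch.long))
assert out.shape[0] == 2  # the two padding rows were stripped again
```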
class CustomNeuronModelForCausalLM(NeuronModelForCausalLM): class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
"""NeuronModelForCausalLM with `stopping_criteria` in `generate`""" """NeuronModelForCausalLM with `stopping_criteria` in `generate`"""
...@@ -146,7 +96,7 @@ class CustomNeuronModelForCausalLM(NeuronModelForCausalLM): ...@@ -146,7 +96,7 @@ class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
raise ValueError( raise ValueError(
f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})" f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
) )
elif batch_size < self.batch_size: elif batch_size < self.batch_size and not self.continuous_batching:
logger.warning( logger.warning(
"Inputs will be padded to match the model static batch size. This will increase latency." "Inputs will be padded to match the model static batch size. This will increase latency."
) )
...@@ -158,8 +108,6 @@ class CustomNeuronModelForCausalLM(NeuronModelForCausalLM): ...@@ -158,8 +108,6 @@ class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
if attention_mask is not None: if attention_mask is not None:
padding = torch.zeros(padding_shape, dtype=torch.int64) padding = torch.zeros(padding_shape, dtype=torch.int64)
padded_attention_mask = torch.cat([attention_mask, padding]) padded_attention_mask = torch.cat([attention_mask, padding])
# Drop the current generation context and clear the Key/Value cache
self.reset_generation()
output_ids = self.generate_tokens( output_ids = self.generate_tokens(
padded_input_ids, padded_input_ids,
...@@ -179,8 +127,6 @@ class NEURON_HF(TemplateLM): ...@@ -179,8 +127,6 @@ class NEURON_HF(TemplateLM):
Tested with neuron 2.17.0 Tested with neuron 2.17.0
""" """
_DEFAULT_MAX_LENGTH = 2048
def __init__( def __init__(
self, self,
pretrained: Optional[str] = "TinyLlama/TinyLlama-1.1B-Chat-v1.0", pretrained: Optional[str] = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
...@@ -198,12 +144,12 @@ class NEURON_HF(TemplateLM): ...@@ -198,12 +144,12 @@ class NEURON_HF(TemplateLM):
add_bos_token: Optional[bool] = False, add_bos_token: Optional[bool] = False,
) -> None: ) -> None:
if not NEURON_AVAILABLE: if not NEURON_AVAILABLE:
raise Exception( raise ImportError(
"Tried to load neuron model, but neuron is not installed ", "Tried to load neuron model, but neuron is not installed ",
"please install neuron via pip install transformers-neuron ", "please install neuron via pip install transformers-neuron ",
"also make sure you are running on an AWS inf2 instance", "also make sure you are running on an AWS inf2 instance",
) )
if version.parse(optimum_neuron_version) != version.parse("0.0.17"): if version.parse(optimum_neuron_version) != version.parse("0.0.24"):
logger.warning( logger.warning(
'`optimum-neuron` model requires `pip install "optimum[neuronx]>=0.0.17" ' '`optimum-neuron` model requires `pip install "optimum[neuronx]>=0.0.17" '
"preferably using the Hugging Face Neuron Deep Learning AMI (Ubuntu 22.04) " "preferably using the Hugging Face Neuron Deep Learning AMI (Ubuntu 22.04) "
...@@ -217,34 +163,16 @@ class NEURON_HF(TemplateLM): ...@@ -217,34 +163,16 @@ class NEURON_HF(TemplateLM):
self.batch_size_per_gpu = int(batch_size) self.batch_size_per_gpu = int(batch_size)
batch_size = int(batch_size) batch_size = int(batch_size)
if tp_degree is None:
# execute `neuron-ls --json-output | jq '.[0].nc_count'``
# to get the number of neuron cores on your instance
tp_degree = get_nc_count()
assert isinstance(tp_degree, int), (
f"model_args must include tp_degree. tp_degree must be set to an integer,"
f" but is tp_degree=`{tp_degree}` with type=`{type(tp_degree)}`."
"Set it to number of neuron cores on your instance."
" For inf2.xlarge and inf2.8xlarge, set it to `2`."
" For inf2.24xlarge, set it to `12`."
" For inf2.48xlarge, set it to `24`."
)
# TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "")
self._config = transformers.AutoConfig.from_pretrained( self._config = transformers.AutoConfig.from_pretrained(
pretrained, pretrained,
revision=revision, revision=revision,
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
) )
torch_dtype = lm_eval.models.utils.get_dtype(dtype)
assert torch_dtype in [ revision = str(revision) # cast to string if not already one
torch.float16, # TODO: update this to be less of a hack once subfolder is fixed in HF
torch.bfloat16, revision = revision + ("/" + subfolder if subfolder is not None else "")
], "Only float16 and bfloat16 are supported"
self.tokenizer = transformers.AutoTokenizer.from_pretrained( self.tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer, pretrained if tokenizer is None else tokenizer,
...@@ -253,36 +181,58 @@ class NEURON_HF(TemplateLM): ...@@ -253,36 +181,58 @@ class NEURON_HF(TemplateLM):
use_fast=use_fast_tokenizer, use_fast=use_fast_tokenizer,
) )
# Neuron specific code neuron_config = getattr(self._config, "neuron", None)
if torch_dtype == torch.float16: if neuron_config is None:
self.amp_dtype = "f16" # Check export parameters
elif torch_dtype == torch.bfloat16: if tp_degree is not None:
self.amp_dtype = "bf16" assert isinstance(tp_degree, int), (
elif torch_dtype == torch.float32: f"tp_degree must be set to an integer,"
self.amp_dtype = "f32" f" but is tp_degree=`{tp_degree}` with type=`{type(tp_degree)}`."
else: "Set it to a number lower than the number of neuron cores on your instance."
raise NotImplementedError("Only float16 and bfloat16 are implemented.") " For inf2.xlarge and inf2.8xlarge, set it to `2`."
" For inf2.24xlarge, set it <= `12`."
compiler_args = {"num_cores": tp_degree, "auto_cast_type": self.amp_dtype} " For inf2.48xlarge, set it <= `24`."
input_shapes = { )
"batch_size": batch_size, torch_dtype = lm_eval.models.utils.get_dtype(dtype)
"sequence_length": self._DEFAULT_MAX_LENGTH,
} if torch_dtype == torch.float16:
self.amp_dtype = "f16"
elif torch_dtype == torch.bfloat16:
self.amp_dtype = "bf16"
elif torch_dtype == torch.float32:
self.amp_dtype = "f32"
else:
raise NotImplementedError(
"Only float16/bfloat16/float32 are supported."
)
print( print(f"{'='*20} \n exporting model to neuron")
f"{'='*20} \n loading model to neuron with" self.model = CustomNeuronModelForCausalLM.from_pretrained(
f" {compiler_args}, {input_shapes}..." pretrained,
) revision=revision,
self.model = CustomNeuronModelForCausalLM.from_pretrained( trust_remote_code=trust_remote_code,
pretrained, low_cpu_mem_usage=low_cpu_mem_usage,
revision=revision, export=True,
trust_remote_code=trust_remote_code, batch_size=batch_size,
low_cpu_mem_usage=low_cpu_mem_usage, num_cores=tp_degree,
export=True, auto_cast_type=self.amp_dtype,
**compiler_args, sequence_length=max_length,
**input_shapes, )
) neuron_config = self.model.config.neuron
print(f"SUCCESS: neuron model compiled. \n {'='*20}") print(
f"SUCCESS: neuron model exported with config {neuron_config}. \n {'='*20}"
)
else:
print(
f"{'='*20} \n loading neuron model with config" f" {neuron_config}..."
)
self.model = CustomNeuronModelForCausalLM.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
low_cpu_mem_usage=low_cpu_mem_usage,
)
print(f"SUCCESS: neuron model loaded. \n {'='*20}")
self.truncation = truncation self.truncation = truncation
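A small sketch of the decision made above: if the Hugging Face config already carries a `neuron` section, the checkpoint was pre-exported and is loaded as-is; otherwise it is exported (compiled) with an explicit batch size, `tp_degree`, and cast dtype. `FakeConfig` is a made-up stand-in for the `AutoConfig` result.

```python
def needs_neuron_export(hf_config) -> bool:
    # mirrors `getattr(self._config, "neuron", None) is None` in __init__
    return getattr(hf_config, "neuron", None) is None


class FakeConfig:
    neuron = {"batch_size": 1, "sequence_length": 2048, "auto_cast_type": "bf16"}


assert needs_neuron_export(FakeConfig()) is False  # pre-exported checkpoint: load directly
assert needs_neuron_export(object()) is True       # plain HF checkpoint: export first
```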
...@@ -290,8 +240,6 @@ class NEURON_HF(TemplateLM): ...@@ -290,8 +240,6 @@ class NEURON_HF(TemplateLM):
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.add_bos_token = add_bos_token self.add_bos_token = add_bos_token
self._max_length = max_length
self.batch_schedule = 1 self.batch_schedule = 1
self.batch_sizes = {} self.batch_sizes = {}
...@@ -312,17 +260,7 @@ class NEURON_HF(TemplateLM): ...@@ -312,17 +260,7 @@ class NEURON_HF(TemplateLM):
@property @property
def max_length(self): def max_length(self):
if self._max_length: # if max length manually set, return it return self.model.max_length
return self._max_length
seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
for attr in seqlen_config_attrs:
if hasattr(self.model.config, attr):
return getattr(self.model.config, attr)
if hasattr(self.tokenizer, "model_max_length"):
if self.tokenizer.model_max_length == 1000000000000000019884624838656:
return self._DEFAULT_MAX_LENGTH
return self.tokenizer.model_max_length
return self._DEFAULT_MAX_LENGTH
@property @property
def max_gen_toks(self) -> int: def max_gen_toks(self) -> int:
...@@ -390,34 +328,6 @@ class NEURON_HF(TemplateLM): ...@@ -390,34 +328,6 @@ class NEURON_HF(TemplateLM):
def tok_decode(self, tokens): def tok_decode(self, tokens):
return self.tokenizer.decode(tokens) return self.tokenizer.decode(tokens)
@wrap_constant_batch_size
def _model_call(self, input_ids: torch.Tensor):
"""
get logits for the entire sequence
:param input_ids: torch.Tensor
A torch tensor of shape [batch, sequence_cont]
the size of sequence may vary from call to call
:return
A torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model's decoder-lm head
"""
_, sequence_length = input_ids.shape
with torch.inference_mode():
cache_ids = torch.arange(0, sequence_length, dtype=torch.int32).split(1)
input_ids_split = input_ids.split(1, dim=1)
return torch.concat(
[
self.model.forward(
input_ids=input_id, cache_ids=cache_id, return_dict=False
)[0]
for input_id, cache_id in zip(input_ids_split, cache_ids)
],
dim=1,
)
def _model_generate(self, context, max_length, stop, **generation_kwargs): def _model_generate(self, context, max_length, stop, **generation_kwargs):
# we require users to pass do_sample=True explicitly # we require users to pass do_sample=True explicitly
# for non-greedy gen. This should be reevaluated when considering beam search. # for non-greedy gen. This should be reevaluated when considering beam search.
...@@ -501,7 +411,8 @@ class NEURON_HF(TemplateLM): ...@@ -501,7 +411,8 @@ class NEURON_HF(TemplateLM):
string_nll = sum(string_nll) string_nll = sum(string_nll)
loglikelihoods.append(string_nll) loglikelihoods.append(string_nll)
# cache this loglikelihood_rolling request
self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
return loglikelihoods return loglikelihoods
def _loglikelihood_tokens( def _loglikelihood_tokens(
...@@ -578,15 +489,41 @@ class NEURON_HF(TemplateLM): ...@@ -578,15 +489,41 @@ class NEURON_HF(TemplateLM):
cont_toks_list.append(continuation_enc) cont_toks_list.append(continuation_enc)
inplens.append(inplen) inplens.append(inplen)
# create encoder attn mask and batched conts, if seq2seq # Add dummy inputs up to the model static batch size
call_kwargs = {} if len(inps) < self.batch_size:
inps = inps + [
torch.zeros_like(inps[0]),
] * (self.batch_size - len(inps))
masks = [torch.ones_like(inp) for inp in inps]
batched_inps = lm_eval.models.utils.pad_and_concat( batched_inps = lm_eval.models.utils.pad_and_concat(
padding_len_inp, inps, padding_side="right" padding_len_inp, inps, padding_side="right"
) # [batch, padding_len_inp] ) # [batch, padding_len_inp]
multi_logits = F.log_softmax( batched_masks = lm_eval.models.utils.pad_and_concat(
self._model_call(batched_inps, **call_kwargs), dim=-1 padding_len_inp, masks, padding_side="right"
) # [batch, padding_length (inp or cont), vocab] )
if self.model.model.neuron_config.output_all_logits:
inputs = self.model.prepare_inputs_for_prefill(
batched_inps, batched_masks
)
multi_logits = F.log_softmax(
self.model.forward(**inputs).logits, dim=-1
) # [batch, padding_length (inp or cont), vocab]
else:
# The model will only return the logits for the last input token, so we need
# to iterate over inputs to accumulate logits.
# To speed things up we use the KV cache as we would do when generating.
inputs = self.model.prepare_inputs_for_prefill(
batched_inps[:, :1], batched_masks[:, :1]
)
outputs = [self.model.forward(**inputs).logits]
for i in range(1, padding_len_inp):
inputs = self.model.prepare_inputs_for_decode(
batched_inps[:, : i + 1], batched_masks[:, : i + 1]
)
outputs.append(self.model.forward(**inputs).logits)
multi_logits = F.log_softmax(torch.concat(outputs, dim=1), dim=-1)
for (cache_key, _, _), logits, inplen, cont_toks in zip( for (cache_key, _, _), logits, inplen, cont_toks in zip(
chunk, multi_logits, inplens, cont_toks_list chunk, multi_logits, inplens, cont_toks_list
...@@ -619,7 +556,11 @@ class NEURON_HF(TemplateLM): ...@@ -619,7 +556,11 @@ class NEURON_HF(TemplateLM):
res.append(answer) res.append(answer)
self.cache_hook.add_partial("loglikelihood", cache_key, answer) if cache_key is not None:
# special case: loglikelihood_rolling produces a number of loglikelihood requests
# all with cache key None. instead do add_partial on the per-example level
# in the loglikelihood_rolling() function for those.
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res) return re_ord.get_original(res)
......
import copy
import os import os
from collections import defaultdict from functools import cached_property
from importlib.util import find_spec from typing import Any, Dict, List, Optional, Tuple, Union
from typing import List, Literal, Optional, Tuple
from tqdm import tqdm
import lm_eval.models.utils
from lm_eval import utils
from lm_eval.api.model import LM, TemplateLM
from lm_eval.api.registry import register_model from lm_eval.api.registry import register_model
from lm_eval.models.utils import retry_on_specific_exceptions from lm_eval.models.api_models import TemplateAPI
from lm_eval.utils import eval_logger from lm_eval.utils import eval_logger
def get_result(response) -> Tuple[float, bool]: @register_model("local-completions")
"""Process results from OpenAI API response. class LocalCompletionsAPI(TemplateAPI):
def __init__(
:param response: dict self,
OpenAI API Response base_url=None,
:return: tokenizer_backend="huggingface",
continuation_logprobs: np.array **kwargs,
Log probabilities of continuation tokens ):
is_greedy: bool super().__init__(
whether argmax matches given continuation exactly base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
"""
is_greedy = True
logprobs = response.logprobs.token_logprobs
continuation_logprobs = sum(logprobs)
for i in range(len(response.logprobs.token_logprobs)):
token = response.logprobs.token_logprobs[i]
top_tokens = response.logprobs.top_logprobs[i]
top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
if top_token != token:
is_greedy = False
break
return continuation_logprobs, is_greedy
def oa_completion(client, chat: bool = False, **kwargs):
"""Query OpenAI API for completion.
Retry with back-off until they respond
"""
if not find_spec("openai") or not find_spec("tiktoken"):
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. "
"Please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`"
) )
else:
import openai
def _exception_callback(e: Exception, sleep_time: float) -> None:
import traceback
traceback.print_exc()
@retry_on_specific_exceptions(
on_exceptions=[openai.OpenAIError],
max_retries=None, # retry forever, consider changing
on_exception_callback=_exception_callback,
)
def completion():
if chat:
return client.chat.completions.create(**kwargs)
else:
return client.completions.create(**kwargs)
return completion() def _create_payload(
@register_model("openai-completions", "local-completions")
class OpenaiCompletionsLM(TemplateLM):
_DEFAULT_MAX_LENGTH = 2048
def __init__(
self, self,
model: str, messages: Union[List[List[int]], List[dict], List[str], str],
base_url: str = None, generate=False,
tokenizer: Optional[str] = None, gen_kwargs: Optional[dict] = None,
tokenizer_backend: Literal["tiktoken", "huggingface"] = "tiktoken",
truncate: bool = False,
max_gen_toks: int = 256,
batch_size: int = 1,
seed: int = 1234, seed: int = 1234,
max_length: Optional[int] = None, **kwargs,
) -> None: ) -> dict:
""" if generate:
gen_kwargs.pop("do_sample", False)
:param engine: str if "max_tokens" in gen_kwargs:
OpenAI API engine (e.g. gpt-3.5-turbo-instruct) max_tokens = gen_kwargs.pop("max_tokens")
:param truncate: bool else:
Truncate input if too long (if False and input is too long, throw error) max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
""" temperature = gen_kwargs.pop("temperature", 0)
super().__init__() stop = gen_kwargs.pop("until", ["<|endoftext|>"])
self.seed = seed return {
try: "prompt": messages,
import openai # noqa: E401 "model": self.model,
import tiktoken "max_tokens": max_tokens,
except ModuleNotFoundError: "temperature": temperature,
raise Exception( "stop": stop,
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \ "seed": seed,
please install these via `pip install lm-eval[openai]` or `pip install -e .\"[openai]\"`", **gen_kwargs,
) }
self.model = model
self.base_url = base_url
self.tokenizer_backend = tokenizer_backend
self.truncate = truncate
self._batch_size = int(batch_size)
self._max_gen_toks = max_gen_toks
self._max_length = max_length
# if we have a local model, use HF tokenizer over tiktoken
if self.tokenizer_backend == "huggingface":
import transformers # noqa: E401
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
tokenizer if tokenizer else self.model
)
self.vocab_size = self.tokenizer.vocab
self.end_of_text_token_id = self.tokenizer.eos_token
elif self.tokenizer_backend == "tiktoken":
if self.base_url:
eval_logger.warning(
f"Passed `base_url={self.base_url}` but using Tiktoken tokenizer backend. "
"Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken."
)
self.tokenizer = tiktoken.encoding_for_model(self.model)
self.vocab_size = self.tokenizer.n_vocab
self.end_of_text_token_id = self.tokenizer.eot_token
else:
raise ValueError(
f"Expected tokenizer_backend to be one of ['tiktoken', 'huggingface'] but got {self.tokenizer_backend}"
)
# Read from environment variable OPENAI_API_KEY
# Set to EMPTY for local
openai.api_key = os.environ["OPENAI_API_KEY"]
if self.base_url:
self.client = openai.OpenAI(base_url=self.base_url)
else:
self.client = openai.OpenAI()
@property
def eot_token_id(self):
return self.end_of_text_token_id
@property
def max_length(self) -> int:
if self._max_length:
return self._max_length
else: else:
return self._DEFAULT_MAX_LENGTH return {
"model": self.model,
@property "prompt": messages,
def max_gen_toks(self) -> int: "temperature": 0,
return self._max_gen_toks "max_tokens": 1,
"logprobs": 1,
@property "seed": seed,
def batch_size(self) -> int: "echo": True,
return self._batch_size }
@property @staticmethod
def device(self): def parse_logprobs(
# Isn't used because we override _loglikelihood_tokens outputs: Union[Dict, List[Dict]],
raise NotImplementedError() tokens: List[List[int]] = None,
ctxlens: List[int] = None,
def tok_encode(self, string: str, **kwargs) -> List[int]: **kwargs,
return self.tokenizer.encode(string)
def tok_decode(self, tokens: List[int]) -> str:
return self.tokenizer.decode(tokens)
def _loglikelihood_tokens(
self, requests, disable_tqdm: bool = False
) -> List[Tuple[float, bool]]: ) -> List[Tuple[float, bool]]:
res = [] res = []
if not isinstance(outputs, list):
def _collate(x): outputs = [outputs]
# this doesn't efficiently handle last-token differences yet, but those are kinda annoying because for out in outputs:
# it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations for choice, ctxlen in zip(out["choices"], ctxlens):
# we care about, and so we need some kind of backup for when it isn't assert ctxlen > 0, "Context length must be greater than 0"
toks = x[1] + x[2] logprobs = sum(choice["logprobs"]["token_logprobs"][ctxlen:-1])
return -len(toks), tuple(toks) tokens_logprobs = choice["logprobs"]["token_logprobs"][ctxlen:-1]
top_logprobs = choice["logprobs"]["top_logprobs"][ctxlen:-1]
re_ord = utils.Reorderer(requests, _collate) is_greedy = True
for tok, top in zip(tokens_logprobs, top_logprobs):
for chunk in tqdm( if tok != max(top.values()):
list(lm_eval.models.utils.chunks(re_ord.get_reordered(), self.batch_size)), is_greedy = False
disable=disable_tqdm, break
): res.append((logprobs, is_greedy))
inps = [] return res
ctxlens = []
for cache_key, context_enc, continuation_enc in chunk: @staticmethod
# max_length+1 because the API takes up to 2049 tokens, including the first context token def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
inp = (context_enc + continuation_enc)[-(self.max_length + 1) :]
# TODO: the logic is much simpler if we just look at the length of continuation tokens
ctxlen = len(context_enc) - max(
0, len(context_enc) + len(continuation_enc) - (self.max_length + 1)
)
inps.append(inp)
ctxlens.append(ctxlen)
response = oa_completion(
client=self.client,
model=self.model,
prompt=inps,
max_tokens=0,
temperature=0.0,
logprobs=10,
seed=self.seed,
)
for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(
response.choices, ctxlens, chunk
):
answer = get_result(resp)
res.append(answer)
# partial caching
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res)
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
if not requests:
return []
res = [] res = []
requests = [req.args for req in requests] if not isinstance(outputs, list):
outputs = [outputs]
def _collate(x): for out in outputs:
toks = self.tok_encode(x[0]) for choices in out["choices"]:
return len(toks), x[0] res.append(choices["text"])
return res
re_ord = utils.Reorderer(requests, _collate) @property
def api_key(self):
def sameuntil_chunks(xs, size): return os.environ.get("OPENAI_API_KEY", "")
ret = []
lastuntil = xs[0][1]
for x in xs:
if len(ret) >= size or x[1] != lastuntil:
yield ret, lastuntil
ret = []
lastuntil = x[1]
ret.append(x)
if ret:
yield ret, lastuntil
# todo: more intelligent batching for heterogeneous `until`
for chunk, request_args in tqdm(
list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)),
disable=disable_tqdm,
):
inps = []
self._max_gen_toks = request_args.get("max_gen_toks", self.max_gen_toks)
for context, _ in chunk:
context_enc = self.tok_encode(context)
inp = context_enc[-(self.max_length - self.max_gen_toks) :]
inps.append(inp)
until = request_args.get("until", ["<|endoftext|>"])
request_args["temperature"] = request_args.get("temperature", 0)
response = oa_completion( @register_model("local-chat-completions")
client=self.client, class LocalChatCompletion(LocalCompletionsAPI):
model=self.model, def __init__(
prompt=inps, self,
max_tokens=self.max_gen_toks, base_url=None,
stop=until, tokenizer_backend=None,
seed=self.seed, tokenized_requests=False,
**{ **kwargs,
k: v ):
for k, v in request_args.items() eval_logger.warning(
if k not in {"do_sample", "max_gen_toks", "until"} "chat-completions endpoint requires the `--apply_chat_template` flag."
}, )
super().__init__(
base_url=base_url,
tokenizer_backend=tokenizer_backend,
tokenized_requests=tokenized_requests,
**kwargs,
)
if self._batch_size > 1:
eval_logger.warning(
"Chat completions does not support batching. Defaulting to batch size 1."
) )
for resp, (context, args_) in zip(response.choices, chunk): self._batch_size = 1
s = getattr(resp, "text")
until_ = until
for term in until_:
if len(term) > 0:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial(
"generate_until", (context, {"until": until_}), s
)
res.append(s)
return re_ord.get_original(res)
def _model_call(self, inps):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id): def _create_payload(
# Isn't used because we override generate_until self,
raise NotImplementedError() messages: List[Dict],
generate=False,
gen_kwargs: dict = None,
seed=1234,
**kwargs,
) -> dict:
assert (
type(messages) is not str
), "chat-completions require the --apply_chat_template flag."
gen_kwargs.pop("do_sample", False)
if "max_tokens" in gen_kwargs:
max_tokens = gen_kwargs.pop("max_tokens")
else:
max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
temperature = gen_kwargs.pop("temperature", 0)
stop = gen_kwargs.pop("until", ["<|endoftext|>"])
if not isinstance(stop, (list, tuple)):
stop = [stop]
return {
"messages": messages,
"model": self.model,
"max_tokens": max_tokens,
"temperature": temperature,
"stop": stop[:4],
"seed": seed,
**gen_kwargs,
}
@staticmethod
def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
res = []
if not isinstance(outputs, list):
outputs = [outputs]
for out in outputs:
for choices in out["choices"]:
res.append(choices["message"]["content"])
return res
def tok_encode(
self,
string: Union[str, Any],
left_truncate_len=None,
add_special_tokens=None,
**kwargs,
) -> Union[List[str], List[int], Any]:
return string
def loglikelihood_rolling( def loglikelihood(self, requests, **kwargs):
self, requests, disable_tqdm: bool = False raise NotImplementedError(
) -> List[float]: "Loglikelihood is not supported for chat completions. Consider using the completions API instead."
loglikelihoods = [] )
for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length,
context_len=1,
),
)
)
# TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case @register_model(
rolling_token_windows = [(None,) + x for x in rolling_token_windows] "openai-completions",
)
class OpenAICompletionsAPI(LocalCompletionsAPI):
def __init__(
self,
base_url="https://api.openai.com/v1/completions",
tokenizer_backend="tiktoken",
**kwargs,
):
super().__init__(
base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
)
string_nll = self._loglikelihood_tokens( @cached_property
rolling_token_windows, def api_key(self):
disable_tqdm=True, """Override this property to return the API key for the API request."""
key = os.environ.get("OPENAI_API_KEY", None)
if key is None:
raise ValueError(
"API key not found. Please set the `OPENAI_API_KEY` environment variable."
) )
return key
# discard is_greedy def loglikelihood(self, requests, **kwargs):
string_nll = [x[0] for x in string_nll] assert (
self.model
in [
"babbage-002",
"davinci-002",
]
), f"Prompt loglikelihoods are only supported by OpenAI's API for {['babbage-002', 'davinci-002']}."
return super().loglikelihood(requests, **kwargs)
string_nll = sum(string_nll) def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
loglikelihoods.append(string_nll) return ""
return loglikelihoods
@register_model("openai-chat-completions", "local-chat-completions") @register_model("openai-chat-completions")
class OpenaiChatCompletionsLM(LM): class OpenAIChatCompletion(LocalChatCompletion):
def __init__( def __init__(
self, self,
model: str = "gpt-3.5-turbo", # GPT model or Local model using HuggingFace model paths base_url="https://api.openai.com/v1/chat/completions",
base_url: str = None, tokenizer_backend=None,
truncate: bool = False, tokenized_requests=False,
**kwargs, **kwargs,
) -> None: ):
""" if "o1" in kwargs.get("model", ""):
eval_logger.warning(
:param model: str "o1 models do not support `stop` and only support temperature=1"
Implements an OpenAI-style chat completion API for
accessing both OpenAI OR locally-hosted models using
HuggingFace Tokenizer
OpenAI API model (e.g. gpt-3.5-turbo)
using the **gen_kwargs passed on init
:param truncate: bool
Truncate input if too long (if False and input is too long, throw error)
"""
super().__init__()
try:
import openai # noqa: E401
except ModuleNotFoundError:
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
) )
self.model = model super().__init__(
self.base_url = base_url base_url=base_url,
self.truncate = truncate tokenizer_backend=tokenizer_backend,
tokenized_requests=tokenized_requests,
# Read from environment variable OPENAI_API_KEY **kwargs,
# Set to EMPTY for local )
if self.base_url:
self.client = openai.OpenAI(base_url=self.base_url)
else:
self.client = openai.OpenAI() # openai.AsyncOpenAI()
@property
def max_length(self) -> int:
# Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
return 2048
@property
def max_gen_toks(self) -> int:
return 256
@property
def batch_size(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
@property
def device(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
res = defaultdict(list)
re_ords = {}
# we group requests by their generation_kwargs, @cached_property
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling def api_key(self):
# in the same batch. """Override this property to return the API key for the API request."""
grouper = lm_eval.models.utils.Grouper(requests, lambda x: str(x.args[1])) key = os.environ.get("OPENAI_API_KEY", None)
for key, reqs in grouper.get_grouped().items(): if key is None:
# within each set of reqs for given kwargs, we reorder by token length, descending. raise ValueError(
re_ords[key] = utils.Reorderer( "API key not found. Please set the `OPENAI_API_KEY` environment variable."
[req.args for req in reqs], lambda x: (-len(x[0]), x[0])
) )
return key
pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0))) def loglikelihood(self, requests, **kwargs):
for key, re_ord in re_ords.items(): raise NotImplementedError(
# n needs to be 1 because messages in "Loglikelihood (and therefore `multiple_choice`-type tasks) is not supported for chat completions as OpenAI does not provide prompt logprobs. See https://github.com/EleutherAI/lm-evaluation-harness/issues/942#issuecomment-1777836312 or https://github.com/EleutherAI/lm-evaluation-harness/issues/1196 for more background on this limitation."
# chat completion are not batch but )
# is regarded as a single conversation.
chunks = lm_eval.models.utils.chunks(re_ord.get_reordered(), n=1)
for chunk in chunks:
contexts, all_gen_kwargs = zip(*chunk)
inps = [{"role": "user", "content": context} for context in contexts]
gen_kwargs = all_gen_kwargs[0]
until = None
if isinstance(kwargs := copy.deepcopy(gen_kwargs), dict):
if "do_sample" in kwargs.keys():
kwargs.pop("do_sample")
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected repr(kwargs['until']) to be of type Union[str, list] but got {until}"
)
kwargs["stop"] = until
kwargs["max_tokens"] = kwargs.pop("max_gen_toks", self.max_gen_toks)
else:
raise ValueError(
f"Expected repr(kwargs) to be of type repr(dict) but got {kwargs}"
)
response = oa_completion(
client=self.client,
chat=True,
messages=inps,
model=self.model,
**kwargs,
)
for resp, (context, args_) in zip(response.choices, chunk):
s = resp.message.content
if until is not None:
for term in until:
if len(term) > 0:
s = s.split(term)[0]
res[key].append(s)
self.cache_hook.add_partial(
"generate_until", (context, {"until": until}), s
)
pbar.update(1)
# reorder this group of results back to original unsorted form
res[key] = re_ord.get_original(res[key])
pbar.close()
return grouper.get_original(res)
def loglikelihood(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.")
def loglikelihood_rolling(self, requests, disable_tqdm: bool = False): def _create_payload(
raise NotImplementedError("No support for logits.") self,
messages: List[Dict],
generate=False,
gen_kwargs: dict = None,
seed=1234,
**kwargs,
) -> dict:
assert (
type(messages) is not str
), "chat-completions require the --apply_chat_template flag."
gen_kwargs.pop("do_sample", False)
if "max_tokens" in gen_kwargs:
max_tokens = gen_kwargs.pop("max_tokens")
else:
max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
temperature = gen_kwargs.pop("temperature", 0)
stop = gen_kwargs.pop("until", ["<|endoftext|>"])
if not isinstance(stop, (list, tuple)):
stop = [stop]
output = {
"messages": messages,
"model": self.model,
"max_completion_tokens": max_tokens,
"temperature": temperature,
"stop": stop[:4],
"seed": seed,
**gen_kwargs,
}
if "o1" in self.model:
output.pop("stop")
output["temperature"] = 1
return output
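A standalone sketch of how a chat-completions payload like the one above is assembled, including the `o1` special case that drops `stop` and forces `temperature=1`. `build_chat_payload` is an illustrative helper, not part of the harness.

```python
def build_chat_payload(model, messages, gen_kwargs, max_gen_toks=256, seed=1234):
    gen_kwargs = dict(gen_kwargs)  # avoid mutating the caller's dict
    gen_kwargs.pop("do_sample", False)
    if "max_tokens" in gen_kwargs:
        max_tokens = gen_kwargs.pop("max_tokens")
    else:
        max_tokens = gen_kwargs.pop("max_gen_toks", max_gen_toks)
    stop = gen_kwargs.pop("until", ["<|endoftext|>"])
    if not isinstance(stop, (list, tuple)):
        stop = [stop]
    payload = {
        "messages": messages,
        "model": model,
        "max_completion_tokens": max_tokens,
        "temperature": gen_kwargs.pop("temperature", 0),
        "stop": list(stop)[:4],  # the API accepts at most four stop sequences
        "seed": seed,
        **gen_kwargs,
    }
    if "o1" in model:
        payload.pop("stop")
        payload["temperature"] = 1
    return payload


print(build_chat_payload(
    "gpt-4o-mini",
    [{"role": "user", "content": "Hello"}],
    {"until": ["\n\n"], "max_gen_toks": 32},
))
```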