Commit 948f120f authored by Baber's avatar Baber
Browse files

Merge branch 'main' into autobatchtest

# Conflicts:
#	lm_eval/models/huggingface.py
parents a5b1c7a8 bd80a6c0
......@@ -56,8 +56,7 @@ class TaskConfig(dict):
# task naming/registry
task: Optional[str] = None
task_alias: Optional[str] = None
group: Optional[Union[str, list]] = None
group_alias: Optional[Union[str, list]] = None
tag: Optional[Union[str, list]] = None
# HF dataset options.
# which dataset to use,
# and what splits for what purpose
......@@ -68,13 +67,14 @@ class TaskConfig(dict):
validation_split: Optional[str] = None
test_split: Optional[str] = None
fewshot_split: Optional[str] = (
None # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
None # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaluating (?)
)
# formatting / prompting options.
# see docs/advanced_task_guide.md for more info
process_docs: Optional[Callable] = None
doc_to_text: Optional[Union[Callable, str]] = None
doc_to_target: Optional[Union[Callable, str]] = None
doc_to_image: Union[Callable, str] = None
doc_to_choice: Optional[Union[Callable, str, dict, list]] = None
process_results: Optional[Union[Callable, str]] = None
use_prompt: Optional[str] = None
......@@ -365,6 +365,10 @@ class Task(abc.ABC):
def doc_to_target(self, doc):
pass
# not an abstractmethod because not every language-only task has to implement this
def doc_to_image(self, doc):
raise NotImplementedError
def build_all_requests(
self,
*,
......@@ -723,6 +727,10 @@ class ConfigurableTask(Task):
)
self.OUTPUT_TYPE = self.config.output_type
if self.config.doc_to_image is not None:
# mark the task as requiring multimodality.
self.MULTIMODAL = True
if self.config.dataset_path is not None:
self.DATASET_PATH = self.config.dataset_path
......@@ -980,7 +988,7 @@ class ConfigurableTask(Task):
else:
if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0):
eval_logger.warning(
f"Task '{self.config.task}': "
f"[Task: {self.config.task}] "
"num_fewshot > 0 but fewshot_split is None. "
"using preconfigured rule."
)
......@@ -1030,8 +1038,8 @@ class ConfigurableTask(Task):
Whether to apply the chat template to the fewshot context.
:param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param chat_template: Callable
Chat template to be applied to the fewshot context.
:param chat_template:
callable (from lm.apply_chat_template) that takes in a list[Dict] chat transcript and renders it into a string.
:returns: str
The fewshot context.
"""
......@@ -1159,9 +1167,11 @@ class ConfigurableTask(Task):
"""
return doc
def doc_to_text(self, doc):
def doc_to_text(self, doc, doc_to_text=None):
if self.prompt is not None:
doc_to_text = self.prompt
elif doc_to_text is not None:
doc_to_text = doc_to_text
else:
doc_to_text = self.config.doc_to_text
......@@ -1193,9 +1203,11 @@ class ConfigurableTask(Task):
print(type(doc_to_text))
raise TypeError
def doc_to_target(self, doc: Mapping) -> Union[int, str, list]:
def doc_to_target(self, doc: Mapping, doc_to_target=None) -> Union[int, str, list]:
if self.prompt is not None:
doc_to_target = self.prompt
elif doc_to_target is not None:
doc_to_target = doc_to_target
else:
doc_to_target = self.config.doc_to_target
......@@ -1237,9 +1249,11 @@ class ConfigurableTask(Task):
else:
raise TypeError
def doc_to_choice(self, doc: Any) -> List[str]:
def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]:
if self.prompt is not None:
doc_to_choice = self.prompt
elif doc_to_choice is not None:
doc_to_choice = doc_to_choice
elif self.config.doc_to_choice is None:
eval_logger.error("doc_to_choice was called but not set in config")
else:
......@@ -1261,9 +1275,34 @@ class ConfigurableTask(Task):
else:
raise TypeError
def doc_to_image(self, doc: Any, doc_to_image=None) -> Union[int, str, list]:
if doc_to_image is not None:
doc_to_image = doc_to_image
elif self.config.doc_to_image is not None:
doc_to_image = self.config.doc_to_image
else:
return None
if isinstance(doc_to_image, list):
image_feature = [
self.doc_to_image(doc, feature) for feature in doc_to_image
]
return [feature for feature in image_feature if feature is not None]
elif isinstance(doc_to_image, str):
if doc_to_image in self.features:
return doc[doc_to_image]
else:
return ast.literal_eval(utils.apply_template(doc_to_image, doc))
elif callable(doc_to_image):
return doc_to_image(doc)
else:
return None
def construct_requests(
self, doc: dict, ctx: str, **kwargs
) -> Union[List[Instance], Instance]:
aux_arguments = None
if self.OUTPUT_TYPE == "loglikelihood":
arguments = (ctx, self.doc_to_target(doc))
elif self.OUTPUT_TYPE == "loglikelihood_rolling":
......@@ -1281,6 +1320,37 @@ class ConfigurableTask(Task):
# Otherwise they are placed in the continuation
arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]
# TODO: we should raise a warning telling users this will at most ~2x runtime.
if "acc_mutual_info" in self._metric_fn_list.keys():
# if we are calculating multiple choice accuracy
# using mutual information instead of raw loglikelihood as metric, need unconditional lls.
# here mutual info refers to calculating
# log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
# in other words normalizing by subtracting the unconditional logprob of each choice.
aux_arguments = [("", f"{choice}") for choice in choices]
arguments.extend(aux_arguments)
elif self.OUTPUT_TYPE == "generate_until":
arguments = (ctx, deepcopy(self.config.generation_kwargs))
multimodal_arg = {}
if (
self.config.doc_to_image
): # TODO: ensure that non-multimodal tasks aren't getting visual args
multimodal_arg = {
**multimodal_arg,
**{"visual": self.doc_to_image(doc)},
}
if bool(multimodal_arg):
if isinstance(arguments, list):
arguments = [arg + (multimodal_arg,) for arg in arguments]
else:
arguments = arguments + (multimodal_arg,)
if self.OUTPUT_TYPE == "multiple_choice":
request_list = [
Instance(
request_type="loglikelihood",
......@@ -1291,33 +1361,15 @@ class ConfigurableTask(Task):
)
for i, arg in enumerate(arguments)
]
# TODO: we should raise a warning telling users this will at most ~2x runtime.
if "acc_mutual_info" in self._metric_fn_list.keys():
# if we are calculating multiple choice accuracy
# using mutual information instead of raw loglikelihood as metric, need unconditional lls.
# here mutual info refers to calculating
# log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
# in other words normalizing by subtracting the unconditional logprob of each choice.
request_list.extend(
[
Instance(
request_type="loglikelihood",
doc=doc,
arguments=("", "{}".format(choice)),
idx=i,
**kwargs,
)
for i, choice in enumerate(choices)
]
)
return request_list
elif self.OUTPUT_TYPE == "generate_until":
arguments = (ctx, deepcopy(self.config.generation_kwargs))
return Instance(
request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs
request_type=self.OUTPUT_TYPE,
doc=doc,
arguments=arguments,
idx=0,
**kwargs,
)
def process_results(self, doc, results):
......@@ -1446,7 +1498,7 @@ class ConfigurableTask(Task):
# we expect multiple_targets to be a list.
elif self.multiple_target:
gold = list(gold)
elif type(gold) != type(result):
elif type(gold) is not type(result):
# cast gold to the same type as result
gold = type(result)(gold)
......@@ -1520,10 +1572,13 @@ class ConfigurableTask(Task):
def get_config(self, key: str) -> Any:
return getattr(self._config, key, None)
@property
def task_name(self) -> Any:
return getattr(self.config, "task", None)
def __repr__(self):
return (
f"ConfigurableTask(task_name={getattr(self.config, 'task', None)},"
f"group_name={getattr(self.config, 'group', None)},"
f"output_type={self.OUTPUT_TYPE},"
f"num_fewshot={getattr(self.config, 'num_fewshot', None)},"
f"num_samples={len(self.eval_docs)})"
......
......@@ -11,11 +11,14 @@ import torch
import lm_eval.api.metrics
import lm_eval.api.registry
import lm_eval.api.task
import lm_eval.models
from lm_eval.caching.cache import delete_cache
from lm_eval.evaluator_utils import (
consolidate_group_results,
consolidate_results,
get_sample_size,
get_subtask_list,
get_task_list,
prepare_print_tasks,
print_writeout,
......@@ -23,7 +26,10 @@ from lm_eval.evaluator_utils import (
)
from lm_eval.loggers import EvaluationTracker
from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash
from lm_eval.tasks import TaskManager, get_task_dict
from lm_eval.tasks import (
TaskManager,
get_task_dict,
)
from lm_eval.utils import (
eval_logger,
handle_non_serializable,
......@@ -35,7 +41,7 @@ from lm_eval.utils import (
if TYPE_CHECKING:
from lm_eval.api.model import LM
from lm_eval.tasks import Task
from lm_eval.api.task import Task
@positional_deprecated
......@@ -44,7 +50,7 @@ def simple_evaluate(
model_args: Optional[Union[str, dict]] = None,
tasks: Optional[List[Union[str, dict, object]]] = None,
num_fewshot: Optional[int] = None,
batch_size: Optional[int] = None,
batch_size: Optional[Union[int, str]] = None,
max_batch_size: Optional[int] = None,
device: Optional[str] = None,
use_cache: Optional[str] = None,
......@@ -58,7 +64,7 @@ def simple_evaluate(
log_samples: bool = True,
evaluation_tracker: Optional[EvaluationTracker] = None,
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
apply_chat_template: Union[bool, str] = False,
fewshot_as_multiturn: bool = False,
gen_kwargs: Optional[str] = None,
task_manager: Optional[TaskManager] = None,
......@@ -106,8 +112,11 @@ def simple_evaluate(
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
:param system_instruction: str
System instruction to be applied to the prompt
:param apply_chat_template: bool
If True, apply chat template to the prompt
:param apply_chat_template: Union[bool, str]
Specifies whether to apply a chat template to the prompt.
- If set to True, the default chat template is applied.
- If set to a string, applies the specified chat template by name.
Defaults to False (no chat template applied).
:param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param gen_kwargs: str
......@@ -148,6 +157,9 @@ def simple_evaluate(
seed_message.append(f"Setting torch manual seed to {torch_random_seed}")
torch.manual_seed(torch_random_seed)
if fewshot_random_seed is not None:
seed_message.append(f"Setting fewshot manual seed to {fewshot_random_seed}")
if seed_message:
eval_logger.info(" | ".join(seed_message))
......@@ -199,7 +211,9 @@ def simple_evaluate(
)
else:
if not isinstance(model, lm_eval.api.model.LM):
raise TypeError
raise TypeError(
f"The value of `model` passed to simple_evaluate() was of type {type(model)}, but is required to be a subclass of lm_eval.api.model.LM . This may be because you are passing an initialized Hugging Face PreTrainedModel without having wrapped it in `lm_eval.models.huggingface.HFLM(pretrained=my_model)` first."
)
eval_logger.info("Using pre-initialized model")
lm = model
......@@ -219,48 +233,58 @@ def simple_evaluate(
task_manager = TaskManager(verbosity)
task_dict = get_task_dict(tasks, task_manager)
for task_name in task_dict.keys():
task_obj = task_dict[task_name]
if isinstance(task_obj, tuple):
_, task_obj = task_obj
if task_obj is None:
continue
if task_obj.get_config("output_type") == "generate_until":
if gen_kwargs is not None:
task_obj.set_config(
key="generation_kwargs", value=gen_kwargs, update=True
)
if predict_only:
log_samples = True
eval_logger.info(
f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
)
# we have to change the class properties post-hoc. This is pretty hacky.
task_obj.override_metric(metric_name="bypass")
# override tasks' fewshot values to the provided num_fewshot arg value
# except if tasks have it set to 0 manually in their configs--then we should never overwrite that
if num_fewshot is not None:
if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
eval_logger.info(
f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
)
# helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups.
# (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed)
def _adjust_config(task_dict):
adjusted_task_dict = {}
for task_name, task_obj in task_dict.items():
if isinstance(task_obj, dict):
adjusted_task_dict = {
**adjusted_task_dict,
**{task_name: _adjust_config(task_obj)},
}
else:
eval_logger.warning(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
)
task_obj.set_config(key="num_fewshot", value=num_fewshot)
else:
# if num_fewshot not provided, and the task does not define a default one, default to 0
if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None:
task_obj.set_config(key="num_fewshot", value=0)
# fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file)
task_obj.set_fewshot_seed(seed=fewshot_random_seed)
eval_logger.info(
f"Setting fewshot random generator seed to {fewshot_random_seed}"
)
if task_obj.get_config("output_type") == "generate_until":
if gen_kwargs is not None:
task_obj.set_config(
key="generation_kwargs", value=gen_kwargs, update=True
)
if predict_only:
eval_logger.info(
f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
)
# we have to change the class properties post-hoc. This is pretty hacky.
task_obj.override_metric(metric_name="bypass")
# override tasks' fewshot values to the provided num_fewshot arg value
# except if tasks have it set to 0 manually in their configs--then we should never overwrite that
if num_fewshot is not None:
if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
eval_logger.info(
f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
)
else:
eval_logger.warning(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
)
task_obj.set_config(key="num_fewshot", value=num_fewshot)
else:
# if num_fewshot not provided, and the task does not define a default one, default to 0
if (
default_num_fewshot := task_obj.get_config("num_fewshot")
) is None:
task_obj.set_config(key="num_fewshot", value=0)
# fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file)
task_obj.set_fewshot_seed(seed=fewshot_random_seed)
adjusted_task_dict[task_name] = task_obj
return adjusted_task_dict
task_dict = _adjust_config(task_dict)
if check_integrity:
run_task_tests(task_list=tasks)
......@@ -270,7 +294,7 @@ def simple_evaluate(
model_source=model,
model_args=model_args,
system_instruction=system_instruction,
chat_template=lm.chat_template if apply_chat_template else None,
chat_template=lm.chat_template(apply_chat_template),
fewshot_as_multiturn=fewshot_as_multiturn,
)
......@@ -282,7 +306,7 @@ def simple_evaluate(
rewrite_requests_cache=rewrite_requests_cache,
bootstrap_iters=bootstrap_iters,
write_out=write_out,
log_samples=log_samples,
log_samples=True if predict_only else log_samples,
system_instruction=system_instruction,
apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn,
......@@ -343,7 +367,7 @@ def evaluate(
write_out: bool = False,
log_samples: bool = True,
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
apply_chat_template: Union[bool, str] = False,
fewshot_as_multiturn: bool = False,
verbosity: str = "INFO",
):
......@@ -363,8 +387,11 @@ def evaluate(
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
:param system_instruction: str
System instruction to be applied to the prompt
:param apply_chat_template: bool
If True, apply chat template to the prompt
:param apply_chat_template: Union[bool, str]
Specifies whether to apply a chat template to the prompt.
- If set to True, the default chat template is applied.
- If set to a string, applies the specified chat template by name.
Defaults to False (no chat template applied).
:param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:return
......@@ -380,16 +407,40 @@ def evaluate(
padding_requests = defaultdict(int)
# get lists of group hierarchy and each type of request
task_hierarchy, eval_tasks = get_task_list(task_dict)
eval_tasks = get_task_list(task_dict)
if not log_samples:
if not all(
"bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys()
for task_output in eval_tasks
):
raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
# validation check: are we running multimodal task <-> non-multimodal model class, or vice-versa.
incompatible_tasks = []
for task_output in eval_tasks:
task: Task = task_output.task
limit = get_sample_size(task, limit)
if getattr(lm, "MULTIMODAL", False) != getattr(task, "MULTIMODAL", False):
incompatible_tasks.append(task_output.task_name)
if len(incompatible_tasks) > 0:
if not getattr(lm, "MULTIMODAL", False):
raise ValueError(
f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type."
)
else:
raise ValueError(
f"Attempted to run tasks: {incompatible_tasks} which are text-only, but used a model type which only currently supports multimodal tasks."
)
# end multimodality validation check
# Cache the limit arg.
limit_arg = limit
limits = []
for task_output in eval_tasks:
task: Task = task_output.task
limit = get_sample_size(task, limit_arg)
limits.append(limit)
task.build_all_requests(
limit=limit,
rank=lm.rank,
......@@ -397,7 +448,7 @@ def evaluate(
cache_requests=cache_requests,
rewrite_requests_cache=rewrite_requests_cache,
system_instruction=system_instruction,
apply_chat_template=apply_chat_template,
apply_chat_template=bool(apply_chat_template),
fewshot_as_multiturn=fewshot_as_multiturn,
chat_template=getattr(lm, "apply_chat_template")
if apply_chat_template
......@@ -459,7 +510,7 @@ def evaluate(
WORLD_SIZE = lm.world_size
### Postprocess outputs ###
# TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
for task_output in eval_tasks:
for task_output, limit in zip(eval_tasks, limits):
task = task_output.task
task.apply_filters()
......@@ -557,106 +608,45 @@ def evaluate(
### Calculate group metrics ###
if bool(results):
for group, task_list in reversed(task_hierarchy.items()):
if len(task_list) == 0:
# task_hierarchy entries are either
# `group_name: [subtask1, subtask2, ...]`
# or `task_name: []`.
# we only want to operate on groups here.
continue
# collect all higher_is_better values for metrics
# in the group's subtasks.
# TODO: clean this up ; unify with the below metric_list loop?
_higher_is_better = {}
results, versions, show_group_table, *_ = consolidate_group_results(
results, versions, task_dict
)
results_agg, group_agg = prepare_print_tasks(task_dict, results)
subtask_list = get_subtask_list(task_dict)
# collect all higher_is_better values for metrics
# in the group's subtasks.
# TODO: clean this up ; unify with the below metric_list loop?
_higher_is_better = {}
for group, task_list in subtask_list.items():
if (
len(task_list) != 0
): # subtask list will list "task_name": [] for solo tasks
for task in task_list:
for m, h in higher_is_better[task].items():
if m not in _higher_is_better.keys():
_higher_is_better[m] = h
if (
m in _higher_is_better
and _higher_is_better[m] is not None
and _higher_is_better[m] != h
):
eval_logger.warning(
f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
)
_higher_is_better[m] = None
higher_is_better[group] = _higher_is_better
# collect all metric keys used by a subtask in the group.
metric_list = list(
{
key
for task in task_list
for key in results[task].keys()
if "_stderr" not in key and key not in ["alias", "samples"]
}
)
for metric in metric_list:
stderr = "_stderr,".join(metric.split(","))
# gather metrics, sizes, and stderrs from subtasks
metrics = [
results[task][metric]
for task in task_list
if metric in results[task]
] # TODO: copy?
stderrs = [
results[task][stderr]
for task in task_list
if stderr in results[task]
]
sizes = [
results[task]["samples"]
for task in task_list
if metric in results[task]
]
# compute group's pooled metric and stderr
results[group][metric] = (
lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
)
# TODO: calculate grouped metric using aggregation fn
if "N/A" in stderrs:
results[group][stderr] = "N/A"
else:
results[group][stderr] = (
lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
)
# TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
# To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
# results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
results[group]["samples"] = sum(sizes)
results_agg = defaultdict(dict)
groups_agg = defaultdict(dict)
all_tasks_list = list(task_hierarchy.keys())
while True:
add_tasks_list = list(k for k in results_agg.keys())
left_tasks_list = sorted(list(set(all_tasks_list) - set(add_tasks_list)))
if len(left_tasks_list) == 0:
break
_task_hierarchy = {
k: v for k, v in task_hierarchy.items() if k in left_tasks_list
}
_results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results)
results_agg = {**results_agg, **_results_agg}
groups_agg = {**groups_agg, **_groups_agg}
for group_name, task_list in task_hierarchy.items():
if task_list:
num_fewshot[group_name] = num_fewshot[
task_list[0]
] # TODO: validate this
if (
m in _higher_is_better
and _higher_is_better[m] is not None
and _higher_is_better[m] != h
):
eval_logger.warning(
f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
)
_higher_is_better[m] = None
higher_is_better[group] = _higher_is_better
results_dict = {
"results": dict(results_agg.items()),
**({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
"group_subtasks": dict(reversed(task_hierarchy.items())),
**(
{"groups": dict(group_agg.items())}
if (bool(group_agg) & show_group_table)
else {}
),
"group_subtasks": dict(reversed(subtask_list.items())),
"configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())),
"n-shot": dict(sorted(num_fewshot.items())),
......@@ -669,7 +659,7 @@ def evaluate(
len(task_output.task.eval_docs),
),
}
for task_output in eval_tasks
for task_output, limit in zip(eval_tasks, limits)
},
}
if log_samples:
......
......@@ -2,9 +2,15 @@ import collections
import math
import pathlib
import sys
from typing import Dict, List, Optional, Tuple, Union
from lm_eval.api import metrics
from typing import List, Optional, Tuple, Union
from lm_eval.api.group import ConfigurableGroup
from lm_eval.api.metrics import (
aggregate_subtask_metrics,
pooled_sample_stderr,
stderr_for_metric,
)
from lm_eval.api.task import Task
from lm_eval.utils import eval_logger, positional_deprecated
......@@ -98,7 +104,7 @@ class TaskOutput:
self.agg_metrics[metric_key] = agg_fn(items)
self.sample_len = len(items) # TODO: same sample size for each metric?
if isinstance(bootstrap_iters, int):
stderr_fn = metrics.stderr_for_metric(
stderr_fn = stderr_for_metric(
metric=agg_fn,
bootstrap_iters=min(bootstrap_iters, 100)
if metric in ["bleu", "chrf", "ter"]
......@@ -116,23 +122,71 @@ class TaskOutput:
return (
f"TaskOutput(task_name={self.task_name}, "
f"group_name={self.group_name}, "
f"version={self.version},"
f"n_shot={self.n_shot}"
f"task_alias={self.task_alias}, group_alias={self.group_alias})"
f"version={self.version}, "
f"n_shot={self.n_shot}, "
f"task_alias={self.task_alias}, "
f"group_alias={self.group_alias})"
)
def get_task_list(task_dict: dict) -> Tuple[Dict[str, list], List[TaskOutput]]:
task_hierarchy = collections.defaultdict(list)
outputs = list(TaskOutput.from_taskdict(x, y) for x, y in task_dict.items())
for task_output in outputs:
if group_name := task_output.group_name:
task_hierarchy[group_name].append(task_output.task_name)
def get_task_list(task_dict: dict) -> List[TaskOutput]:
outputs = []
for task_name, task_obj in task_dict.items():
if isinstance(task_obj, dict):
_outputs = get_task_list(task_obj)
outputs.extend(_outputs)
else:
task_hierarchy[task_output.task_name] = []
# returns task_hierarchy tracking which groups contain which subtasks,
# and a list of TaskOutput classes for each non-group subtask
return task_hierarchy, [x for x in outputs if x.task]
task_output = TaskOutput.from_taskdict(task_name, task_obj)
outputs.append(task_output)
return outputs
def get_subtask_list(task_dict, task_root=None, depth=0):
subtask_list = {}
for group_obj, task_obj in task_dict.items():
if isinstance(group_obj, ConfigurableGroup):
# group_name = group_obj.group_name
group_name = group_obj.group_name
else:
group_name = group_obj
if isinstance(task_obj, dict):
_subtask_list = get_subtask_list(
task_obj, task_root=group_name, depth=depth + 1
)
if task_root:
subtask_list.setdefault((task_root, depth), []).extend(
[
_task
for (_task, _depth) in _subtask_list.keys()
if (_depth - 1) == depth
]
)
subtask_list = {**subtask_list, **_subtask_list}
else:
if isinstance(task_obj, ConfigurableGroup):
# group_or_task_name = task_obj.group_name
group_or_task_name = task_obj.group_name
elif isinstance(task_obj, Task):
# group_or_task_name = task_obj.task_name
group_or_task_name = task_obj.task_name
if task_root is None:
subtask_list.setdefault((group_or_task_name, depth), [])
else:
subtask_list.setdefault((task_root, depth), []).append(
group_or_task_name
)
if depth == 0:
_subtask_list = {}
for group_key, task_list in subtask_list.items():
group_name, depth = group_key
_subtask_list[group_name] = task_list
subtask_list = _subtask_list
return subtask_list
def print_writeout(task) -> None:
......@@ -155,70 +209,95 @@ def get_sample_size(task, limit: Optional[int]) -> Union[int, None]:
def prepare_print_tasks(
task_hierarchy: dict, results: dict, tab=0
task_dict: dict,
results: dict,
task_depth=0,
group_depth=0,
) -> Tuple[dict, dict]:
"""
@param task_hierarchy: Dictionary representing the group hierarchy of tasks. Each key is a group name and its
@param task_dict: Dictionary representing the group hierarchy of tasks. Each key is a group name and its
value is a list of task names.
@param results: Dictionary containing the results of each task. Each key is a
group name and its value is a dictionary of task results.
@param tab: The indentation level for printing the task
@param task_depth: The indentation level for printing the task
hierarchy. Default is 0.
@param group_depth: The indentation level for printing the group
hierarchy. Default is 0.
@return: A tuple of two dictionaries: results_agg and groups_agg. results_agg contains
aggregated results for each task, and groups_agg contains aggregated results for each group.
Prepares the task hierarchy and aggregates the results for each task and group recursively for printing.
"""
results_agg = collections.defaultdict(dict)
groups_agg = collections.defaultdict(dict)
(group_name, task_list), *_ = task_hierarchy.items()
task_list = sorted(task_list)
results_agg[group_name] = results[group_name].copy()
# results_agg[group_name]["tab"] = tab
if "samples" in results_agg[group_name]:
results_agg[group_name].pop("samples")
tab_string = " " * tab + "- " if tab > 0 else ""
if "alias" in results_agg[group_name]:
results_agg[group_name]["alias"] = tab_string + results_agg[group_name]["alias"]
else:
results_agg[group_name]["alias"] = tab_string + group_name
if len(task_list) > 0:
groups_agg[group_name] = results[group_name].copy()
# groups_agg[group_name]["tab"] = tab
if "samples" in groups_agg[group_name]:
groups_agg[group_name].pop("samples")
if "alias" in groups_agg[group_name]:
groups_agg[group_name]["alias"] = (
tab_string + groups_agg[group_name]["alias"]
def _sort_task_dict(task_dict):
"""
Helper utility. Sorts the task dict at the current level of the hierarchy based on alphabetized task name.
Required so that we end up sorting within each sub-header correctly.
"""
return dict(
sorted(
task_dict.items(),
key=lambda item: item[0].group_name
if isinstance(item[0], ConfigurableGroup)
else item[0],
)
else:
groups_agg[group_name]["alias"] = tab_string + group_name
)
for task_name in task_list:
if task_name in task_hierarchy:
_task_hierarchy = {
**{task_name: task_hierarchy[task_name]},
**task_hierarchy,
}
task_agg = collections.defaultdict(dict)
group_agg = collections.defaultdict(dict)
task_dict = _sort_task_dict(task_dict)
for task_or_group_name, task_or_group_obj in task_dict.items():
tab_string = " " * task_depth + "- " if task_depth > 0 else ""
if isinstance(task_or_group_name, ConfigurableGroup):
# string_name = task_or_group_name.group_name
name = task_or_group_name.group_name
from_configurable_group = True
task_or_group_obj = _sort_task_dict(task_or_group_obj)
elif isinstance(task_or_group_name, str):
name = task_or_group_name
if isinstance(task_or_group_obj, Task):
# string_name = task_or_group_obj.task_name
name = task_or_group_obj.task_name
from_configurable_group = False
task_agg[name] = results[name].copy()
if from_configurable_group:
if task_or_group_name.group_alias is not None:
alias = task_or_group_name.group_alias
else:
_task_hierarchy = {
**{task_name: []},
**task_hierarchy,
}
_results_agg, _groups_agg = prepare_print_tasks(
_task_hierarchy, results, tab + 1
alias = task_or_group_name.group
else:
if "alias" in task_agg[name]:
alias = task_agg[name]["alias"]
else:
alias = name
task_agg[name]["alias"] = tab_string + alias
if "samples" in task_agg[name]:
task_agg[name].pop("samples")
if from_configurable_group and (" " not in results[name]):
group_tab_string = " " * group_depth + "- " if group_depth > 0 else ""
group_agg[name] = results[name].copy()
group_agg[name]["alias"] = group_tab_string + alias
if "samples" in group_agg[name]:
group_agg[name].pop("samples")
if isinstance(task_or_group_obj, dict):
task_depth += 1
group_depth += 1
_task_agg, _group_agg = prepare_print_tasks(
task_or_group_obj, results, task_depth, group_depth
)
results_agg = {**results_agg, **_results_agg}
groups_agg = {**groups_agg, **_groups_agg}
return results_agg, groups_agg
task_agg = {
**task_agg,
**_task_agg,
}
group_agg = {**group_agg, **_group_agg}
task_depth -= 1
group_depth -= 1
return task_agg, group_agg
def consolidate_results(
......@@ -261,6 +340,8 @@ def consolidate_results(
for task_output in eval_tasks:
if "task_alias" in (task_config := task_output.task_config):
results[task_output.task_name]["alias"] = task_config["task_alias"]
else:
results[task_output.task_name]["alias"] = task_output.task_name
if group_alias := task_output.group_alias:
if group_alias not in results and (group_name := task_output.group_name):
results[group_name]["alias"] = group_alias
......@@ -281,6 +362,147 @@ def consolidate_results(
return results, samples, configs, versions, num_fewshot, higher_is_better
def consolidate_group_results(
results,
versions,
task_dict,
task_root=None,
show_group_table=False,
task_aggregation_list=None,
) -> Tuple[dict, dict, bool, Union[None,]]:
"""
(Recursively) calculates groups' aggregated metrics and updates the results and versions dictionaries with this info.
@return: a tuple [results, versions, show_group_table, task_aggregation_list] with formats described below:
- results: A defaultdict with task names (and, after this function is called, group names of
groups that perform aggregation) as keys, and dictionaries with "alias" and metric,filter_name pairs as keys.
- versions: A defaultdict with task names (and, after this function is called, group names of
groups that perform aggregation) as keys, and float values representing the task or group's version if a version is specified. (defaulting to None).
- show_group_table: a boolean which is true if there exists a group that requires printing of its aggregated scores in a group table.
- task_aggregation_list: a defaultdict listing the subtasks to average over to produce a given group's end metric.
The method then returns the updated results, versions, show_group_table, and task_aggregation_list as a tuple.
In the top-level invocation of this function, task_aggregation_list is ignored.
"""
if task_root is None:
task_root = {}
if task_aggregation_list is None:
task_aggregation_list = {}
for group_or_task, group_or_task_info in task_dict.items():
# Convert to string
if isinstance(group_or_task, ConfigurableGroup):
group_config = group_or_task.config
group_or_task = group_or_task.group_name
else:
group_config = None
if isinstance(group_or_task_info, Task):
if task_root:
task_aggregation_list.setdefault(task_root, []).append(
group_or_task_info.task_name
)
else:
(
results,
versions,
show_group_table,
_task_aggregation_list,
) = consolidate_group_results(
results,
versions,
group_or_task_info,
group_or_task,
show_group_table,
task_aggregation_list,
)
if task_root:
task_aggregation_list.setdefault(task_root, []).extend(
task_aggregation_list.get(group_or_task, [])
)
if (group_config is None) or (
group_config["aggregate_metric_list"] is None
):
results[group_or_task][" "] = " "
continue
if "aggregate_metric_list" in group_config:
agg_metric_list = group_config["aggregate_metric_list"]
show_group_table = show_group_table | bool(
group_config["aggregate_metric_list"]
)
task_list = _task_aggregation_list[group_or_task]
metric_list = list(
{
key
for task in task_list
for key in results[task].keys()
if "_stderr" not in key and key not in ["task", "alias", "samples"]
}
)
for metric in metric_list:
stderr = "_stderr,".join(metric.split(","))
# gather metrics, sizes, and stderrs from subtasks
metrics = [
results[task][metric]
for task in task_list
if metric in results[task]
] # TODO: copy?
stderrs = [
results[task][stderr]
for task in task_list
if stderr in results[task]
]
sizes = [
results[task]["samples"]
for task in task_list
if metric in results[task]
]
for metric_config in agg_metric_list:
for filter_name in metric_config["filter_list"]:
if metric != ",".join([metric_config["metric"], filter_name]):
continue
# compute group's pooled metric and stderr
if metric_config["aggregation"] == "mean":
aggregate_fn = aggregate_subtask_metrics
elif callable(metric_config["aggregation"]):
aggregate_fn = metric_config["aggregation"]
else:
raise ValueError(
f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{metric_config['aggregation']}' for group '{group_or_task}'"
)
results[group_or_task][metric] = aggregate_fn(
metrics,
sizes,
metric_config["weight_by_size"],
)
# TODO: calculate groups' metrics using arbitrary agg fns
if "N/A" in stderrs:
results[group_or_task][stderr] = "N/A"
else:
# NOTE: this assumes we are using the mean to aggregate. There are warnings about this elsewhere
results[group_or_task][stderr] = pooled_sample_stderr(
stderrs, sizes
)
results[group_or_task]["samples"] = sum(sizes)
group_metadata = group_config.get("metadata", None)
if group_metadata is not None:
versions[group_or_task] = group_metadata.get("version", None)
# print(results)
return results, versions, show_group_table, task_aggregation_list
@positional_deprecated
def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
"""
......
......@@ -62,11 +62,8 @@ class WhitespaceFilter(Filter):
def filter_set(inst):
filtered_resp = []
for resp in inst:
if resp.startswith(" "):
resp = resp[1:]
resp = resp.lstrip()
filtered_resp.append(resp)
return filtered_resp
filtered_resps = [filter_set(resp) for resp in resps]
......
import json
import os
import re
import time
from collections import defaultdict
......@@ -14,6 +15,7 @@ from huggingface_hub import (
HfApi,
hf_hub_url,
)
from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
from lm_eval.utils import (
eval_logger,
......@@ -112,12 +114,15 @@ class EvaluationTracker:
output_path: str = None,
hub_results_org: str = "",
hub_repo_name: str = "",
details_repo_name: str = "",
results_repo_name: str = "",
push_results_to_hub: bool = False,
push_samples_to_hub: bool = False,
public_repo: bool = False,
token: str = "",
leaderboard_url: str = "",
point_of_contact: str = "",
gated: bool = False,
) -> None:
"""
Creates all the necessary loggers for evaluation tracking.
......@@ -126,12 +131,15 @@ class EvaluationTracker:
output_path (str): Path to save the results. If not provided, the results won't be saved.
hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token.
hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
details_repo_name (str): The name of the Hugging Face repository to push the details to. If not provided, the results will be pushed to `lm-eval-results`.
result_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will not be pushed and will be found in the details_hub_repo.
push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
public_repo (bool): Whether to push the results to a public or private repository.
token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card.
point_of_contact (str): Contact information on the Hugging Face hub dataset card.
gated (bool): Whether to gate the repository.
"""
self.general_config_tracker = GeneralConfigTracker()
......@@ -142,6 +150,7 @@ class EvaluationTracker:
self.leaderboard_url = leaderboard_url
self.point_of_contact = point_of_contact
self.api = HfApi(token=token) if token else None
self.gated_repo = gated
if not self.api and (push_results_to_hub or push_samples_to_hub):
raise ValueError(
......@@ -159,9 +168,24 @@ class EvaluationTracker:
f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'."
)
hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results"
self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}"
self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private"
if hub_repo_name == "":
details_repo_name = (
details_repo_name if details_repo_name != "" else "lm-eval-results"
)
results_repo_name = (
results_repo_name if results_repo_name != "" else details_repo_name
)
else:
details_repo_name = hub_repo_name
results_repo_name = hub_repo_name
eval_logger.warning(
"hub_repo_name was specified. Both details and results will be pushed to the same repository. Using hub_repo_name is no longer recommended, details_repo_name and results_repo_name should be used instead."
)
self.details_repo = f"{hub_results_org}/{details_repo_name}"
self.details_repo_private = f"{hub_results_org}/{details_repo_name}-private"
self.results_repo = f"{hub_results_org}/{results_repo_name}"
self.results_repo_private = f"{hub_results_org}/{results_repo_name}-private"
def save_results_aggregated(
self,
......@@ -211,9 +235,9 @@ class EvaluationTracker:
if self.api and self.push_results_to_hub:
repo_id = (
self.hub_results_repo
self.results_repo
if self.public_repo
else self.hub_results_repo_private
else self.results_repo_private
)
self.api.create_repo(
repo_id=repo_id,
......@@ -221,10 +245,15 @@ class EvaluationTracker:
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_folder(
self.api.upload_file(
repo_id=repo_id,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
path_or_fileobj=str(
path.joinpath(f"results_{self.date_id}.json")
),
path_in_repo=os.path.join(
self.general_config_tracker.model_name,
f"results_{self.date_id}.json",
),
repo_type="dataset",
commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
)
......@@ -278,6 +307,7 @@ class EvaluationTracker:
sample["resps"] = sanitize_list(sample["resps"])
sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
sample["arguments"] = arguments
sample["target"] = str(sample["target"])
sample_dump = (
json.dumps(
......@@ -288,14 +318,14 @@ class EvaluationTracker:
+ "\n"
)
with open(file_results_samples, "a") as f:
with open(file_results_samples, "a", encoding="utf-8") as f:
f.write(sample_dump)
if self.api and self.push_samples_to_hub:
repo_id = (
self.hub_results_repo
self.details_repo
if self.public_repo
else self.hub_results_repo_private
else self.details_repo_private
)
self.api.create_repo(
repo_id=repo_id,
......@@ -303,6 +333,18 @@ class EvaluationTracker:
private=not self.public_repo,
exist_ok=True,
)
try:
if self.gated_repo:
headers = build_hf_headers()
r = get_session().put(
url=f"https://huggingface.co/api/datasets/{repo_id}/settings",
headers=headers,
json={"gated": "auto"},
)
hf_raise_for_status(r)
except Exception as e:
eval_logger.warning("Could not gate the repository")
eval_logger.info(repr(e))
self.api.upload_folder(
repo_id=repo_id,
folder_path=str(path),
......@@ -327,9 +369,7 @@ class EvaluationTracker:
"""
eval_logger.info("Recreating metadata card")
repo_id = (
self.hub_results_repo if self.public_repo else self.hub_results_repo_private
)
repo_id = self.details_repo if self.public_repo else self.details_repo_private
files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
results_files = get_results_filenames(files_in_repo)
......@@ -360,7 +400,10 @@ class EvaluationTracker:
results_datetime,
)
latest_task_results_datetime[samples_key] = latest_datetime
latest_task_results_datetime[results_key] = latest_datetime
latest_task_results_datetime[results_key] = max(
latest_task_results_datetime[results_key],
latest_datetime,
)
# Create metadata card
card_metadata = MetadataConfigs()
......@@ -377,14 +420,15 @@ class EvaluationTracker:
sanitized_last_eval_date_results = re.sub(
r"[^\w\.]", "_", latest_task_results_datetime[config_name]
)
# Ensure that all results files are listed in the metadata card
current_results = card_metadata.get(config_name, {"data_files": []})
current_results["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_results
# If the results file is the newest, update the "latest" field in the metadata card
if eval_date_sanitized == sanitized_last_eval_date_results:
# Ensure that all results files are listed in the metadata card
current_results = card_metadata.get(config_name, {"data_files": []})
current_results["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_results
# If the results file is the newest, update the "latest" field in the metadata card
card_metadata[config_name]["data_files"].append(
{"split": "latest", "path": [str(results_filename)]}
)
......@@ -403,65 +447,20 @@ class EvaluationTracker:
sanitized_last_eval_date_results = re.sub(
r"[^\w\.]", "_", latest_task_results_datetime[config_name]
)
# Ensure that all sample results files are listed in the metadata card
current_details_for_task = card_metadata.get(
config_name, {"data_files": []}
)
current_details_for_task["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_details_for_task
# If the samples results file is the newest, update the "latest" field in the metadata card
if eval_date_sanitized == sanitized_last_eval_date_results:
# Ensure that all sample results files are listed in the metadata card
current_details_for_task = card_metadata.get(
config_name, {"data_files": []}
)
current_details_for_task["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_details_for_task
# If the samples results file is the newest, update the "latest" field in the metadata card
card_metadata[config_name]["data_files"].append(
{"split": "latest", "path": [str(results_filename)]}
)
# Special case for MMLU with a single split covering it all
# We add another config with all MMLU splits results together for easy inspection
SPECIAL_TASKS = ["mmlu", "gpqa", "minerva_math"]
for special_task in SPECIAL_TASKS:
if special_task in config_name:
special_task = f"{model_name}__{special_task}"
former_entry = card_metadata.get(special_task, {"data_files": []})
former_split = [
(i, entry)
for i, entry in enumerate(former_entry["data_files"])
if entry.get("split", None) == eval_date_sanitized
]
if len(former_split) == 0:
former_entry["data_files"].append(
{
"split": eval_date_sanitized,
"path": [str(results_filename)],
}
)
else:
split_index, _ = former_split[0]
former_entry["data_files"][split_index]["path"].append(
str(results_filename)
)
if eval_date_sanitized == sanitized_last_eval_date_results:
latest_split = [
(i, entry)
for i, entry in enumerate(former_entry["data_files"])
if entry.get("split", None) == "latest"
]
if len(latest_split) == 0:
former_entry["data_files"].append(
{"split": "latest", "path": [str(results_filename)]}
)
else:
latest_index, _ = latest_split[0]
former_entry["data_files"][latest_index]["path"].append(
str(results_filename)
)
card_metadata[special_task] = former_entry
# Get latest results and extract info to update metadata card examples
latest_datetime = max(latest_task_results_datetime.values())
latest_model_name = max(
......
......@@ -114,15 +114,29 @@ def add_env_info(storage: Dict[str, Any]):
def add_tokenizer_info(storage: Dict[str, Any], lm):
if getattr(lm, "tokenizer", False):
tokenizer_info = {
"tokenizer_pad_token": [lm.tokenizer.pad_token, lm.tokenizer.pad_token_id],
"tokenizer_eos_token": [lm.tokenizer.eos_token, lm.tokenizer.eos_token_id],
"tokenizer_bos_token": [lm.tokenizer.bos_token, lm.tokenizer.bos_token_id],
"eot_token_id": getattr(lm, "eot_token_id", None),
"max_length": getattr(lm, "max_length", None),
}
storage.update(tokenizer_info)
# seems gguf and textsynth do not have tokenizer
try:
tokenizer_info = {
"tokenizer_pad_token": [
lm.tokenizer.pad_token,
str(lm.tokenizer.pad_token_id),
],
"tokenizer_eos_token": [
lm.tokenizer.eos_token,
str(lm.tokenizer.eos_token_id),
],
"tokenizer_bos_token": [
lm.tokenizer.bos_token,
str(lm.tokenizer.bos_token_id),
],
"eot_token_id": getattr(lm, "eot_token_id", None),
"max_length": getattr(lm, "max_length", None),
}
storage.update(tokenizer_info)
except Exception as err:
logger.debug(
f"Logging detailed tokenizer info failed with {err}, skipping..."
)
# seems gguf and textsynth do not have tokenizer
else:
logger.debug(
"LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."
......
from . import (
anthropic_llms,
api_models,
dummy,
gguf,
hf_vlms,
huggingface,
ibm_watsonx_ai,
mamba_lm,
nemo_lm,
neuralmagic,
......@@ -11,6 +14,7 @@ from . import (
optimum_lm,
textsynth,
vllm_causallms,
vllm_vlms,
)
......
from typing import Any, List, Tuple
import os
from functools import cached_property
from typing import Any, Dict, List, Tuple, Union
from tqdm import tqdm
from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.openai_completions import LocalCompletionsAPI
from lm_eval.models.utils import retry_on_specific_exceptions
......@@ -42,8 +45,8 @@ def anthropic_completion(
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
......@@ -105,8 +108,8 @@ def anthropic_chat(
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
......@@ -138,7 +141,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
return messages()
@register_model("anthropic")
@register_model("anthropic-completions")
class AnthropicLM(LM):
REQ_CHUNK_SIZE = 20 # TODO: not used
......@@ -165,8 +168,8 @@ class AnthropicLM(LM):
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
......@@ -214,8 +217,8 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
......@@ -271,90 +274,89 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
@register_model("anthropic-chat", "anthropic-chat-completions")
class AnthropicChatLM(AnthropicLM):
REQ_CHUNK_SIZE = 20 # TODO: not used
class AnthropicChat(LocalCompletionsAPI):
def __init__(
self,
model: str,
batch_size: int = 1,
max_tokens: int = 256,
temperature: float = 0, # defaults to 1
**kwargs, # top_p, top_k, etc.
) -> None:
"""Anthropic API wrapper.
:param model: str
Anthropic model e.g. 'claude-3-opus-20240229', 'claude-3-sonnet-20240229'
:param max_tokens: int
Maximum number of tokens to sample from the model
:param temperature: float
Sampling temperature
:param kwargs: Any
Additional model_args to pass to the API client
"""
super().__init__()
base_url="https://api.anthropic.com/v1/messages",
tokenizer_backend=None,
**kwargs,
):
super().__init__(
base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
)
eval_logger.warning(
"Chat completions does not support batching. Defaulting to batch size 1."
)
self._batch_size = 1
self.anthropic_version = "2023-06-01"
eval_logger.warning(
f"Using Anthropic Version: {self.anthropic_version}. Confirm the current version here: https://docs.anthropic.com/en/api/versioning"
)
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
@cached_property
def api_key(self):
"""Override this property to return the API key for the API request."""
key = os.environ.get("ANTHROPIC_API_KEY", None)
if key is None:
raise ValueError(
"API key not found. Please set the ANTHROPIC_API_KEY environment variable."
)
self.model = model
# defaults to os.environ.get("ANTHROPIC_API_KEY")
self.client = anthropic.Anthropic()
self.temperature = temperature
self.max_tokens = max_tokens
self.tokenizer = self.client.get_tokenizer()
self.kwargs = kwargs
@property
def max_gen_toks(self) -> int:
return self.max_tokens
def generate_until(self, requests) -> List[str]:
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
if not requests:
return []
_requests: List[Tuple[str, dict]] = [req.args for req in requests]
return key
@cached_property
def header(self):
return {
"x-api-key": f"{self.api_key}",
"anthropic-version": self.anthropic_version,
}
def _create_payload(
self, messages: List[Dict], generate=True, gen_kwargs: dict = None, **kwargs
) -> dict:
system = (
messages[0].get("content") if messages[0].get("role") == "system" else None
)
if system:
messages = messages[1:]
gen_kwargs.pop("do_sample", False)
max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
temperature = gen_kwargs.pop("temperature", 0)
stop = gen_kwargs.pop("until", ["\n\nHuman:"])
if not isinstance(stop, list):
stop = [stop]
out = {
"messages": messages,
"model": self.model,
"max_tokens": max_tokens,
"temperature": temperature,
"stop_sequences": stop,
**gen_kwargs,
}
if system:
out["system"] = system
return out
def parse_generations(
self, outputs: Union[Dict, List[Dict]], **kwargs
) -> List[str]:
res = []
for request in tqdm(_requests):
try:
inp = request[0]
request_args = request[1]
# generation_kwargs
until = request_args.get("until")
max_tokens = request_args.get("max_gen_toks", self.max_length)
temperature = request_args.get("temperature", self.temperature)
response = anthropic_chat(
client=self.client,
model=self.model,
prompt=inp,
max_tokens=max_tokens,
temperature=temperature, # TODO: implement non-greedy sampling for Anthropic
stop=until, # type: ignore
**self.kwargs,
)
res.append(response)
self.cache_hook.add_partial("generate_until", request, response)
except anthropic.APIConnectionError as e: # type: ignore # noqa: F821
eval_logger.critical(f"Server unreachable: {e.__cause__}")
break
except anthropic.APIStatusError as e: # type: ignore # noqa: F821
eval_logger.critical(f"API error {e.status_code}: {e.message}")
break
if not isinstance(outputs, list):
outputs = [outputs]
for out in outputs:
for choices in out["content"]:
res.append(choices["text"])
return res
def tok_encode(
self,
string: str,
left_truncate_len=None,
add_special_tokens=None,
**kwargs,
) -> List[str]:
return [string]
def loglikelihood(self, requests, **kwargs):
raise NotImplementedError(
"Anthropic Chat Completions API does not support the return of loglikelihood"
)
import abc
import asyncio
import copy
import itertools
import json
from functools import cached_property
from typing import (
Any,
Awaitable,
Callable,
Dict,
Iterable,
List,
Literal,
NamedTuple,
Optional,
Tuple,
Union,
)
try:
import requests
from aiohttp import ClientSession, TCPConnector
from tenacity import RetryError, retry, stop_after_attempt, wait_exponential
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio
except ModuleNotFoundError:
pass
from importlib.util import find_spec
from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM
from lm_eval.models.utils import Collator, chunks, configure_pad_token
LogLikelihoodInputs = Tuple[Tuple[str, str], List[int], List[int]]
# utility class to keep track of json encoded chats
class JsonChatStr(NamedTuple):
prompt: str
def encode(self, encoding):
return self.prompt.encode(encoding)
eval_logger = utils.eval_logger
class TemplateAPI(TemplateLM):
def __init__(
self,
model: str = None,
pretrained: str = None, # `model` takes precedence over `pretrained` when passed.
base_url: str = None,
tokenizer: Optional[str] = None,
# Loglikelihood tasks require a tokenizer to calculate context lengths,
# however the requests can be sent as a string if the API doesn't support token inputs.
# use tokenized_requests=False
tokenizer_backend: Optional[
Literal["tiktoken", "huggingface", None]
] = "huggingface",
truncate: bool = False,
# number of concurrent requests. More useful if not batching
num_concurrent: int = 1,
max_retries: int = 3,
max_gen_toks: int = 256,
batch_size: Union[str, int] = 1,
seed: int = 1234,
max_length: Optional[int] = 2048,
add_bos_token: bool = False,
custom_prefix_token_id: int = None,
# send the requests as tokens or strings
tokenized_requests: bool = True,
trust_remote_code: bool = False,
revision: Optional[str] = "main",
use_fast_tokenizer: bool = True,
verify_certificate: bool = True,
**kwargs,
) -> None:
super().__init__()
missing_packages = [
pkg
for pkg in ["aiohttp", "tqdm", "tenacity", "requests"]
if find_spec(pkg) is None
]
if missing_packages:
raise ModuleNotFoundError(
f"Attempted to use an API model, but the required packages {missing_packages} are not installed. "
'Please install these via `pip install lm-eval[api]` or `pip install -e ."[api]"`'
)
self.model = model or pretrained
self.base_url = base_url
self.tokenizer = tokenizer
if not isinstance(batch_size, int) and "auto" in batch_size:
eval_logger.warning(
"Automatic batch size is not supported for API models. Defaulting to batch size 1."
)
elif int(batch_size) > 1:
eval_logger.warning(
"Batch size > 1 detected. Ensure your API supports batched requests with varying total sequence lengths."
)
self._batch_size = int(batch_size) if batch_size != "auto" else 1
self._truncate = truncate
self._max_gen_toks = int(max_gen_toks)
self._seed = int(seed)
# max_length - 1 as we always have 1 token for generation
eval_logger.info(f"Using max length {max_length} - 1")
self.max_length = max_length - 1
if int(num_concurrent) <= 1:
eval_logger.info(
"Concurrent requests are disabled. To enable concurrent requests, set `num_concurrent` > 1."
)
self._concurrent = int(num_concurrent)
self.tokenizer_backend = tokenizer_backend
self.add_bos_token = add_bos_token
self.custom_prefix_token_id = custom_prefix_token_id
self.tokenized_requests = tokenized_requests
self.max_retries = int(max_retries)
self.verify_certificate = verify_certificate
eval_logger.info(f"Using tokenizer {self.tokenizer_backend}")
if self.tokenizer_backend is None:
self.tokenizer = None
self.tokenized_requests = False
else:
if self.tokenizer is None:
if self.tokenizer_backend == "huggingface":
import transformers
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
self.tokenizer if self.tokenizer else self.model,
trust_remote_code=trust_remote_code,
revision=revision,
use_fast=use_fast_tokenizer,
)
# Not used as the API will handle padding but to mirror the behavior of the HFLM
self.tokenizer = configure_pad_token(self.tokenizer)
elif self.tokenizer_backend == "tiktoken":
try:
import tiktoken
self.tokenizer = tiktoken.encoding_for_model(self.model)
except ModuleNotFoundError as e:
raise ModuleNotFoundError(
"Attempted to use 'openai' LM type, but the package `tiktoken` is not installed. "
"Please install it via `pip install lm-eval[api]` or `pip install -e .[api]`."
) from e
if "openai" not in self.base_url:
eval_logger.warning(
f"Passed `base_url={self.base_url}` but using (OpenAI) Tiktoken tokenizer backend. "
"Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken."
)
else:
import transformers
assert isinstance(tokenizer, str), "tokenizer must be a string"
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
tokenizer,
trust_remote_code=trust_remote_code,
revision=revision,
use_fast=use_fast_tokenizer,
)
@abc.abstractmethod
def _create_payload(
self,
messages: Union[List[List[int]], List[dict], List[str], str],
*,
generate: bool = True,
gen_kwargs: Optional[dict] = None,
seed: int = 1234,
**kwargs,
) -> dict:
"""This method is responsible for creating the json payload that will be sent to the API."""
raise NotImplementedError
def create_message(
self,
messages: Union[List[List[int]], List[str], List[JsonChatStr]],
generate=False,
) -> Union[List[List[int]], List[dict], List[str], str]:
"""Helper method to transform the prompt into the expected API input format. messages consist of batched requests"""
if isinstance(messages[0], JsonChatStr):
# for chat completions we need to decode the json string to list[dict,...]
assert (
self._batch_size == 1
), "non-tokenized chat requests are only supported with batch_size=1"
# list[dict["role":..., "content":...],...]
return json.loads(messages[0].prompt)
if not self.tokenized_requests:
# if messages are tokenized:
if isinstance(messages[0][0], int):
# assuming decoding is lossless. However, this is only for loglikelihood requests
# as we need to compute the context length. For generations, we don't need to tokenize.
messages = self.decode_batch(messages)
if self._batch_size <= 1:
# if batch is 1 return str
return messages[0]
else:
# list[str,...]
return messages
# list[list[int], ...]
return messages
@staticmethod
@abc.abstractmethod
def parse_logprobs(
outputs: Union[Any, List[Any]],
tokens: List[List[int]] = None,
ctxlen: List[int] = None,
**kwargs,
) -> List[Tuple[float, bool]]:
"""Method used to parse the logprobs from the (batched) API response. This method should return a list of tuples"""
raise NotImplementedError
@staticmethod
@abc.abstractmethod
def parse_generations(outputs: Union[Any, List[Any]], **kwargs) -> List[str]:
"""Method used to parse the generations from the (batched) API response. This method should return a list of str"""
raise NotImplementedError
@cached_property
def api_key(self) -> str:
"""Override this property to return the API key for the API request."""
return ""
@cached_property
def header(self) -> dict:
"""Override this property to return the headers for the API request."""
return {"Authorization": f"Bearer {self.api_key}"}
@property
def tokenizer_name(self) -> str:
"""Must be defined for LM subclasses which implement Chat Templating.
Should return the name of the tokenizer or chat template used.
Used only to properly fingerprint caches when requests are being cached with `--cache_requests`, otherwise not used.
"""
return ""
def apply_chat_template(
self, chat_history: List[Dict[str, str]]
) -> Union[str, JsonChatStr]:
"""Applies a chat template to a list of chat history between user and model."""
if self.tokenizer_backend == "huggingface" and self.tokenized_requests:
return self.tokenizer.apply_chat_template(
chat_history, tokenize=False, add_generation_prompt=True
)
else:
# bit of a hack. We'll load back before sending to the API
return JsonChatStr(json.dumps(chat_history))
@cached_property
def eot_token_id(self) -> Optional[int]:
if self.tokenizer is None:
return None
else:
if self.tokenizer_backend == "huggingface":
return self.tokenizer.eos_token_id
elif self.tokenizer_backend == "tiktoken":
return self.tokenizer.eot_token
@cached_property
def prefix_token_id(self) -> Optional[int]:
if self.tokenizer is None:
return None
else:
if self.custom_prefix_token_id is not None:
return self.custom_prefix_token_id
if self.tokenizer_backend == "huggingface":
if self.tokenizer.bos_token_id is not None:
return self.tokenizer.bos_token_id
return self.tokenizer.eos_token_id
else:
return self.tokenizer.eot_token
def tok_encode(
self,
string: str,
left_truncate_len: int = None,
add_special_tokens: bool = False,
truncation: bool = False,
**kwargs,
) -> Union[List[List[int]], List[int], List[str]]:
if self.tokenizer_backend is None:
return [string]
elif self.tokenizer_backend == "huggingface":
# by default for CausalLM - false or self.add_bos_token is set
if not add_special_tokens:
add_special_tokens = False or self.add_bos_token
encoding: Union[List[List[int]], List[int]] = self.tokenizer(
string,
add_special_tokens=add_special_tokens,
truncation=truncation,
return_attention_mask=False,
).input_ids
# left-truncate the encoded context to be at most `left_truncate_len` tokens long
if left_truncate_len:
if not isinstance(string, str):
encoding = [enc[-left_truncate_len:] for enc in encoding]
else:
encoding = encoding[-left_truncate_len:]
return encoding
else:
try:
encoding = self.tokenizer.encode(string)
except Exception:
encoding = self.tokenizer.encode_batch(string)
return encoding
def decode_batch(self, tokens: List[List[int]]) -> List[str]:
if self.tokenizer_backend == "huggingface":
return self.tokenizer.batch_decode(tokens)
elif self.tokenizer_backend == "tiktoken":
return self.tokenizer.decode_batch(tokens)
def model_call(
self,
messages: Union[List[List[int]], List[str], List[JsonChatStr]],
*,
generate: bool = True,
gen_kwargs: Optional[Dict] = None,
**kwargs,
) -> Optional[dict]:
# !!! Copy: shared dict for each request, need new object !!!
gen_kwargs = copy.deepcopy(gen_kwargs)
try:
response = requests.post(
self.base_url,
json=self._create_payload(
self.create_message(messages),
generate=generate,
gen_kwargs=gen_kwargs,
seed=self._seed,
**kwargs,
),
headers=self.header,
verify=self.verify_certificate,
)
if not response.ok:
eval_logger.warning(
f"API request failed with error message: {response.text}. Retrying..."
)
response.raise_for_status()
return response.json()
except RetryError:
eval_logger.error(
"API request failed after multiple retries. Please check the API status."
)
return None
async def amodel_call(
self,
session: ClientSession,
messages: Union[List[List[int]], List[str], List[JsonChatStr]],
*,
generate: bool = True,
cache_keys: list = None,
ctxlens: Optional[List[int]] = None,
gen_kwargs: Optional[Dict] = None,
**kwargs,
) -> Union[List[str], List[Tuple[float, bool]], None]:
# !!! Copy: shared dict for each request, need new object !!!
gen_kwargs = copy.deepcopy(gen_kwargs)
payload = self._create_payload(
self.create_message(messages),
generate=generate,
gen_kwargs=gen_kwargs,
seed=self._seed,
**kwargs,
)
cache_method = "generate_until" if generate else "loglikelihood"
try:
async with session.post(
self.base_url,
json=payload,
headers=self.header,
) as response:
if not response.ok:
error_text = await response.text()
eval_logger.warning(
f"API request failed with error message: {error_text}. Retrying..."
)
# raising exception will retry the request
response.raise_for_status()
outputs = await response.json()
answers = (
self.parse_generations(
outputs=outputs,
)
if generate
else self.parse_logprobs(
outputs=outputs,
tokens=messages,
ctxlens=ctxlens,
)
)
if cache_keys:
for res, cache in zip(answers, cache_keys):
self.cache_hook.add_partial(cache_method, cache, res)
return answers
# If the retries also fail
except RetryError:
eval_logger.error(
"API request failed after multiple retries. Please check the API status."
)
return None
def batch_loglikelihood_requests(
self, chunks: Iterable[List[LogLikelihoodInputs]]
) -> Tuple[List[List[int]], List[int], List[Tuple[str, str]]]:
inputs = []
ctxlens = []
cache_keys = []
for chunk in chunks:
for cache_key, context_enc, continuation_enc in chunk:
# max_length - 1 as we always have 1 token for generation
inp = (context_enc + continuation_enc)[-(self.max_length) :]
ctxlen = len(context_enc) - max(
0, len(context_enc) + len(continuation_enc) - (self.max_length)
)
inputs.append(inp)
ctxlens.append(ctxlen)
cache_keys.append(cache_key)
return inputs, ctxlens, cache_keys
async def get_batched_requests(
self,
requests: list,
cache_keys: list,
*,
generate: bool = True,
ctxlens: List[int] = None,
**kwargs,
) -> Union[List[List[str]], List[List[Tuple[float, bool]]]]:
ctxlens = ctxlens if ctxlens else [None] * len(requests)
conn = TCPConnector(limit=self._concurrent)
async with ClientSession(connector=conn) as session:
retry_: Callable[..., Awaitable[Any]] = retry(
stop=stop_after_attempt(self.max_retries),
wait=wait_exponential(multiplier=0.5, min=1, max=10),
reraise=True,
)(self.amodel_call)
# Create tasks for each batch of request
tasks = [
asyncio.create_task(
retry_(
session=session,
messages=message,
cache_keys=cache_key,
generate=generate,
ctxlens=ctxlen,
**kwargs,
)
)
for message, cache_key, ctxlen in zip(
chunks(requests, n=self._batch_size),
chunks(cache_keys, n=self._batch_size),
chunks(ctxlens, n=self._batch_size),
)
]
return await tqdm_asyncio.gather(*tasks, desc="Requesting API")
def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
assert (
self.tokenizer is not None
), "Tokenizer is required for loglikelihood tasks to compute context lengths."
res = []
def _collate(req: LogLikelihoodInputs):
"""Defines the key for the sorted method"""
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = req[1] + req[2]
return -len(toks), tuple(toks)
re_ord = Collator(
requests,
sort_fn=_collate,
group_by=None,
)
# if concurrent then we'll batch in the async context
chunked = re_ord.get_batched(n=self._batch_size if self._concurrent <= 1 else 0)
if self._concurrent <= 1:
pbar = tqdm(desc="Requesting API", total=len(requests))
for chunk in chunked:
inputs, ctxlens, cache_keys = self.batch_loglikelihood_requests([chunk])
outputs = retry(
stop=stop_after_attempt(self.max_retries),
wait=wait_exponential(multiplier=0.5, min=1, max=10),
reraise=True,
)(self.model_call)(messages=inputs, generate=False)
if isinstance(outputs, dict):
outputs = [outputs]
for answer_, cache_key in zip(
self.parse_logprobs(
outputs=outputs, tokens=inputs, ctxlens=ctxlens
),
cache_keys,
):
if answer_ is not None:
res.append(answer_)
# cache requests that aren't from a loglikelihood_rolling request
if cache_key is not None:
self.cache_hook.add_partial(
"loglikelihood", cache_key, answer_
)
pbar.update(1)
else:
inputs, ctxlens, cache_keys = self.batch_loglikelihood_requests(chunked)
res = itertools.chain.from_iterable(
asyncio.run(
self.get_batched_requests(
inputs, cache_keys, generate=False, ctxlens=ctxlens
)
)
)
return re_ord.get_original(res)
def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
res = []
def _collate_gen(_requests):
# sort by the length of the non-tokenized contexts
return -len(_requests[0])
# Let the API deal with tokenization
requests, all_gen_kwargs = zip(*(req.args for req in requests))
if self.tokenized_requests:
encodings_list = self.tok_encode(
requests, add_special_tokens=self.add_bos_token
)
else:
encodings_list = [None] * len(requests)
requests = [
(a, b, c) for a, b, c in zip(requests, all_gen_kwargs, encodings_list)
]
re_ord = Collator(
requests,
sort_fn=_collate_gen,
group_by="gen_kwargs",
)
chunked = re_ord.get_batched(
n=self._batch_size if self._concurrent <= 1 else 0, batch_fn=None
)
if self._concurrent <= 1:
pbar = tqdm(desc="Requesting API", total=len(requests))
for chunk in chunked:
contexts, all_gen_kwargs, encodings_list = zip(*chunk)
req = encodings_list if self.tokenized_requests else contexts
outputs = retry(
stop=stop_after_attempt(self.max_retries),
wait=wait_exponential(multiplier=0.5, min=1, max=10),
reraise=True,
)(self.model_call)(
messages=req,
generate=True,
gen_kwargs=copy.deepcopy(all_gen_kwargs[0]),
)
for generated_text, context in zip(
self.parse_generations(
outputs=outputs,
contexts=contexts,
),
contexts,
):
if generated_text is not None:
res.append(generated_text)
# partial caching
if context is not None:
self.cache_hook.add_partial(
"generate_until",
(context, all_gen_kwargs[0]),
generated_text,
)
pbar.update(1)
else:
for chunk in chunked:
contexts, all_gen_kwargs, encodings_list = zip(*chunk)
req = encodings_list if self.tokenized_requests else contexts
results = itertools.chain.from_iterable(
asyncio.run(
self.get_batched_requests(
req,
cache_keys=[(ctx, all_gen_kwargs[0]) for ctx in contexts],
generate=True,
gen_kwargs=copy.deepcopy(all_gen_kwargs[0]),
)
)
)
res.extend(results)
return re_ord.get_original(res)
def loglikelihood_rolling(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[float]:
loglikelihoods = []
for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.prefix_token_id,
# max_seq_len - (1 for context)
max_seq_len=self.max_length - 1,
context_len=1,
),
)
)
# TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
string_nll = self._loglikelihood_tokens(
rolling_token_windows,
disable_tqdm=True,
)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
# cache this loglikelihood_rolling request
self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
return loglikelihoods
......@@ -26,9 +26,9 @@ class DummyLM(LM):
def generate_until(self, requests, disable_tqdm: bool = False):
res = []
for ctx, _ in tqdm(requests, disable=disable_tqdm):
for request in tqdm(requests, disable=disable_tqdm):
res.append("lol")
assert ctx.strip() != ""
assert request.arguments[0].strip() != ""
return res
......
......@@ -68,7 +68,9 @@ class GGUFLM(LM):
logger.error(f"RequestException: {e}")
time.sleep(delay) # wait before retrying
else:
raise Exception(f"Failed to get a valid response after {retries} retries.")
raise RuntimeError(
f"Failed to get a valid response after {retries} retries."
)
def loglikelihood(self, requests, disable_tqdm: bool = False):
if not requests:
......
import copy
from typing import Dict, List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import transformers
from tqdm import tqdm
from transformers import BatchEncoding
from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
from lm_eval.models.utils import (
Collator,
flatten_image_list,
pad_and_concat,
replace_placeholders,
stop_sequences_criteria,
)
DEFAULT_IMAGE_PLACEHOLDER = "<image>"
eval_logger = utils.eval_logger
@register_model("hf-multimodal")
class HFMultimodalLM(HFLM):
"""
An abstracted Hugging Face model class for multimodal LMs like Llava and Idefics.
"""
AUTO_MODEL_CLASS = transformers.AutoModelForVision2Seq
MULTIMODAL = True # flag to indicate, for now, that this model type can run multimodal requests
def __init__(
self,
pretrained: Union[str, transformers.PreTrainedModel],
image_token_id: Optional[int] = None,
image_string: Optional[str] = None,
interleave: bool = True,
# TODO: handle whitespace in image placeholder (replacement)
max_images: Optional[int] = 999,
convert_img_format=False,
**kwargs,
):
# We initialize using HFLM's init. Sub-methods like _create_model and _create_tokenizer
# modify init behavior.
super().__init__(pretrained, **kwargs)
assert (
self.batch_size != "auto"
), "Batch size 'auto' is not yet supported for hf-multimodal models."
self.chat_applied: bool = False
# TODO: phi-3.5 "image placeholders" are <image_1>, <image_2>, ... in order. how to handle this case
# HF AutoModelForVision2Seq models have an `image_token_id` value in their configs
# denoting the token which indicates a location where an image will be substituted in.
# This can take different string values across models, e.g. <image> for Idefics2 and <|image_pad|> for Qwen2-VL
self.interleave = interleave
self.max_images = max_images
self.rgb = convert_img_format
# WARNING: improperly set image_token_id can lead to ignored image input or other (potentially silent) errors!
if not image_string:
self.image_token_id = (
int(image_token_id)
if image_token_id
else (
getattr(self.config, "image_token_id", None)
or getattr(self.config, "image_token_index", None)
)
)
assert (
self.image_token_id is not None
), "Must have a non-None image_token_id to evaluate a Hugging Face AutoModelForVision2Seq model. Please pass `image_token_id` in `--model_args` if model's config does not already specify one."
# get the string this token ID corresponds to
self.image_token = self.tok_decode(
[self.image_token_id], skip_special_tokens=False
)
if image_token_id is not None:
eval_logger.info(
f"A non-default image_token_id with image_token_id={self.image_token_id} and string value '{self.image_token}' was specified manually. Note that using an improper image_token placeholder may lead to ignored image input or errors!"
)
else:
eval_logger.info(
f"A non-default image_token string with string value image_string='{image_string}' was specified manually. Note that using an improper image_token placeholder may lead to ignored image input or errors!"
)
self.image_token = image_string
def _create_tokenizer(
self,
pretrained: Union[str, transformers.PreTrainedModel],
tokenizer: Optional[
Union[
str,
transformers.ProcessorMixin,
]
],
revision: Optional[str] = "main",
trust_remote_code: Optional[bool] = False,
**kwargs,
) -> None:
"""
Helper method during initialization.
For the multimodal variant, we initialize not just
`self.tokenizer` but also `self.processor`.
"""
if tokenizer:
if isinstance(tokenizer, str):
return transformers.AutoProcessor.from_pretrained(
tokenizer,
revision=revision,
trust_remote_code=trust_remote_code,
# use_fast=use_fast_tokenizer,
)
else:
assert isinstance(
tokenizer, transformers.ProcessorMixin
) # TODO: check this condition
return tokenizer
# Get tokenizer based on 'pretrained'
if isinstance(pretrained, str):
model_name = pretrained
else:
# get the HF hub name via accessor on model
model_name = self.model.name_or_path
self.processor = transformers.AutoProcessor.from_pretrained(
model_name,
revision=revision,
trust_remote_code=trust_remote_code,
# use_fast=use_fast_tokenizer,
)
self.tokenizer = self.processor.tokenizer
def tok_multimodal_encode(
self, string, images, left_truncate_len=None, add_special_tokens=None
):
"""Helper function which encodes an image + string combo using AutoProcessor"""
# We inherit special token kwarg setup from HFLM.tok_encode
# special_tokens_kwargs = {}
# by default for CausalLM - false or self.add_bos_token is set
# if add_special_tokens is None:
# special_tokens_kwargs = {"add_special_tokens": False or self.add_bos_token}
# otherwise the method explicitly defines the value
# else:
# special_tokens_kwargs = {"add_special_tokens": add_special_tokens}
# encode text+images
# TODO: why does (Qwen2-VL) processor error when attempting to add special tokens to text?
encoding = self.processor(
text=string, images=images, return_tensors=None
) # , **special_tokens_kwargs)
# remove (and store) our tokenized text
text_encoding = encoding.pop("input_ids")
encoding.pop("attention_mask")
# left-truncate the encoded context to be at most `left_truncate_len` tokens long
if left_truncate_len:
text_encoding = text_encoding[-left_truncate_len:]
return text_encoding, encoding # image_encoding is a dict
def _encode_multimodal_pair(self, context, continuation, images):
"""Helper function to perform the role of TemplateLM._encode_pair
Except allowing for image input to also be processed alongside `context`.
This method is a bit messy due to the need to defer conversion of image and text token input
into PyTorch tensors until the main inference loop.
"""
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
# TODO: replace default <image> placeholder with self.image_token, for contexts
whole_enc, image_enc = self.tok_multimodal_encode(
context + continuation, images
)
context_enc, _ = self.tok_multimodal_encode(context, images)
# tok_multimodal_encode returns List[List[int]] for tokenized text. Get rid of the batch dim
# since we only are encoding a single string.
# TODO: this is a bit hacky, it'd be nice to make this generally cleaner
whole_enc, context_enc = whole_enc[0], context_enc[0]
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc, image_enc
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
self.chat_applied = True
if not self.interleave:
for content in chat_history:
c = []
text = content["content"]
# Count and remove image placeholders
image_count = min(
self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER)
)
text = text.replace(DEFAULT_IMAGE_PLACEHOLDER, "")
# Add image entries
for _ in range(image_count):
c.append({"type": "image", "image": None})
# Add single text entry at the end
c.append({"type": "text", "text": text})
content["content"] = c
else:
for content in chat_history:
c = []
text = content["content"]
expected_image_count = min(
self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER)
)
actual_image_count = 0
text_parts = text.split(DEFAULT_IMAGE_PLACEHOLDER)
for i, part in enumerate(text_parts):
# TODO: concatenate text parts (esp. if skipping images)?
if part: # Add non-empty text parts
c.append({"type": "text", "text": part})
if (
(i < len(text_parts) - 1) and i < self.max_images
): # Add image placeholder after each split except the last
c.append({"type": "image"})
actual_image_count += 1
content["content"] = c
if actual_image_count != expected_image_count:
raise ValueError(
f"Mismatch in image placeholder count. Expected: {expected_image_count}, Actual: {actual_image_count}"
)
return self.processor.apply_chat_template(
chat_history, add_generation_prompt=True
)
def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
if hasattr(self.processor, "apply_chat_template"):
_tokenizer = self.tokenizer
self.tokenizer = self.processor
selected_template = super().chat_template(chat_template)
self.tokenizer = _tokenizer
return selected_template
else:
return super().chat_template(chat_template)
def tok_batch_multimodal_encode(
self,
strings: List[str], # note that input signature of this fn is different
images: List[List], # TODO: images are pil.Image at the moment, update typehint
padding_side: str = "left",
left_truncate_len: int = None,
truncation: bool = False,
) -> Union[
BatchEncoding, Dict[str, torch.Tensor]
]: # note that this return signature differs from HFLM tok_batch_encode.
# NOTE: here, we replace <image> tags with our model's corresponding image_token string value.
if not self.chat_applied:
# TODO<baber>: This still keeps the whitespace in the image placeholder, which is not ideal.
strings = [
replace_placeholders(
string, DEFAULT_IMAGE_PLACEHOLDER, self.image_token, self.max_images
)
for string in strings
]
# encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
old_padding_side = self.tokenizer.padding_side
self.tokenizer.padding_side = padding_side
# add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
images = [img[: self.max_images] for img in images]
if self.rgb:
images = [[img.convert("RGB") for img in sublist] for sublist in images]
# certain models like llava expect a single-level image list even for bs>1, multi-image. TODO: port this over to loglikelihoods
if getattr(self.config, "model_type", "") == "llava":
images = flatten_image_list(images)
encoding = self.processor(
images=images,
text=strings,
truncation=truncation,
padding="longest",
return_tensors="pt",
# **add_special_tokens, # TODO: at least some Processors error out when passing this. How do we control whether text gets BOS added?
)
encoding.to( # TODO: our other tokenization methods in HFLM don't typically move to device. this breaks convention
self.device, self.model.dtype
) # TODO: This only casts the pixel values. Should they always be float16?
if left_truncate_len:
encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
encoding["attention_mask"] = encoding["attention_mask"][
:, -left_truncate_len:
]
self.tokenizer.padding_side = old_padding_side
return encoding
def _model_multimodal_call(self, inps, imgs, attn_mask=None, labels=None):
"""
TODO: update docstring
"""
# note: imgs is a dict.
with torch.no_grad():
return self.model(inps, **imgs).logits
def _model_multimodal_generate(self, inputs, max_length, stop, **generation_kwargs):
generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
do_sample = generation_kwargs.get("do_sample", None)
# The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
generation_kwargs["do_sample"] = do_sample = False
if do_sample is False and generation_kwargs.get("temperature") == 0.0:
generation_kwargs.pop("temperature")
stopping_criteria = stop_sequences_criteria(
self.tokenizer,
stop,
inputs["input_ids"].shape[1],
inputs["input_ids"].shape[0],
)
return self.model.generate(
**inputs,
max_length=max_length,
stopping_criteria=stopping_criteria,
pad_token_id=self.tokenizer.pad_token_id,
use_cache=True,
**generation_kwargs,
)
def _batch_images(self, image_encs):
"""
Helper function: batch together image encodings across examples in a batch.
# TODO: for variable-sized images, this may break down.
"""
batched_imgs = {}
for key in image_encs[0].keys():
batched_imgs[key] = torch.cat(
[
torch.tensor(
image_enc[key], device=self.device, dtype=self.model.dtype
)
for image_enc in image_encs
],
dim=0,
)
return batched_imgs
def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
raise NotImplementedError(
"model type `hf-multimodal` does not support loglikelihood_rolling. Use 'hf' model type for text-only loglikelihood_rolling tasks ",
"this is because we do not support measuring the loglikelihood a model assigns to an image.",
)
def loglikelihood(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[Tuple[float, bool]]:
raise NotImplementedError(
"'loglikelihood' requests for model type `hf-multimodal` are not yet tested. This feature will be enabled when a loglikelihood-based multiple-choice VQA dataset is added!"
)
new_reqs = []
for context, continuation, aux_arguments in [req.args for req in requests]:
if context == "":
raise ValueError(
"Must get non-empty context for multimodal requests! You might be trying to run 'loglikelihood_rolling', which is not supported in the multimodal case."
)
else:
visuals = aux_arguments["visual"]
context_enc, continuation_enc, image_enc = self._encode_multimodal_pair(
context, continuation, visuals
)
# TODO: key to pick for caching images
new_reqs.append(
(
(context, continuation, visuals),
context_enc,
continuation_enc,
image_enc,
)
)
return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm)
def _loglikelihood_tokens(
self,
requests: List[
Tuple[Tuple[None, str, str], List[int], List[int], List[int]]
], # TODO: update typehint to be correct
disable_tqdm: bool = False,
override_bs: int = None,
) -> List[Tuple[float, bool]]:
res = []
# TODO: **improve multimodal collation.** We currently ignore image size when ordering docs. ideally we'd take them into account
def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]):
"""Defines the key for the sorted method"""
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = req[1] + req[2]
return -len(toks), tuple(toks)
def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]):
"""Defines the key to group and lookup one-token continuations"""
# Use with group_by="contexts" (optional)"
# allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
# speeds up some multiple-choice tasks proportionally to the number of choices.
# groups requests by context+continuation[:-1] and infer on one request/group.
return req[-1] + req[-3] + req[-2][:-1]
re_ord = Collator(
requests,
sort_fn=_collate,
group_by="contexts" # TODO: can't group-by just "contexts" any more, need to incorporate imgs
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
and self.logits_cache
else None,
group_fn=_lookup_one_token_cont,
)
# automatic (variable) batch size detection for vectorization
# pull longest context sample from request
n_reordered_requests = len(re_ord)
batch_size = (
self.batch_size
if self.batch_size != "auto"
else override_bs
if override_bs is not None
else 0
)
batch_fn = (
self._batch_scheduler
if self.batch_size == "auto"
and n_reordered_requests > 0
and not override_bs
else None
)
chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn)
pbar = tqdm(
total=len(requests),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running loglikelihood requests with text+image input",
)
for chunk in chunks:
imgs = []
inps = []
cont_toks_list = []
inplens = []
padding_len_inp = None
# because vectorizing is annoying, we first convert each (context, continuation) pair to padded
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
for _, context_enc, continuation_enc, image_enc in chunk:
# sanity check
assert len(image_enc) > 0
assert len(context_enc) > 0
assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length
# how this all works (illustrated on a causal decoder-only setup):
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# model \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the
# cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
# when too long to fit in context, truncate from the left
# TODO: assuming that we won't handle enc-dec Vision2Seq models. Is that a safe assumption?
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
dtype=torch.long,
device=self.device,
)
(inplen,) = inp.shape
padding_len_inp = (
max(padding_len_inp, inplen)
if padding_len_inp is not None
else inplen
)
inps.append(inp) # [1, inp_length]
cont_toks_list.append(continuation_enc)
inplens.append(inplen)
imgs.append(image_enc)
# create encoder attn mask and batched conts, if seq2seq
call_kwargs = {}
batched_inps = pad_and_concat(
padding_len_inp, inps, padding_side="right"
) # [batch, padding_len_inp]
# batch our examples' image inputs together
batched_imgs = self._batch_images(
imgs
) # TODO: fix/test for bs>1 case with differently-sized imgs!
multi_logits = F.log_softmax(
self._model_multimodal_call(batched_inps, batched_imgs, **call_kwargs),
dim=-1,
) # [batch, padding_length (inp or cont), vocab]
for (
request_str,
ctx_tokens,
_,
image_encs,
), logits, inplen, cont_toks in zip(
chunk, multi_logits, inplens, cont_toks_list
):
# Slice to original seq length
contlen = len(cont_toks)
# take only logits in the continuation
# (discard context toks if decoder-only ; discard right-padding)
# also discards + checks for "virtual tokens" in the causal LM's input window
# from prompt/prefix tuning tokens, if applicable
ctx_len = (
inplen + (logits.shape[0] - padding_len_inp)
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
else None
)
logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
logits = logits.unsqueeze(0) # [1, seq, vocab]
# Check if per-token argmax is exactly equal to continuation
greedy_tokens = logits.argmax(dim=-1)
# check for one-token continuation cache hits.
# noop in case group_by != "contexts" or no cache hit and returns the
# original args. Otherwise, expands the logits batch dimension and yields each
# batch along with matching continuation tokens and prompt strings.
# logits -> [1, seq, vocab]
for request_str, cont_toks, logits in re_ord.get_cache(
req_str=request_str,
cxt_toks=ctx_tokens,
cont_toks=cont_toks,
logits=logits,
):
cont_toks = torch.tensor(
cont_toks, dtype=torch.long, device=self.device
).unsqueeze(0) # [1, seq]
max_equal = (greedy_tokens == cont_toks).all()
# Obtain log-probs at the corresponding continuation token indices
# last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
-1
) # [1, seq]
# Answer: (log prob, is-exact-match)
answer = (float(logits.sum()), bool(max_equal))
res.append(answer)
self.cache_hook.add_partial(
"loglikelihood", request_str, answer
) # TODO: choose convention for adding images into the cache key
pbar.update(1)
pbar.close()
return re_ord.get_original(res)
def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
# TODO: back out to HFLM.generate_until() for all requests without aux_arguments (text-only reqs)
res = []
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = self.tok_encode(x[0])
return -len(toks), x[0]
pbar = tqdm(
total=len(requests),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running generate_until requests with text+image input",
)
# TODO: port auto-batch sizing into this.
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
re_ords = Collator(
[reg.args for reg in requests],
_collate,
group_by="gen_kwargs",
group_fn=lambda x: x[1],
)
chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
### Up to here: was identical to non-multimodal HFLM generate_until ###
for chunk in chunks:
contexts, all_gen_kwargs, aux_arguments = zip(*chunk)
visuals = [arg["visual"] for arg in aux_arguments]
if not isinstance(contexts, list):
contexts = list(
contexts
) # for Qwen2-VL, processor is unhappy accepting a tuple of strings instead of a list.
# TODO: could we upstream this workaround to HF?
### this part onward: same as HFLM ###
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
until = None
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
# add EOS token to stop sequences
eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
if not until:
until = [eos]
else:
until.append(eos)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
### end stuff that's entirely copied verbatim from HFLM ###
max_ctx_len = self.max_length - max_gen_toks
inputs = self.tok_batch_multimodal_encode(
contexts,
visuals,
left_truncate_len=max_ctx_len,
truncation=self.truncation,
)
context_enc = inputs["input_ids"]
if "max_length" not in kwargs:
kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
cont = self._model_multimodal_generate(inputs, stop=until, **kwargs)
del inputs
torch.cuda.empty_cache()
import gc
gc.collect()
### essentially same as HFLM beyond this line!
cont_toks_list = cont.tolist()
for cont_toks, context in zip(cont_toks_list, contexts):
# discard context + left-padding toks if using causal decoder-only VLM
cont_toks = cont_toks[context_enc.shape[1] :]
s = self.tok_decode(cont_toks)
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc
for term in until:
if len(term) > 0:
# ignore '' separator,
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
s = s.split(term)[0]
res.append(s)
self.cache_hook.add_partial(
"generate_until", (context, gen_kwargs), s
) # TODO: cache key for multimodal input should be what?
pbar.update(1)
# reorder this group of results back to original unsorted form
res = re_ords.get_original(res)
pbar.close()
return res
......@@ -9,10 +9,10 @@ import torch.nn.functional as F
import transformers
from accelerate import (
Accelerator,
DistributedType,
InitProcessGroupKwargs,
find_executable_batch_size,
)
from accelerate.utils import get_max_memory
from huggingface_hub import HfApi
from packaging import version
from peft import PeftModel
......@@ -30,6 +30,7 @@ from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
Collator,
clear_torch_cache,
configure_pad_token,
get_dtype,
pad_and_concat,
stop_sequences_criteria,
......@@ -39,31 +40,6 @@ from lm_eval.models.utils import (
eval_logger = utils.eval_logger
def _get_accelerate_args(
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
gpus: Optional[int] = None,
) -> dict:
"""Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
max_memory = {}
if max_memory_per_gpu is not None:
max_memory_per_gpu_map = {
device_idx: max_memory_per_gpu for device_idx in range(gpus)
}
max_memory.update(max_memory_per_gpu_map)
if max_cpu_memory is not None:
max_memory["cpu"] = max_cpu_memory
args = {}
if max_memory:
args["max_memory"] = max_memory
args["device_map"] = device_map_option
args["offload_folder"] = offload_folder
return args
@register_model("hf-auto", "hf", "huggingface")
class HFLM(TemplateLM):
"""
......@@ -79,7 +55,7 @@ class HFLM(TemplateLM):
def __init__(
self,
pretrained: Union[str, transformers.PreTrainedModel],
backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
backend: Literal["default", "causal", "seq2seq"] = "default",
# override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
revision: Optional[str] = "main",
subfolder: Optional[str] = None,
......@@ -104,7 +80,6 @@ class HFLM(TemplateLM):
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
parallelize: Optional[bool] = False,
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[Union[str, os.PathLike]] = "./offload",
......@@ -112,10 +87,10 @@ class HFLM(TemplateLM):
peft: Optional[str] = None,
delta: Optional[str] = None,
autogptq: Optional[Union[bool, str]] = False,
gptqmodel: Optional[bool] = False,
**kwargs,
) -> None:
super().__init__()
# optionally: take in an already-initialized transformers.PreTrainedModel
if not isinstance(pretrained, str):
eval_logger.warning(
......@@ -127,21 +102,6 @@ class HFLM(TemplateLM):
self._config = self._model.config
gpus = 0
if tokenizer:
assert isinstance(
tokenizer, transformers.PreTrainedTokenizer
) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
self.tokenizer = tokenizer
else:
# Get tokenizer
model_name = self._model.name_or_path
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
model_name,
revision=revision,
trust_remote_code=trust_remote_code,
use_fast=use_fast_tokenizer,
)
else:
assert isinstance(device, str)
assert isinstance(pretrained, str)
......@@ -156,6 +116,7 @@ class HFLM(TemplateLM):
if "npu" in accelerator.device.type:
gpus = torch.npu.device_count()
# using one process with no model parallelism
if not (parallelize or accelerator.num_processes > 1):
# use user-passed device
device_list = set(
......@@ -181,14 +142,19 @@ class HFLM(TemplateLM):
if torch.cuda.is_available()
else torch.device("cpu")
)
else:
else: # Parallelism managed by accelerate
if device != "cuda":
eval_logger.info(
f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
)
# TODO: include in warning that `load_in_8bit` etc. affect this too
self._device = torch.device(device)
self._device = (
self.accelerator.device
if hasattr(self, "accelerator")
else torch.device(device)
)
revision = str(revision) # cast to string if not already one
# TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "")
......@@ -198,7 +164,7 @@ class HFLM(TemplateLM):
trust_remote_code=trust_remote_code,
)
# determine which of 'causal' and 'seq2seq' backends to use
# determine which of 'causal' and 'seq2seq' backends to use for HF models
self._get_backend(
config=self.config, backend=backend, trust_remote_code=trust_remote_code
)
......@@ -221,13 +187,13 @@ class HFLM(TemplateLM):
trust_remote_code=trust_remote_code,
parallelize=parallelize,
gpus=gpus,
device_map_option=device_map_option,
max_memory_per_gpu=max_memory_per_gpu,
max_cpu_memory=max_cpu_memory,
offload_folder=offload_folder,
peft=peft,
delta=delta,
autogptq=autogptq,
gptqmodel=gptqmodel,
**kwargs,
)
......@@ -236,52 +202,17 @@ class HFLM(TemplateLM):
self.model.eval()
self.model.tie_weights()
if isinstance(pretrained, str) and (gpus >= 1 or str(self.device) == "mps"):
# TODO: can remove this whole snippet except in the mps case, perhaps?
if not (parallelize or autogptq or hasattr(self, "accelerator")):
# place model onto device requested manually,
# if not using HF Accelerate or device_map
# or any other option that preloads model onto device
try:
self.model.to(self.device)
except ValueError:
eval_logger.debug(
"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
)
self.truncation = truncation
self.logits_cache = logits_cache
self.vocab_size = self.tokenizer.vocab_size
# select (or create) a pad token to use
if self.tokenizer.pad_token:
pass
elif self.tokenizer.unk_token:
self.tokenizer.pad_token_id = self.tokenizer.unk_token_id
elif self.tokenizer.eos_token:
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
else:
if getattr(self.config, "model_type", None) == "qwen":
# Qwen's trust_remote_code tokenizer does not allow for adding special tokens
self.tokenizer.pad_token = "<|endoftext|>"
elif (
self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
):
# The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0)
# The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer
# ---
# Note that the world tokenizer class name, might change in the future for the final huggingface merge
# https://github.com/huggingface/transformers/pull/26963
assert self.tokenizer.pad_token_id == 0
else:
self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config)
# TODO: override this for Gemma
self.add_bos_token = add_bos_token
if getattr(self.config, "model_type", None) == "gemma":
if "gemma" in getattr(self.config, "model_type", ""):
self.add_bos_token = True
eval_logger.info(
f"Model type is '{self.config.model_type}', a BOS token will be used as Gemma underperforms without it."
f"Model type is '{self.config.model_type}', part of the Gemma family--a BOS token will be used as Gemma underperforms without it."
)
self._max_length = max_length
......@@ -302,49 +233,46 @@ class HFLM(TemplateLM):
self.batch_size_per_gpu = int(batch_size)
if isinstance(pretrained, str):
if gpus >= 1 or str(self.device) == "mps":
# TODO: can remove this whole snippet except in the mps case, perhaps?
if not (parallelize or autogptq or hasattr(self, "accelerator")):
# place model onto device requested manually,
# if not using HF Accelerate or device_map
# or any other option that preloads model onto device
try:
self.model.to(self.device)
except ValueError:
eval_logger.debug(
"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
)
# multigpu data-parallel support when launched with accelerate
if gpus > 1:
if parallelize:
if accelerator.num_processes > 1:
raise RuntimeError(
"Attempted to use both a HF Accelerate `device_map` and to launch via `accelerate launch`. If this is the case, please either remove `parallelize=True` from --model_args or launch outside of the Accelerate launcher."
if accelerator.num_processes > 1:
if parallelize:
eval_logger.warning(
"You are both using a HF Accelerate `device_map` (`--model_args parallelize=True`) and launching via `accelerate launch`. This will attempt to do model and data parallelism depending on the resources available."
)
else:
pass
elif accelerator.num_processes == 1:
# if we aren't launching via accelerate, ditch
self._rank = 0
self._world_size = 1
else:
if gpus > accelerator.num_processes:
elif gpus > accelerator.num_processes:
eval_logger.warning(
"WARNING: The number of total system GPUs does not match the number of spawned processes. "
"If you would like to use data parallelism, please launch the script "
"with 'accelerate launch *script*'. "
f"Current run will proceed with {accelerator.num_processes} devices."
)
assert (
accelerator.distributed_type
in [
DistributedType.FSDP,
DistributedType.MULTI_GPU,
DistributedType.MULTI_NPU,
]
), "Unsupported distributed type provided. Only DDP and FSDP are supported."
if accelerator.distributed_type == DistributedType.FSDP:
self._model = accelerator.prepare(self.model)
else:
self._model = accelerator.prepare_model(
self.model, evaluation_mode=True
)
if self.accelerator.is_local_main_process:
eval_logger.info(
f"Using {gpus} devices with data parallelism"
)
self._device = torch.device(f"{accelerator.device}")
self.accelerator = accelerator
if self.accelerator.is_local_main_process:
eval_logger.info(f"Using {gpus} devices with data parallelism")
self._rank = self.accelerator.local_process_index
self._world_size = self.accelerator.num_processes
else:
# if we aren't launching via accelerate, ditch
self._rank = 0
self._world_size = 1
else:
# if a PreTrainedModel was passed into HFLM, we forgo distributed setup.
eval_logger.warning(
......@@ -359,6 +287,94 @@ class HFLM(TemplateLM):
f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
)
def _get_accelerate_args(
self,
parallelize: Optional[bool] = None,
device_map: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
gpus: Optional[int] = None,
) -> dict:
"""Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
num_machines = int(os.environ.get("WORLD_SIZE", 0)) // num_local_processes
if (
num_machines == 0
and hasattr(self, "accelerator")
and self.accelerator is not None
):
eval_logger.info(
"We are not in a distributed setting for accelerate. Setting model_parallel to False."
)
parallelize = False
if parallelize is None:
# If parallelism is unset by the user, we automatically assign model parallelism
# if enough extra GPUs are available
max_memory_all_gpus = get_max_memory()
# We just want gpu, not cpu, max memory
if "cpu" in max_memory_all_gpus:
del max_memory_all_gpus["cpu"]
parallelize = bool(num_local_processes < len(max_memory_all_gpus))
eval_logger.info(
f"Setting model parallel to {parallelize} since "
f"the number of local processes is {num_local_processes} "
f"and the number of GPUs is {len(max_memory_all_gpus)}"
)
args = {}
if parallelize: # Model parallelism will be used
max_memory = {}
if max_memory_per_gpu is not None: # Using the provided memory requirements
max_memory_per_gpu_map = {
device_idx: max_memory_per_gpu for device_idx in range(gpus)
}
else: # Estimating the possible memory requirements
max_memory_all_gpus = get_max_memory()
if "cpu" in max_memory_all_gpus:
del max_memory_all_gpus["cpu"]
if not hasattr(self, "accelerator"):
max_memory_per_gpu_map = {
k: v for k, v in max_memory_all_gpus.items()
}
else:
# use only 1 / num_processes of the GPUs if we are running under accelerate launch
max_memory_per_gpu_map = {
k: v
for k, v in max_memory_all_gpus.items()
if k % num_local_processes
== (self.accelerator.process_index % num_local_processes)
}
args["max_memory"] = max_memory_per_gpu_map
args["device_map"] = "auto" if device_map is None else device_map
eval_logger.info(
f"Model parallel was set to True, setting max memory per GPU to {max_memory_per_gpu_map} and device map to {args.get('device_map')}"
)
if max_cpu_memory is not None:
max_memory["cpu"] = max_cpu_memory
args["offload_folder"] = offload_folder
elif (
device_map is None
): # No model parallelism, we use the default provided device for our model
if hasattr(self, "accelerator"):
device_map = {"": f"{self.accelerator.device}"}
else:
device_map = {"": str(self.device)}
args["max_memory"] = None
args["device_map"] = device_map
eval_logger.info(
f"Model parallel was set to False, max memory was not set, and device map was set to {device_map}"
)
else:
args["max_memory"] = None
args["device_map"] = None
eval_logger.info("Model parallel was set to False.")
return args
@property
def config(self):
# return the associated transformers.AutoConfig for the given pretrained model.
......@@ -424,31 +440,29 @@ class HFLM(TemplateLM):
def tokenizer_name(self) -> str:
return self.tokenizer.name_or_path.replace("/", "__")
@property
def chat_template(self) -> str:
if self.tokenizer.chat_template is not None:
return self.tokenizer.chat_template
return self.tokenizer.default_chat_template
def _get_backend(
self,
config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
backend: Literal["default", "causal", "seq2seq"] = "default",
trust_remote_code: Optional[bool] = False,
) -> None:
"""
Helper method during initialization.
Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder))
model type to be used.
Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) model type to be used.
sets `self.AUTO_MODEL_CLASS` appropriately if not already set.
**If not calling HFLM.__init__() or HFLM._get_backend() within a subclass of HFLM,
user must set `self.backend` to be either "causal" or "seq2seq" manually!**
"""
assert backend in ["default", "causal", "seq2seq"]
if backend != "default":
# if we've settled on non-default backend, use that manually
if backend == "causal":
self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
self.backend = backend
elif backend == "seq2seq":
self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
self.backend = backend
eval_logger.info(
f"Overrode HF model backend type, and using type '{backend}'"
)
......@@ -461,26 +475,32 @@ class HFLM(TemplateLM):
# first check if model type is listed under seq2seq models, since some
# models like MBart are listed in both seq2seq and causal mistakenly in HF transformers.
# these special cases should be treated as seq2seq models.
self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
self.backend = "seq2seq"
eval_logger.info(f"Using model type '{backend}'")
elif (
getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
):
self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
self.backend = "causal"
eval_logger.info(f"Using model type '{backend}'")
else:
if not trust_remote_code:
eval_logger.warning(
"HF model type is neither marked as CausalLM or Seq2SeqLM. \
This is expected if your model requires `trust_remote_code=True` but may be an error otherwise."
"Setting backend to causal"
)
# if model type is neither in HF transformers causal or seq2seq model registries
# then we default to AutoModelForCausalLM
self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
# then we default to assuming AutoModelForCausalLM
self.backend = "causal"
eval_logger.info(
f"Model type cannot be determined. Using default model type '{backend}'"
)
assert self.AUTO_MODEL_CLASS in [
transformers.AutoModelForCausalLM,
transformers.AutoModelForSeq2SeqLM,
]
return None
if self.AUTO_MODEL_CLASS is None:
if self.backend == "causal":
self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
elif self.backend == "seq2seq":
self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
def _get_config(
self,
......@@ -488,6 +508,7 @@ class HFLM(TemplateLM):
revision: str = "main",
trust_remote_code: bool = False,
) -> None:
"""Return the model config for HuggingFace models"""
self._config = transformers.AutoConfig.from_pretrained(
pretrained,
revision=revision,
......@@ -505,7 +526,6 @@ class HFLM(TemplateLM):
# (accelerate naive PP (device_map) options)
parallelize: Optional[bool] = False,
gpus: Optional[int] = None,
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
......@@ -513,6 +533,7 @@ class HFLM(TemplateLM):
peft: Optional[str] = None,
delta: Optional[str] = None,
autogptq: Optional[Union[bool, str]] = False,
gptqmodel: Optional[bool] = False,
**kwargs,
) -> None:
"""
......@@ -529,27 +550,18 @@ class HFLM(TemplateLM):
model_kwargs = kwargs if kwargs else {}
if parallelize:
model_kwargs.update(
_get_accelerate_args(
device_map_option, # TODO: phase out device_map_option?
max_memory_per_gpu,
max_cpu_memory,
offload_folder,
gpus,
)
model_kwargs.update(
self._get_accelerate_args(
parallelize=parallelize,
device_map=kwargs.get("device_map", None),
max_memory_per_gpu=max_memory_per_gpu,
max_cpu_memory=max_cpu_memory,
offload_folder=offload_folder,
gpus=gpus,
)
elif "device_map" not in model_kwargs:
# set a device_map to initialize model on the right GPU.
# this is needed because it seems that the default behavior
# for quantized models now seems to be device_map="auto"
# which breaks data-parallel mode.
if hasattr(self, "accelerator"):
model_kwargs.update({"device_map": {"": f"{self.accelerator.device}"}})
else:
model_kwargs.update({"device_map": {"": str(self.device)}})
)
if not autogptq:
if not autogptq and not gptqmodel:
if model_kwargs.get("load_in_4bit", None):
assert (
transformers.__version__ >= "4.30.0"
......@@ -560,6 +572,7 @@ class HFLM(TemplateLM):
model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(
model_kwargs["bnb_4bit_compute_dtype"]
)
self._model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained,
revision=revision,
......@@ -568,23 +581,42 @@ class HFLM(TemplateLM):
**model_kwargs,
)
else:
try:
from auto_gptq import AutoGPTQForCausalLM
except ModuleNotFoundError:
raise Exception(
"Tried to load auto_gptq, but auto-gptq is not installed ",
"please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
if autogptq and gptqmodel:
raise ValueError(
"Cannot use both 'autogptq' and 'gptqmodel' options at the same time."
)
self._model = AutoGPTQForCausalLM.from_quantized(
pretrained,
trust_remote_code=trust_remote_code,
model_basename=None if autogptq is True else Path(autogptq).stem,
use_safetensors=True
if autogptq is True
else autogptq.endswith(".safetensors"),
**model_kwargs,
)
if autogptq:
try:
from auto_gptq import AutoGPTQForCausalLM
except ModuleNotFoundError as exception:
raise type(exception)(
"Tried to load auto_gptq, but auto-gptq is not installed ",
"please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
)
self._model = AutoGPTQForCausalLM.from_quantized(
pretrained,
trust_remote_code=trust_remote_code,
model_basename=None if autogptq is True else Path(autogptq).stem,
use_safetensors=True
if autogptq is True
else autogptq.endswith(".safetensors"),
**model_kwargs,
)
if gptqmodel:
try:
from gptqmodel import GPTQModel
except ModuleNotFoundError as exception:
raise type(exception)(
"Tried to load gptqmodel, but gptqmodel is not installed ",
"please install gptqmodel via `pip install gptqmodel --no-build-isolation` or `pip install lm-eval[gptqmodel] --no-build-isolation`",
)
self._model = GPTQModel.from_quantized(
pretrained, trust_remote_code=trust_remote_code, **model_kwargs
)
if peft and delta:
raise ValueError(
......@@ -597,10 +629,10 @@ class HFLM(TemplateLM):
raise AssertionError("load_in_4bit requires peft >= 0.4.0")
if self._model.config.vocab_size != len(self.tokenizer):
# resize model for LoRAs with added tokens
self._model.resize_token_embeddings(len(self.tokenizer))
eval_logger.info(
f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
)
self._model.resize_token_embeddings(len(self.tokenizer))
self._model = PeftModel.from_pretrained(
self._model, peft, revision=revision
)
......@@ -718,7 +750,7 @@ class HFLM(TemplateLM):
@find_executable_batch_size(starting_batch_size=self.max_batch_size)
def forward_batch(batch_size):
security_margin = int(0.05 * security_margin_factor * batch_size)
if self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
if self.backend == "seq2seq":
length = max(max_context_enc, max_cont_enc)
batched_conts = torch.ones(
(batch_size + security_margin, length), device=self.device
......@@ -773,7 +805,7 @@ class HFLM(TemplateLM):
# by default for CausalLM - false or self.add_bos_token is set
if add_special_tokens is None:
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
if self.backend == "causal":
special_tokens_kwargs = {
"add_special_tokens": False or self.add_bos_token
}
......@@ -801,7 +833,7 @@ class HFLM(TemplateLM):
self.tokenizer.padding_side = padding_side
add_special_tokens = {}
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
if self.backend == "causal":
add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
encoding = self.tokenizer(
......@@ -879,14 +911,14 @@ class HFLM(TemplateLM):
def _select_cont_toks(
self, logits: torch.Tensor, contlen: int = None, inplen: int = None
) -> torch.Tensor:
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
if self.backend == "causal":
assert (
contlen and inplen
), "Must pass input len and cont. len to select scored logits for causal LM"
# discard right-padding.
# also discard the input/context tokens. we'll only score continuations.
logits = logits[inplen - contlen : inplen]
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
elif self.backend == "seq2seq":
assert (
contlen and not inplen
), "Selecting scored logits for Seq2SeqLM requires only cont. len"
......@@ -944,6 +976,9 @@ class HFLM(TemplateLM):
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
# cache this loglikelihood_rolling request
self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
return loglikelihoods
def _batch_scheduler(self, pos, n_reordered_requests):
......@@ -1000,8 +1035,7 @@ class HFLM(TemplateLM):
requests,
sort_fn=_collate,
group_by="contexts"
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
and self.logits_cache
if self.backend == "causal" and self.logits_cache
else None,
group_fn=_lookup_one_token_cont,
)
......@@ -1058,14 +1092,14 @@ class HFLM(TemplateLM):
# cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
# when too long to fit in context, truncate from the left
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
if self.backend == "causal":
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
dtype=torch.long,
device=self.device,
)
(inplen,) = inp.shape
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
elif self.backend == "seq2seq":
inp = torch.tensor(
(context_enc)[-self.max_length :],
dtype=torch.long,
......@@ -1105,11 +1139,11 @@ class HFLM(TemplateLM):
# create encoder attn mask and batched conts, if seq2seq
call_kwargs = {}
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
if self.backend == "causal":
batched_inps = pad_and_concat(
padding_len_inp, inps, padding_side="right"
) # [batch, padding_len_inp]
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
elif self.backend == "seq2seq":
# TODO: left-pad encoder inps and mask?
batched_inps = pad_and_concat(
padding_len_inp, inps
......@@ -1142,7 +1176,7 @@ class HFLM(TemplateLM):
# from prompt/prefix tuning tokens, if applicable
ctx_len = (
inplen + (logits.shape[0] - padding_len_inp)
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
if self.backend == "causal"
else None
)
logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
......@@ -1178,7 +1212,13 @@ class HFLM(TemplateLM):
res.append(answer)
self.cache_hook.add_partial("loglikelihood", request_str, answer)
if request_str is not None:
# special case: loglikelihood_rolling produces a number of loglikelihood requests
# all with cache key None. instead do add_partial on the per-example level
# in the loglikelihood_rolling() function for those.
self.cache_hook.add_partial(
"loglikelihood", request_str, answer
)
pbar.update(1)
pbar.close()
......@@ -1260,7 +1300,7 @@ class HFLM(TemplateLM):
max_gen_toks = self.max_gen_toks
# set the max length in tokens of inputs ("context_enc")
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
if self.backend == "causal":
# max len for inputs = max length, minus room to generate the max new tokens
# if the max new tokens is too large, halve it until it fits as we cannot change
# the max model length
......@@ -1268,7 +1308,7 @@ class HFLM(TemplateLM):
while max_ctx_len <= 0:
max_gen_toks = max_gen_toks // 2
max_ctx_len = self.max_length - max_gen_toks
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
elif self.backend == "seq2seq":
# max len for inputs = encoder's whole max_length
max_ctx_len = self.max_length
......@@ -1295,7 +1335,7 @@ class HFLM(TemplateLM):
cont_toks_list = cont.tolist()
for cont_toks, context in zip(cont_toks_list, contexts):
# discard context + left-padding toks if using causal decoder-only LM
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
if self.backend == "causal":
cont_toks = cont_toks[context_enc.shape[1] :]
s = self.tok_decode(cont_toks)
......
import json
import os
from configparser import ConfigParser
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast
from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.utils import eval_logger, simple_parse_args_string
class LogLikelihoodResult(NamedTuple):
log_likelihood: float
is_greedy: bool
@lru_cache(maxsize=None)
def get_watsonx_credentials(
env_name: str = "YP_QA",
config_path: str = "config.ini",
) -> Dict[str, str]:
"""
Retrieves Watsonx API credentials from environmental variables or from a configuration file.
Args:
env_name (str, optional): The name of the environment from which to retrieve credentials. Defaults to "YP_QA".
config_path (str, optional): The file path to the `config.ini` configuration file. Defaults to "config.ini".
Returns:
dict[str, str]: A dictionary containing the credentials necessary for authentication, including
keys such as `apikey`, `url`, and `project_id`.
Raises:
FileNotFoundError: If the specified configuration file does not exist.
AssertionError: If the credentials format is invalid.
"""
def _verify_credentials(creds: Any) -> None:
assert isinstance(creds, dict) and all(
key in creds.keys() for key in ["apikey", "url", "project_id"]
), "Wrong configuration for credentials."
credentials = {
"apikey": os.getenv("WATSONX_API_KEY", None),
"url": os.getenv("WATSONX_URL", None),
"project_id": os.getenv("WATSONX_PROJECT_ID", None),
}
if any(credentials.get(key) is None for key in ["apikey", "url", "project_id"]):
eval_logger.warning(
"One or more required environment variables are missing, trying to load config.ini file."
)
config_path = "config.ini" if not config_path else config_path
if not Path(config_path).is_absolute():
config_path = os.path.join(
Path(__file__).parent.parent.absolute(), config_path
)
if not os.path.exists(config_path):
raise FileNotFoundError(
f"Provided config file path {config_path} does not exist. "
"You need to specify credentials in config.ini file under specified location."
)
config = ConfigParser()
config.read(config_path)
credentials = json.loads(config.get(env_name))
_verify_credentials(credentials)
return credentials
@register_model("watsonx_llm")
class WatsonxLLM(LM):
"""
Implementation of LM model interface for evaluating Watsonx model with the lm_eval framework.
See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md for reference.
"""
@classmethod
def create_from_arg_string(
cls: Type["WatsonxLLM"],
arg_string: str,
config_path: Optional[str] = None,
) -> "WatsonxLLM":
"""
Allow the user to specify model parameters (TextGenerationParameters) in CLI arguments.
"""
try:
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
except ImportError:
raise ImportError(
"Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
)
args = simple_parse_args_string(arg_string)
model_id = args.pop("model_id", None)
if model_id is None:
raise ValueError("'model_id' is required, please pass it in 'model_args'")
if not args.get("do_sample", None):
args["temperature"] = None
args["top_p"] = None
args["top_k"] = None
args["seed"] = None
cls.generate_params = {
GenParams.DECODING_METHOD: (
"greedy" if not args.get("do_sample", None) else "sample"
),
GenParams.LENGTH_PENALTY: args.get("length_penalty", None),
GenParams.TEMPERATURE: args.get("temperature", None),
GenParams.TOP_P: args.get("top_p", None),
GenParams.TOP_K: args.get("top_k", None),
GenParams.RANDOM_SEED: args.get("seed", None),
GenParams.REPETITION_PENALTY: args.get("repetition_penalty", None),
GenParams.MIN_NEW_TOKENS: args.get("min_new_tokens", None),
GenParams.MAX_NEW_TOKENS: args.get("max_new_tokens", 256),
GenParams.STOP_SEQUENCES: args.get("stop_sequences", None),
GenParams.TIME_LIMIT: args.get("time_limit", None),
GenParams.TRUNCATE_INPUT_TOKENS: args.get("truncate_input_tokens", None),
GenParams.RETURN_OPTIONS: {
"generated_tokens": True,
"input_tokens": True,
"token_logprobs": True,
"token_ranks": True,
},
}
generate_params = {
k: v for k, v in cls.generate_params.items() if v is not None
}
return cls(
watsonx_credentials=get_watsonx_credentials(config_path),
model_id=model_id,
generate_params=generate_params,
)
def __init__(
self,
watsonx_credentials: Dict,
model_id,
generate_params: Optional[Dict[Any, Any]] = None,
) -> None:
try:
from ibm_watsonx_ai import APIClient
from ibm_watsonx_ai.foundation_models import ModelInference
except ImportError:
raise ImportError(
"Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
)
super().__init__()
client = APIClient(watsonx_credentials)
project_id = watsonx_credentials.get("project_id", None)
deployment_id = watsonx_credentials.get("deployment_id", None)
client.set.default_project(project_id)
self.generate_params = generate_params or {}
self.model = ModelInference(
model_id=model_id,
deployment_id=deployment_id,
api_client=client,
project_id=project_id,
)
self._model_id = model_id
@staticmethod
def _has_stop_token(response_tokens: List[str], context_tokens: List[str]) -> bool:
"""
Determines whether a stop token has been generated in the `response_tokens` compared to the `context_tokens`.
If the tokens do not match as expected, the function raises a RuntimeError, indicating a possible
misalignment between the tokens generated by the tokenizer and the model.
Args:
response_tokens (List[str]): The List of tokens generated as a response by the model.
context_tokens (List[str]): The List of tokens representing the input context.
Returns:
bool: True if the `response_tokens` likely contain a stop token that terminates the sequence,
otherwise raises an exception.
Raises:
RuntimeError: If there is an unexpected mismatch between the `response_tokens` and the `context_tokens`.
"""
context_length = len(context_tokens)
if response_tokens[: context_length - 1] == context_tokens[:-1]:
return (
response_tokens[-1] != context_tokens[-1]
) # only last token differs, probably stop sequence (</s>)
raise RuntimeError(
f"There is an unexpected difference between tokenizer and model tokens:\n"
f"context_tokens={context_tokens}\n"
f"response_tokens={response_tokens[:context_length]}"
)
def _check_model_logprobs_support(self):
"""
Verifies if the model supports returning log probabilities for input tokens.
This function sends a prompt to the model and checks whether the model's response
includes log probabilities for the input tokens. If log probabilities are not present,
it raises a `RuntimeError`, indicating that the model is not supported.
Raises:
RuntimeError: If the model does not return log probabilities for input tokens.
"""
tokens = self.model.generate_text(
prompt=["The best ice cream flavor is:"],
params=self.generate_params,
raw_response=True,
)[0]["results"][0]
if all(token.get("logprob", None) is None for token in tokens["input_tokens"]):
raise RuntimeError(
f"Model {self._model_id} is not supported: does not return logprobs for input tokens"
)
def _get_log_likelihood(
self,
input_tokens: List[Dict[str, float]],
context_tokens: List[Dict[str, float]],
) -> LogLikelihoodResult:
"""
Calculates the log likelihood of the generated tokens compared to the context tokens.
Args:
input_tokens (List[dict[str, float]]): A List of token dictionaries, each containing
token information like `text` and `logprob`.
context_tokens (List[dict[str, float]]): A List of token dictionaries representing
the input context.
Returns:
LogLikelihoodResult: An object containing the calculated log likelihood and a boolean
flag indicating if the tokens were generated greedily.
"""
response_tokens = [token["text"] for token in input_tokens]
context_length = len(context_tokens)
if self._has_stop_token(response_tokens, context_tokens):
context_length -= 1
return LogLikelihoodResult(
log_likelihood=sum(
token.get("logprob", 0) for token in input_tokens[context_length:]
),
is_greedy=all(
token["rank"] == 1 for token in input_tokens[context_length:]
),
)
def generate_until(self, requests: List[Instance]) -> List[str]:
"""
Generates text responses for a List of requests, with progress tracking and caching.
Args:
requests (List[Instance]): A List of instances, each containing a text input to be processed.
Returns:
List[str]: A List of generated responses.
"""
requests = [request.args[0] for request in requests]
results = []
batch_size = 5
for i in tqdm(
range(0, len(requests), batch_size),
desc=f"Running generate_until function with batch size {batch_size}",
):
batch = requests[i : i + batch_size]
try:
responses = self.model.generate_text(batch, self.generate_params)
except Exception as exp:
eval_logger.error(f"Error while generating text {exp}")
continue
for response, context in zip(responses, batch):
results.append(response)
self.cache_hook.add_partial("generated_text", context, response)
eval_logger.info("Cached responses")
return results
def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
"""
Args:
requests: Each request contains Instance.args : Tuple[str, str] containing:
1. an input string to the LM and
2. a target string on which the loglikelihood of the LM producing this target,
conditioned on the input, will be returned.
Returns:
tuple (loglikelihood, is_greedy) for each request according to the input order:
loglikelihood: probability of generating the target string conditioned on the input
is_greedy: True if and only if the target string would be generated by greedy sampling from the LM
"""
try:
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
except ImportError:
raise ImportError(
"Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
)
self._check_model_logprobs_support()
self.generate_params[GenParams.MAX_NEW_TOKENS] = 1
requests = [request.args for request in requests]
results: List[LogLikelihoodResult] = []
batch_size = 5
for i in tqdm(
range(0, len(requests), batch_size),
desc=f"Running loglikelihood function with batch size {batch_size}",
):
batch = requests[i : i + batch_size]
try:
tokenized_contexts = [
self.model.tokenize(prompt=context, return_tokens=True)["result"][
"tokens"
]
for context, _ in batch
]
except Exception as exp:
eval_logger.error(f"Error while model tokenize:\n {exp}")
continue
input_prompts = [context + continuation for context, continuation in batch]
try:
responses = self.model.generate_text(
prompt=input_prompts, params=self.generate_params, raw_response=True
)
except Exception as exp:
eval_logger.error(f"Error while model generate text:\n {exp}")
continue
for response, tokenized_context, (context, continuation) in zip(
responses, tokenized_contexts, batch
):
log_likelihood_response = self._get_log_likelihood(
response["results"][0]["input_tokens"], tokenized_context
)
results.append(log_likelihood_response)
self.cache_hook.add_partial(
"loglikelihood",
(context, continuation),
(
log_likelihood_response.log_likelihood,
log_likelihood_response.is_greedy,
),
)
eval_logger.info("Cached batch")
return cast(List[Tuple[float, bool]], results)
def loglikelihood_rolling(self, requests) -> List[Tuple[float, bool]]:
"""
Used to evaluate perplexity on a data distribution.
Args:
requests: Each request contains Instance.args : tuple[str] containing an input string to the model whose
entire loglikelihood, conditioned on purely the EOT token, will be calculated.
Returns:
tuple (loglikelihood,) for each request according to the input order:
loglikelihood: solely the probability of producing each piece of text given no starting input.
"""
try:
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
except ImportError:
raise ImportError(
"Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
)
self._check_model_logprobs_support()
self.generate_params[GenParams.MAX_NEW_TOKENS] = 1
requests = [request.args[0] for request in requests]
results: List[LogLikelihoodResult] = []
batch_size = 5
for i in tqdm(
range(0, len(requests), batch_size),
desc=f"Running loglikelihood_rolling function with batch size {batch_size}",
):
batch = requests[i : i + batch_size]
try:
responses = self.model.generate_text(
prompt=batch, params=self.generate_params, raw_response=True
)
except Exception as exp:
eval_logger.error(f"Error while model generate text:\n {exp}")
continue
for response, context in zip(responses, batch):
try:
log_likelihood_response = self._get_log_likelihood(
response["results"][0]["input_tokens"], []
)
results.append(log_likelihood_response)
self.cache_hook.add_partial(
"loglikelihood_rolling",
context,
(
log_likelihood_response.log_likelihood,
log_likelihood_response.is_greedy,
),
)
except Exception as exp:
eval_logger.error(
f"Error during log likelihood calculation:\n {exp}"
)
continue
eval_logger.info("Cached batch")
return cast(List[Tuple[float, bool]], results)
......@@ -69,8 +69,8 @@ class MambaLMWrapper(HFLM):
) -> None:
try:
from mamba_ssm.utils.hf import load_config_hf # noqa: F811
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
)
......@@ -88,8 +88,8 @@ please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba
) -> None:
try:
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel # noqa: F811
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
)
......
......@@ -39,8 +39,8 @@ def _patch_pretrained_cfg(
):
try:
import omegaconf
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.",
......@@ -79,8 +79,8 @@ def load_model(
MegatronGPTModel,
)
from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.",
......@@ -140,8 +140,8 @@ def load_model(
def setup_distributed_environment(trainer):
try:
from nemo.utils.app_state import AppState
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.",
......@@ -194,8 +194,8 @@ class NeMoLM(LM):
from pytorch_lightning.trainer.trainer import Trainer
self.generate = generate
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.",
......@@ -386,6 +386,9 @@ class NeMoLM(LM):
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
# cache this loglikelihood_rolling request
self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
return loglikelihoods
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
......@@ -468,6 +471,9 @@ class NeMoLM(LM):
answer = (logprob, is_greedy)
if cache_key is not None:
# special case: loglikelihood_rolling produces a number of loglikelihood requests
# all with cache key None. instead do add_partial on the per-example level
# in the loglikelihood_rolling() function for those.
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
......
......@@ -38,8 +38,8 @@ class SparseMLLM(HFLM):
) -> None:
try:
from sparseml.transformers import SparseAutoModelForCausalLM
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`"
)
......@@ -88,8 +88,8 @@ class SparseMLLM(HFLM):
def _get_config(self, pretrained: str, **kwargs) -> None:
try:
from sparseml.transformers import SparseAutoConfig
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`"
)
......@@ -112,8 +112,8 @@ class SparseMLLM(HFLM):
) -> None:
try:
from sparseml.transformers import SparseAutoTokenizer
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`"
)
......@@ -171,8 +171,8 @@ class DeepSparseLM(LM):
try:
import deepsparse
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"Package `deepsparse` is not installed. "
"Please install it via `pip install deepsparse[transformers]`"
)
......@@ -321,6 +321,9 @@ class DeepSparseLM(LM):
res.append(answer)
if cache_key is not None:
# special case: loglikelihood_rolling produces a number of loglikelihood requests
# all with cache key None. instead do add_partial on the per-example level
# in the loglikelihood_rolling() function for those.
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res)
......
import copy
import json
import logging
import subprocess
from collections import defaultdict
from typing import List, Optional, Union
......@@ -33,54 +31,6 @@ except ImportError:
logger = logging.getLogger(__name__)
def get_nc_count() -> Union[int, None]:
"""Returns the number of neuron cores on the current instance."""
try:
cmd = "neuron-ls --json-output"
result = subprocess.run(cmd, shell=True, capture_output=True)
print(f"inferring nc_count from `neuron-ls` {result.stdout}")
json_output = json.loads(result.stdout)
count = sum([x["nc_count"] for x in json_output])
print(f"nc_count={count}")
return count
except Exception:
return None
def wrap_constant_batch_size(func):
def _decorator(self, input_ids):
"""input_ids a 2D array with batch_size on dim=0
makes sure the func runs with self.batch_size
"""
# access a from TestSample
batch_size = input_ids.shape[0]
if batch_size < self.batch_size:
# handle the event of input_ids.shape[0] != batch_size
# Neuron cores expect constant batch_size
input_ids = torch.concat(
(
input_ids,
# add missing_batch_size dummy
torch.zeros(
[self.batch_size - batch_size, *input_ids.size()[1:]],
dtype=input_ids.dtype,
device=input_ids.device,
),
),
dim=0,
)
elif batch_size > self.batch_size:
raise ValueError(
f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
)
# return the forward pass that requires constant batch size
return func(self, input_ids)[:batch_size]
return _decorator
class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
"""NeuronModelForCausalLM with `stopping_criteria` in `generate`"""
......@@ -146,7 +96,7 @@ class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
raise ValueError(
f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
)
elif batch_size < self.batch_size:
elif batch_size < self.batch_size and not self.continuous_batching:
logger.warning(
"Inputs will be padded to match the model static batch size. This will increase latency."
)
......@@ -158,8 +108,6 @@ class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
if attention_mask is not None:
padding = torch.zeros(padding_shape, dtype=torch.int64)
padded_attention_mask = torch.cat([attention_mask, padding])
# Drop the current generation context and clear the Key/Value cache
self.reset_generation()
output_ids = self.generate_tokens(
padded_input_ids,
......@@ -179,8 +127,6 @@ class NEURON_HF(TemplateLM):
Tested with neuron 2.17.0
"""
_DEFAULT_MAX_LENGTH = 2048
def __init__(
self,
pretrained: Optional[str] = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
......@@ -198,12 +144,12 @@ class NEURON_HF(TemplateLM):
add_bos_token: Optional[bool] = False,
) -> None:
if not NEURON_AVAILABLE:
raise Exception(
raise ImportError(
"Tried to load neuron model, but neuron is not installed ",
"please install neuron via pip install transformers-neuron ",
"also make sure you are running on an AWS inf2 instance",
)
if version.parse(optimum_neuron_version) != version.parse("0.0.17"):
if version.parse(optimum_neuron_version) != version.parse("0.0.24"):
logger.warning(
'`optimum-neuron` model requires `pip install "optimum[neuronx]>=0.0.17" '
"preferably using the Hugging Face Neuron Deep Learning AMI (Ubuntu 22.04) "
......@@ -217,34 +163,16 @@ class NEURON_HF(TemplateLM):
self.batch_size_per_gpu = int(batch_size)
batch_size = int(batch_size)
if tp_degree is None:
# execute `neuron-ls --json-output | jq '.[0].nc_count'``
# to get the number of neuron cores on your instance
tp_degree = get_nc_count()
assert isinstance(tp_degree, int), (
f"model_args must include tp_degree. tp_degree must be set to an integer,"
f" but is tp_degree=`{tp_degree}` with type=`{type(tp_degree)}`."
"Set it to number of neuron cores on your instance."
" For inf2.xlarge and inf2.8xlarge, set it to `2`."
" For inf2.24xlarge, set it to `12`."
" For inf2.48xlarge, set it to `24`."
)
# TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "")
self._config = transformers.AutoConfig.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
)
torch_dtype = lm_eval.models.utils.get_dtype(dtype)
assert torch_dtype in [
torch.float16,
torch.bfloat16,
], "Only float16 and bfloat16 are supported"
revision = str(revision) # cast to string if not already one
# TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "")
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer,
......@@ -253,36 +181,58 @@ class NEURON_HF(TemplateLM):
use_fast=use_fast_tokenizer,
)
# Neuron specific code
if torch_dtype == torch.float16:
self.amp_dtype = "f16"
elif torch_dtype == torch.bfloat16:
self.amp_dtype = "bf16"
elif torch_dtype == torch.float32:
self.amp_dtype = "f32"
else:
raise NotImplementedError("Only float16 and bfloat16 are implemented.")
compiler_args = {"num_cores": tp_degree, "auto_cast_type": self.amp_dtype}
input_shapes = {
"batch_size": batch_size,
"sequence_length": self._DEFAULT_MAX_LENGTH,
}
neuron_config = getattr(self._config, "neuron", None)
if neuron_config is None:
# Check export parameters
if tp_degree is not None:
assert isinstance(tp_degree, int), (
f"tp_degree must be set to an integer,"
f" but is tp_degree=`{tp_degree}` with type=`{type(tp_degree)}`."
"Set it to a number lower than the number of neuron cores on your instance."
" For inf2.xlarge and inf2.8xlarge, set it to `2`."
" For inf2.24xlarge, set it <= `12`."
" For inf2.48xlarge, set it <= `24`."
)
torch_dtype = lm_eval.models.utils.get_dtype(dtype)
if torch_dtype == torch.float16:
self.amp_dtype = "f16"
elif torch_dtype == torch.bfloat16:
self.amp_dtype = "bf16"
elif torch_dtype == torch.float32:
self.amp_dtype = "f32"
else:
raise NotImplementedError(
"Only float16/bfloat16/float32 are supported."
)
print(
f"{'='*20} \n loading model to neuron with"
f" {compiler_args}, {input_shapes}..."
)
self.model = CustomNeuronModelForCausalLM.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
low_cpu_mem_usage=low_cpu_mem_usage,
export=True,
**compiler_args,
**input_shapes,
)
print(f"SUCCESS: neuron model compiled. \n {'='*20}")
print(f"{'='*20} \n exporting model to neuron")
self.model = CustomNeuronModelForCausalLM.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
low_cpu_mem_usage=low_cpu_mem_usage,
export=True,
batch_size=batch_size,
num_cores=tp_degree,
auto_cast_type=self.amp_dtype,
sequence_length=max_length,
)
neuron_config = self.model.config.neuron
print(
f"SUCCESS: neuron model exported with config {neuron_config}. \n {'='*20}"
)
else:
print(
f"{'='*20} \n loading neuron model with config" f" {neuron_config}..."
)
self.model = CustomNeuronModelForCausalLM.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
low_cpu_mem_usage=low_cpu_mem_usage,
)
print(f"SUCCESS: neuron model loaded. \n {'='*20}")
self.truncation = truncation
......@@ -290,8 +240,6 @@ class NEURON_HF(TemplateLM):
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.add_bos_token = add_bos_token
self._max_length = max_length
self.batch_schedule = 1
self.batch_sizes = {}
......@@ -312,17 +260,7 @@ class NEURON_HF(TemplateLM):
@property
def max_length(self):
if self._max_length: # if max length manually set, return it
return self._max_length
seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
for attr in seqlen_config_attrs:
if hasattr(self.model.config, attr):
return getattr(self.model.config, attr)
if hasattr(self.tokenizer, "model_max_length"):
if self.tokenizer.model_max_length == 1000000000000000019884624838656:
return self._DEFAULT_MAX_LENGTH
return self.tokenizer.model_max_length
return self._DEFAULT_MAX_LENGTH
return self.model.max_length
@property
def max_gen_toks(self) -> int:
......@@ -390,34 +328,6 @@ class NEURON_HF(TemplateLM):
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
@wrap_constant_batch_size
def _model_call(self, input_ids: torch.Tensor):
"""
get logits for the entire sequence
:param input_ids: torch.Tensor
A torch tensor of shape [batch, sequence_cont]
the size of sequence may vary from call to call
:return
A torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model's decoder-lm head
"""
_, sequence_length = input_ids.shape
with torch.inference_mode():
cache_ids = torch.arange(0, sequence_length, dtype=torch.int32).split(1)
input_ids_split = input_ids.split(1, dim=1)
return torch.concat(
[
self.model.forward(
input_ids=input_id, cache_ids=cache_id, return_dict=False
)[0]
for input_id, cache_id in zip(input_ids_split, cache_ids)
],
dim=1,
)
def _model_generate(self, context, max_length, stop, **generation_kwargs):
# we require users to pass do_sample=True explicitly
# for non-greedy gen. This should be reevaluated when considering beam search.
......@@ -501,7 +411,8 @@ class NEURON_HF(TemplateLM):
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
# cache this loglikelihood_rolling request
self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
return loglikelihoods
def _loglikelihood_tokens(
......@@ -578,15 +489,41 @@ class NEURON_HF(TemplateLM):
cont_toks_list.append(continuation_enc)
inplens.append(inplen)
# create encoder attn mask and batched conts, if seq2seq
call_kwargs = {}
# Add dummy inputs up to the model static batch size
if len(inps) < self.batch_size:
inps = inps + [
torch.zeros_like(inps[0]),
] * (self.batch_size - len(inps))
masks = [torch.ones_like(inp) for inp in inps]
batched_inps = lm_eval.models.utils.pad_and_concat(
padding_len_inp, inps, padding_side="right"
) # [batch, padding_len_inp]
multi_logits = F.log_softmax(
self._model_call(batched_inps, **call_kwargs), dim=-1
) # [batch, padding_length (inp or cont), vocab]
batched_masks = lm_eval.models.utils.pad_and_concat(
padding_len_inp, masks, padding_side="right"
)
if self.model.model.neuron_config.output_all_logits:
inputs = self.model.prepare_inputs_for_prefill(
batched_inps, batched_masks
)
multi_logits = F.log_softmax(
self.model.forward(**inputs).logits, dim=-1
) # [batch, padding_length (inp or cont), vocab]
else:
# The model will only return the logits for the last input token, so we need
# to iterate over inputs to accumulate logits.
# To speed things up we use the KV cache as we would do when generating.
inputs = self.model.prepare_inputs_for_prefill(
batched_inps[:, :1], batched_masks[:, :1]
)
outputs = [self.model.forward(**inputs).logits]
for i in range(1, padding_len_inp):
inputs = self.model.prepare_inputs_for_decode(
batched_inps[:, : i + 1], batched_masks[:, : i + 1]
)
outputs.append(self.model.forward(**inputs).logits)
multi_logits = F.log_softmax(torch.concat(outputs, dim=1), dim=-1)
for (cache_key, _, _), logits, inplen, cont_toks in zip(
chunk, multi_logits, inplens, cont_toks_list
......@@ -619,7 +556,11 @@ class NEURON_HF(TemplateLM):
res.append(answer)
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
if cache_key is not None:
# special case: loglikelihood_rolling produces a number of loglikelihood requests
# all with cache key None. instead do add_partial on the per-example level
# in the loglikelihood_rolling() function for those.
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res)
......
import copy
import os
from collections import defaultdict
from importlib.util import find_spec
from typing import List, Literal, Optional, Tuple
from functools import cached_property
from typing import Any, Dict, List, Optional, Tuple, Union
from tqdm import tqdm
import lm_eval.models.utils
from lm_eval import utils
from lm_eval.api.model import LM, TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import retry_on_specific_exceptions
from lm_eval.models.api_models import TemplateAPI
from lm_eval.utils import eval_logger
def get_result(response) -> Tuple[float, bool]:
"""Process results from OpenAI API response.
:param response: dict
OpenAI API Response
:return:
continuation_logprobs: np.array
Log probabilities of continuation tokens
is_greedy: bool
whether argmax matches given continuation exactly
"""
is_greedy = True
logprobs = response.logprobs.token_logprobs
continuation_logprobs = sum(logprobs)
for i in range(len(response.logprobs.token_logprobs)):
token = response.logprobs.token_logprobs[i]
top_tokens = response.logprobs.top_logprobs[i]
top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
if top_token != token:
is_greedy = False
break
return continuation_logprobs, is_greedy
def oa_completion(client, chat: bool = False, **kwargs):
"""Query OpenAI API for completion.
Retry with back-off until they respond
"""
if not find_spec("openai") or not find_spec("tiktoken"):
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. "
"Please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`"
@register_model("local-completions")
class LocalCompletionsAPI(TemplateAPI):
def __init__(
self,
base_url=None,
tokenizer_backend="huggingface",
**kwargs,
):
super().__init__(
base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
)
else:
import openai
def _exception_callback(e: Exception, sleep_time: float) -> None:
import traceback
traceback.print_exc()
@retry_on_specific_exceptions(
on_exceptions=[openai.OpenAIError],
max_retries=None, # retry forever, consider changing
on_exception_callback=_exception_callback,
)
def completion():
if chat:
return client.chat.completions.create(**kwargs)
else:
return client.completions.create(**kwargs)
return completion()
@register_model("openai-completions", "local-completions")
class OpenaiCompletionsLM(TemplateLM):
_DEFAULT_MAX_LENGTH = 2048
def __init__(
def _create_payload(
self,
model: str,
base_url: str = None,
tokenizer: Optional[str] = None,
tokenizer_backend: Literal["tiktoken", "huggingface"] = "tiktoken",
truncate: bool = False,
max_gen_toks: int = 256,
batch_size: int = 1,
messages: Union[List[List[int]], List[dict], List[str], str],
generate=False,
gen_kwargs: Optional[dict] = None,
seed: int = 1234,
max_length: Optional[int] = None,
) -> None:
"""
:param engine: str
OpenAI API engine (e.g. gpt-3.5-turbo-instruct)
:param truncate: bool
Truncate input if too long (if False and input is too long, throw error)
"""
super().__init__()
self.seed = seed
try:
import openai # noqa: E401
import tiktoken
except ModuleNotFoundError:
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
please install these via `pip install lm-eval[openai]` or `pip install -e .\"[openai]\"`",
)
self.model = model
self.base_url = base_url
self.tokenizer_backend = tokenizer_backend
self.truncate = truncate
self._batch_size = int(batch_size)
self._max_gen_toks = max_gen_toks
self._max_length = max_length
# if we have a local model, use HF tokenizer over tiktoken
if self.tokenizer_backend == "huggingface":
import transformers # noqa: E401
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
tokenizer if tokenizer else self.model
)
self.vocab_size = self.tokenizer.vocab
self.end_of_text_token_id = self.tokenizer.eos_token
elif self.tokenizer_backend == "tiktoken":
if self.base_url:
eval_logger.warning(
f"Passed `base_url={self.base_url}` but using Tiktoken tokenizer backend. "
"Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken."
)
self.tokenizer = tiktoken.encoding_for_model(self.model)
self.vocab_size = self.tokenizer.n_vocab
self.end_of_text_token_id = self.tokenizer.eot_token
else:
raise ValueError(
f"Expected tokenizer_backend to be one of ['tiktoken', 'huggingface'] but got {self.tokenizer_backend}"
)
# Read from environment variable OPENAI_API_KEY
# Set to EMPTY for local
openai.api_key = os.environ["OPENAI_API_KEY"]
if self.base_url:
self.client = openai.OpenAI(base_url=self.base_url)
else:
self.client = openai.OpenAI()
@property
def eot_token_id(self):
return self.end_of_text_token_id
@property
def max_length(self) -> int:
if self._max_length:
return self._max_length
**kwargs,
) -> dict:
if generate:
gen_kwargs.pop("do_sample", False)
if "max_tokens" in gen_kwargs:
max_tokens = gen_kwargs.pop("max_tokens")
else:
max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
temperature = gen_kwargs.pop("temperature", 0)
stop = gen_kwargs.pop("until", ["<|endoftext|>"])
return {
"prompt": messages,
"model": self.model,
"max_tokens": max_tokens,
"temperature": temperature,
"stop": stop,
"seed": seed,
**gen_kwargs,
}
else:
return self._DEFAULT_MAX_LENGTH
@property
def max_gen_toks(self) -> int:
return self._max_gen_toks
@property
def batch_size(self) -> int:
return self._batch_size
@property
def device(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def tok_encode(self, string: str, **kwargs) -> List[int]:
return self.tokenizer.encode(string)
def tok_decode(self, tokens: List[int]) -> str:
return self.tokenizer.decode(tokens)
def _loglikelihood_tokens(
self, requests, disable_tqdm: bool = False
return {
"model": self.model,
"prompt": messages,
"temperature": 0,
"max_tokens": 1,
"logprobs": 1,
"seed": seed,
"echo": True,
}
@staticmethod
def parse_logprobs(
outputs: Union[Dict, List[Dict]],
tokens: List[List[int]] = None,
ctxlens: List[int] = None,
**kwargs,
) -> List[Tuple[float, bool]]:
res = []
def _collate(x):
# this doesn't efficiently handle last-token differences yet, but those are kinda annoying because
# it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations
# we care about, and so we need some kind of backup for when it isn't
toks = x[1] + x[2]
return -len(toks), tuple(toks)
re_ord = utils.Reorderer(requests, _collate)
for chunk in tqdm(
list(lm_eval.models.utils.chunks(re_ord.get_reordered(), self.batch_size)),
disable=disable_tqdm,
):
inps = []
ctxlens = []
for cache_key, context_enc, continuation_enc in chunk:
# max_length+1 because the API takes up to 2049 tokens, including the first context token
inp = (context_enc + continuation_enc)[-(self.max_length + 1) :]
# TODO: the logic is much simpler if we just look at the length of continuation tokens
ctxlen = len(context_enc) - max(
0, len(context_enc) + len(continuation_enc) - (self.max_length + 1)
)
inps.append(inp)
ctxlens.append(ctxlen)
response = oa_completion(
client=self.client,
model=self.model,
prompt=inps,
max_tokens=0,
temperature=0.0,
logprobs=10,
seed=self.seed,
)
for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(
response.choices, ctxlens, chunk
):
answer = get_result(resp)
res.append(answer)
# partial caching
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res)
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
if not requests:
return []
if not isinstance(outputs, list):
outputs = [outputs]
for out in outputs:
for choice, ctxlen in zip(out["choices"], ctxlens):
assert ctxlen > 0, "Context length must be greater than 0"
logprobs = sum(choice["logprobs"]["token_logprobs"][ctxlen:-1])
tokens_logprobs = choice["logprobs"]["token_logprobs"][ctxlen:-1]
top_logprobs = choice["logprobs"]["top_logprobs"][ctxlen:-1]
is_greedy = True
for tok, top in zip(tokens_logprobs, top_logprobs):
if tok != max(top.values()):
is_greedy = False
break
res.append((logprobs, is_greedy))
return res
@staticmethod
def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
res = []
requests = [req.args for req in requests]
def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]
if not isinstance(outputs, list):
outputs = [outputs]
for out in outputs:
for choices in out["choices"]:
res.append(choices["text"])
return res
re_ord = utils.Reorderer(requests, _collate)
def sameuntil_chunks(xs, size):
ret = []
lastuntil = xs[0][1]
for x in xs:
if len(ret) >= size or x[1] != lastuntil:
yield ret, lastuntil
ret = []
lastuntil = x[1]
ret.append(x)
if ret:
yield ret, lastuntil
# todo: more intelligent batching for heterogeneous `until`
for chunk, request_args in tqdm(
list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)),
disable=disable_tqdm,
):
inps = []
self._max_gen_toks = request_args.get("max_gen_toks", self.max_gen_toks)
for context, _ in chunk:
context_enc = self.tok_encode(context)
inp = context_enc[-(self.max_length - self.max_gen_toks) :]
inps.append(inp)
@property
def api_key(self):
return os.environ.get("OPENAI_API_KEY", "")
until = request_args.get("until", ["<|endoftext|>"])
request_args["temperature"] = request_args.get("temperature", 0)
response = oa_completion(
client=self.client,
model=self.model,
prompt=inps,
max_tokens=self.max_gen_toks,
stop=until,
seed=self.seed,
**{
k: v
for k, v in request_args.items()
if k not in {"do_sample", "max_gen_toks", "until"}
},
@register_model("local-chat-completions")
class LocalChatCompletion(LocalCompletionsAPI):
def __init__(
self,
base_url=None,
tokenizer_backend=None,
tokenized_requests=False,
**kwargs,
):
eval_logger.warning(
"chat-completions endpoint requires the `--apply_chat_template` flag."
)
super().__init__(
base_url=base_url,
tokenizer_backend=tokenizer_backend,
tokenized_requests=tokenized_requests,
**kwargs,
)
if self._batch_size > 1:
eval_logger.warning(
"Chat completions does not support batching. Defaulting to batch size 1."
)
for resp, (context, args_) in zip(response.choices, chunk):
s = getattr(resp, "text")
until_ = until
for term in until_:
if len(term) > 0:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial(
"generate_until", (context, {"until": until_}), s
)
res.append(s)
return re_ord.get_original(res)
def _model_call(self, inps):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
self._batch_size = 1
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override generate_until
raise NotImplementedError()
def _create_payload(
self,
messages: List[Dict],
generate=False,
gen_kwargs: dict = None,
seed=1234,
**kwargs,
) -> dict:
assert (
type(messages) is not str
), "chat-completions require the --apply_chat_template flag."
gen_kwargs.pop("do_sample", False)
if "max_tokens" in gen_kwargs:
max_tokens = gen_kwargs.pop("max_tokens")
else:
max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
temperature = gen_kwargs.pop("temperature", 0)
stop = gen_kwargs.pop("until", ["<|endoftext|>"])
if not isinstance(stop, (list, tuple)):
stop = [stop]
return {
"messages": messages,
"model": self.model,
"max_tokens": max_tokens,
"temperature": temperature,
"stop": stop[:4],
"seed": seed,
**gen_kwargs,
}
@staticmethod
def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
res = []
if not isinstance(outputs, list):
outputs = [outputs]
for out in outputs:
for choices in out["choices"]:
res.append(choices["message"]["content"])
return res
def tok_encode(
self,
string: Union[str, Any],
left_truncate_len=None,
add_special_tokens=None,
**kwargs,
) -> Union[List[str], List[int], Any]:
return string
def loglikelihood_rolling(
self, requests, disable_tqdm: bool = False
) -> List[float]:
loglikelihoods = []
def loglikelihood(self, requests, **kwargs):
raise NotImplementedError(
"Loglikelihood is not supported for chat completions. Consider using the completions API instead."
)
for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length,
context_len=1,
),
)
)
# TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
@register_model(
"openai-completions",
)
class OpenAICompletionsAPI(LocalCompletionsAPI):
def __init__(
self,
base_url="https://api.openai.com/v1/completions",
tokenizer_backend="tiktoken",
**kwargs,
):
super().__init__(
base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
)
string_nll = self._loglikelihood_tokens(
rolling_token_windows,
disable_tqdm=True,
@cached_property
def api_key(self):
"""Override this property to return the API key for the API request."""
key = os.environ.get("OPENAI_API_KEY", None)
if key is None:
raise ValueError(
"API key not found. Please set the `OPENAI_API_KEY` environment variable."
)
return key
# discard is_greedy
string_nll = [x[0] for x in string_nll]
def loglikelihood(self, requests, **kwargs):
assert (
self.model
in [
"babbage-002",
"davinci-002",
]
), f"Prompt loglikelihoods are only supported by OpenAI's API for {['babbage-002', 'davinci-002']}."
return super().loglikelihood(requests, **kwargs)
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
return ""
@register_model("openai-chat-completions", "local-chat-completions")
class OpenaiChatCompletionsLM(LM):
@register_model("openai-chat-completions")
class OpenAIChatCompletion(LocalChatCompletion):
def __init__(
self,
model: str = "gpt-3.5-turbo", # GPT model or Local model using HuggingFace model paths
base_url: str = None,
truncate: bool = False,
base_url="https://api.openai.com/v1/chat/completions",
tokenizer_backend=None,
tokenized_requests=False,
**kwargs,
) -> None:
"""
:param model: str
Implements an OpenAI-style chat completion API for
accessing both OpenAI OR locally-hosted models using
HuggingFace Tokenizer
OpenAI API model (e.g. gpt-3.5-turbo)
using the **gen_kwargs passed on init
:param truncate: bool
Truncate input if too long (if False and input is too long, throw error)
"""
super().__init__()
try:
import openai # noqa: E401
except ModuleNotFoundError:
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
):
if "o1" in kwargs.get("model", ""):
eval_logger.warning(
"o1 models do not support `stop` and only support temperature=1"
)
self.model = model
self.base_url = base_url
self.truncate = truncate
# Read from environment variable OPENAI_API_KEY
# Set to EMPTY for local
if self.base_url:
self.client = openai.OpenAI(base_url=self.base_url)
else:
self.client = openai.OpenAI() # openai.AsyncOpenAI()
@property
def max_length(self) -> int:
# Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
return 2048
@property
def max_gen_toks(self) -> int:
return 256
@property
def batch_size(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
@property
def device(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
res = defaultdict(list)
re_ords = {}
super().__init__(
base_url=base_url,
tokenizer_backend=tokenizer_backend,
tokenized_requests=tokenized_requests,
**kwargs,
)
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
grouper = lm_eval.models.utils.Grouper(requests, lambda x: str(x.args[1]))
for key, reqs in grouper.get_grouped().items():
# within each set of reqs for given kwargs, we reorder by token length, descending.
re_ords[key] = utils.Reorderer(
[req.args for req in reqs], lambda x: (-len(x[0]), x[0])
@cached_property
def api_key(self):
"""Override this property to return the API key for the API request."""
key = os.environ.get("OPENAI_API_KEY", None)
if key is None:
raise ValueError(
"API key not found. Please set the `OPENAI_API_KEY` environment variable."
)
return key
pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
for key, re_ord in re_ords.items():
# n needs to be 1 because messages in
# chat completion are not batch but
# is regarded as a single conversation.
chunks = lm_eval.models.utils.chunks(re_ord.get_reordered(), n=1)
for chunk in chunks:
contexts, all_gen_kwargs = zip(*chunk)
inps = [{"role": "user", "content": context} for context in contexts]
gen_kwargs = all_gen_kwargs[0]
until = None
if isinstance(kwargs := copy.deepcopy(gen_kwargs), dict):
if "do_sample" in kwargs.keys():
kwargs.pop("do_sample")
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected repr(kwargs['until']) to be of type Union[str, list] but got {until}"
)
kwargs["stop"] = until
kwargs["max_tokens"] = kwargs.pop("max_gen_toks", self.max_gen_toks)
else:
raise ValueError(
f"Expected repr(kwargs) to be of type repr(dict) but got {kwargs}"
)
response = oa_completion(
client=self.client,
chat=True,
messages=inps,
model=self.model,
**kwargs,
)
for resp, (context, args_) in zip(response.choices, chunk):
s = resp.message.content
if until is not None:
for term in until:
if len(term) > 0:
s = s.split(term)[0]
res[key].append(s)
self.cache_hook.add_partial(
"generate_until", (context, {"until": until}), s
)
pbar.update(1)
# reorder this group of results back to original unsorted form
res[key] = re_ord.get_original(res[key])
pbar.close()
return grouper.get_original(res)
def loglikelihood(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.")
def loglikelihood(self, requests, **kwargs):
raise NotImplementedError(
"Loglikelihood (and therefore `multiple_choice`-type tasks) is not supported for chat completions as OpenAI does not provide prompt logprobs. See https://github.com/EleutherAI/lm-evaluation-harness/issues/942#issuecomment-1777836312 or https://github.com/EleutherAI/lm-evaluation-harness/issues/1196 for more background on this limitation."
)
def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.")
def _create_payload(
self,
messages: List[Dict],
generate=False,
gen_kwargs: dict = None,
seed=1234,
**kwargs,
) -> dict:
assert (
type(messages) is not str
), "chat-completions require the --apply_chat_template flag."
gen_kwargs.pop("do_sample", False)
if "max_tokens" in gen_kwargs:
max_tokens = gen_kwargs.pop("max_tokens")
else:
max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
temperature = gen_kwargs.pop("temperature", 0)
stop = gen_kwargs.pop("until", ["<|endoftext|>"])
if not isinstance(stop, (list, tuple)):
stop = [stop]
output = {
"messages": messages,
"model": self.model,
"max_completion_tokens": max_tokens,
"temperature": temperature,
"stop": stop[:4],
"seed": seed,
**gen_kwargs,
}
if "o1" in self.model:
output.pop("stop")
output["temperature"] = 1
return output
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment